Example 1
Source File: LDAExample.scala    From Swallow   with Apache License 2.0 5 votes vote down vote up

import org.apache.spark.{SparkConf, SparkContext}

import org.apache.spark.mllib.clustering.{LDA, DistributedLDAModel, LocalLDAModel}
import org.apache.spark.mllib.linalg.{Vector, Vectors}
import org.apache.spark.rdd.RDD
import scopt.OptionParser
object LDAExample {
  /**
   * Command-line settings for the LDA example job.
   *
   * @param inputPath     location of the corpus, read with `sc.objectFile`
   * @param outputPath    location the trained model is loaded from
   * @param numTopics     number of topics, passed to `LDA.setK`
   * @param maxIterations iteration limit, passed to `LDA.setMaxIterations`
   * @param optimizer     optimizer name, passed to `LDA.setOptimizer`
   * @param maxResultSize value written to `spark.driver.maxResultSize`
   */
  case class Params(
      inputPath: String = null,
      outputPath: String = null,
      numTopics: Int = 10,
      maxIterations: Int = 10,
      optimizer: String = "online",
      maxResultSize: String = "1g")
  /**
   * Entry point: parses the command line into a `Params` value and hands it to `run`,
   * exiting with status 1 when parsing fails.
   *
   * NOTE(review): the `opt[...]("name")` declarations that each `.text(...)`/`.action(...)`
   * pair should hang off are not present in this copy of the file, and neither are the
   * closing braces of the parser, the match and the method.
   *
   * @param args raw command-line arguments
   */
  def main(args: Array[String]): Unit = {
    val defaultParams = Params()
    val parser = new OptionParser[Params]("LDA") {
      head("LDA: an example app for LDA.")
        .text(s"optimizer, default: ${defaultParams.optimizer}")
        .action((x, c) => c.copy(optimizer = x))
        // The `s` prefix matters: without it the help text shows the literal
        // "${defaultParams.maxResultSize}" instead of the default value.
        .text(s"max resultSize, default: ${defaultParams.maxResultSize}")
        .action((x, c) => c.copy(maxResultSize = x))
        .text(s"number of Topics, default: ${defaultParams.numTopics}")
        .action((x, c) => c.copy(numTopics = x))
        .text(s"number of max iterations, default: ${defaultParams.maxIterations}")
        .action((x, c) => c.copy(maxIterations = x))
        .text("Input paths")
        .action((x, c) => c.copy(inputPath = x))
        .text("outputPath paths")
        .action((x, c) => c.copy(outputPath = x))

    parser.parse(args, defaultParams) match {
      case Some(params) => run(params)
      case _ => sys.exit(1)
  /**
   * Trains an LDA topic model on the corpus stored at `params.inputPath`, saves it to
   * `params.outputPath` and loads it back from there.
   *
   * NOTE(review): `LocalLDAModel.load` is used for the read-back; confirm that every
   * supported `params.optimizer` value produces a model that can be loaded this way.
   * The closing brace of this method is not present in this copy of the file.
   *
   * @param params job settings
   */
  def run(params: Params): Unit = {
    val conf = new SparkConf()
        .setAppName(s"LDA Example with $params")
        .set("spark.driver.maxResultSize", params.maxResultSize)
        .set("spark.shuffle.compress", "false")
        // NOTE(review): empty key and value - looks like a setting whose text was lost; confirm.
        .set("", "")
        .set("spark.smartCompress", "false")
    val sc = new SparkContext(conf)

    val corpus: RDD[(Long, Vector)] = sc.objectFile(params.inputPath)
    // Cluster the documents into numTopics topics using LDA
    val ldaModel = new LDA()
      .setK(params.numTopics)
      .setMaxIterations(params.maxIterations)
      .setOptimizer(params.optimizer)
      .run(corpus)

    // Save and load model.
    // The model must be written before it can be read back; previously only the load
    // was executed, so it depended on a model already existing at outputPath.
    ldaModel.save(sc, params.outputPath)
    val savedModel = LocalLDAModel.load(sc, params.outputPath)

Example 2
Source File: MultilayerPerceptronClassifierSuite.scala    From BigDatalog   with Apache License 2.0 5 votes vote down vote up

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.classification.LogisticRegressionSuite._
import org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS
import org.apache.spark.mllib.evaluation.MulticlassMetrics
import org.apache.spark.mllib.linalg.{Vector, Vectors}
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.mllib.util.TestingUtils._
import org.apache.spark.sql.Row

class MultilayerPerceptronClassifierSuite extends SparkFunSuite with MLlibTestSparkContext {

  // Checks that every prediction on the four XOR input rows equals its label.
  // NOTE(review): this copy of the test is truncated - the right-hand sides of `model`
  // and `predictionAndLabels` are incomplete and the closing braces are missing.
  test("XOR function learning as binary classification problem with two outputs.") {
    // The four XOR rows as (features, label).
    val dataFrame = sqlContext.createDataFrame(Seq(
        (Vectors.dense(0.0, 0.0), 0.0),
        (Vectors.dense(0.0, 1.0), 1.0),
        (Vectors.dense(1.0, 0.0), 1.0),
        (Vectors.dense(1.0, 1.0), 0.0))
    ).toDF("features", "label")
    // presumably layer sizes: 2 inputs, 5 hidden units, 2 outputs - `layers` is not used
    // in the visible lines, confirm against the trainer configuration
    val layers = Array[Int](2, 5, 2)
    val trainer = new MultilayerPerceptronClassifier()
    val model =
    val result = model.transform(dataFrame)
    val predictionAndLabels ="prediction", "label").collect()
    predictionAndLabels.foreach { case Row(p: Double, l: Double) =>
      assert(p == l)

  // TODO: implement a more rigorous test
  // Compares the multilayer perceptron's confusion matrix with that of multinomial
  // logistic regression on generated 3-class data (absolute tolerance 100).
  // NOTE(review): this copy of the test is truncated - the right-hand sides of `model`,
  // `numFeatures`, `lrModel` and `lrPredictionAndLabels` are incomplete and the closing
  // brace is missing.
  test("3 class classification with 2 hidden layers") {
    val nPoints = 1000

    // The following coefficients are taken from OneVsRestSuite.scala
    // they represent 3-class iris dataset
    val coefficients = Array(
      -0.57997, 0.912083, -0.371077, -0.819866, 2.688191,
      -0.16624, -0.84355, -0.048509, -0.301789, 4.170682)

    val xMean = Array(5.843, 3.057, 3.758, 1.199)
    val xVariance = Array(0.6856, 0.1899, 3.116, 0.581)
    // the input seed is somewhat magic, to make this test pass
    val rdd = sc.parallelize(generateMultinomialLogisticInput(
      coefficients, xMean, xVariance, true, nPoints, 1), 2)
    val dataFrame = sqlContext.createDataFrame(rdd).toDF("label", "features")
    val numClasses = 3
    val numIterations = 100
    val layers = Array[Int](4, 5, 4, numClasses)
    val trainer = new MultilayerPerceptronClassifier()
      .setSeed(11L) // currently this seed is ignored
    val model =
    val numFeatures ="features").first().getAs[Vector](0).size
    assert(model.numFeatures === numFeatures)
    val mlpPredictionAndLabels = model.transform(dataFrame).select("prediction", "label")
      .map { case Row(p: Double, l: Double) => (p, l) }
    // train multinomial logistic regression
    val lr = new LogisticRegressionWithLBFGS()
    val lrModel =
    val lrPredictionAndLabels = lrModel.predict(
    // MLP's predictions should not differ a lot from LR's.
    val lrMetrics = new MulticlassMetrics(lrPredictionAndLabels)
    val mlpMetrics = new MulticlassMetrics(mlpPredictionAndLabels)
    assert(mlpMetrics.confusionMatrix ~== lrMetrics.confusionMatrix absTol 100)
Example 3
Source File: IDFSuite.scala    From BigDatalog   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.feature

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vectors, Vector}
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.mllib.util.TestingUtils._

class IDFSuite extends SparkFunSuite with MLlibTestSparkContext {

  // Verifies the IDF weights and the resulting TF-IDF vectors for three small documents.
  // NOTE(review): this copy of the test is truncated - several closing parentheses/braces
  // and the right-hand sides of `model` and `localTfidf` are missing.
  test("idf") {
    val n = 4
    // Three documents over a vocabulary of n = 4 terms (two sparse, one dense).
    val localTermFrequencies = Seq(
      Vectors.sparse(n, Array(1, 3), Array(1.0, 2.0)),
      Vectors.dense(0.0, 1.0, 2.0, 3.0),
      Vectors.sparse(n, Array(1), Array(1.0))
    val m = localTermFrequencies.size
    val termFrequencies = sc.parallelize(localTermFrequencies, 2)
    val idf = new IDF
    val model =
    // Array(0, 3, 1, 2) is the number of documents with a non-zero count for terms 0..3;
    // each is mapped to log((m + 1) / (x + 1)).
    val expected = Vectors.dense(Array(0, 3, 1, 2).map { x =>
      math.log((m + 1.0) / (x + 1.0))
    assert(model.idf ~== expected absTol 1e-12)

    // Checks one TF-IDF result set: each value must equal tf * expected(termIndex).
    val assertHelper = (tfidf: Array[Vector]) => {
      assert(tfidf.size === 3)
      val tfidf0 = tfidf(0).asInstanceOf[SparseVector]
      assert(tfidf0.indices === Array(1, 3))
      assert(Vectors.dense(tfidf0.values) ~==
          Vectors.dense(1.0 * expected(1), 2.0 * expected(3)) absTol 1e-12)
      val tfidf1 = tfidf(1).asInstanceOf[DenseVector]
      assert(Vectors.dense(tfidf1.values) ~==
          Vectors.dense(0.0, 1.0 * expected(1), 2.0 * expected(2), 3.0 * expected(3)) absTol 1e-12)
      val tfidf2 = tfidf(2).asInstanceOf[SparseVector]
      assert(tfidf2.indices === Array(1))
      assert(tfidf2.values(0) ~== (1.0 * expected(1)) absTol 1e-12)
    // Transforms a RDD
    val tfidf = model.transform(termFrequencies).collect()
    // Transforms local vectors
    val localTfidf =

  // Same scenario as the plain "idf" test, but with an IDF built with minDocFreq = 1.
  // NOTE(review): this copy of the test is truncated - the `else` branch of the expected
  // value, several closing parentheses/braces and the right-hand sides of `model` and
  // `localTfidf` are missing.
  test("idf minimum document frequency filtering") {
    val n = 4
    // Three documents over a vocabulary of n = 4 terms (two sparse, one dense).
    val localTermFrequencies = Seq(
      Vectors.sparse(n, Array(1, 3), Array(1.0, 2.0)),
      Vectors.dense(0.0, 1.0, 2.0, 3.0),
      Vectors.sparse(n, Array(1), Array(1.0))
    val m = localTermFrequencies.size
    val termFrequencies = sc.parallelize(localTermFrequencies, 2)
    val idf = new IDF(minDocFreq = 1)
    val model =
    // Array(0, 3, 1, 2) is the number of documents with a non-zero count for terms 0..3;
    // only terms seen in at least one document get the log((m + 1) / (x + 1)) weight.
    val expected = Vectors.dense(Array(0, 3, 1, 2).map { x =>
      if (x > 0) {
        math.log((m + 1.0) / (x + 1.0))
      } else {
    assert(model.idf ~== expected absTol 1e-12)

    // Checks one TF-IDF result set: each value must equal tf * expected(termIndex).
    val assertHelper = (tfidf: Array[Vector]) => {
      assert(tfidf.size === 3)
      val tfidf0 = tfidf(0).asInstanceOf[SparseVector]
      assert(tfidf0.indices === Array(1, 3))
      assert(Vectors.dense(tfidf0.values) ~==
          Vectors.dense(1.0 * expected(1), 2.0 * expected(3)) absTol 1e-12)
      val tfidf1 = tfidf(1).asInstanceOf[DenseVector]
      assert(Vectors.dense(tfidf1.values) ~==
          Vectors.dense(0.0, 1.0 * expected(1), 2.0 * expected(2), 3.0 * expected(3)) absTol 1e-12)
      val tfidf2 = tfidf(2).asInstanceOf[SparseVector]
      assert(tfidf2.indices === Array(1))
      assert(tfidf2.values(0) ~== (1.0 * expected(1)) absTol 1e-12)
    // Transforms a RDD
    val tfidf = model.transform(termFrequencies).collect()
    // Transforms local vectors
    val localTfidf =

Example 4
Source File: HogHBaseCluster.scala    From hogzilla   with GNU General Public License v2.0 5 votes vote down vote up
package org.hogzilla.hbase

import org.apache.hadoop.hbase.client.Put
import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.rdd.RDD
import org.apache.spark.mllib.linalg.Vector
import org.apache.hadoop.hbase.client.Get
import org.apache.hadoop.hbase.client.Delete
import org.hogzilla.cluster.HogClusterMember

object HogHBaseCluster {

 // Builds the display title of a cluster, starting with "Group <clusterIdx> - ".
 // The visible filters split centroid entries into those with rate > 4.999 and those
 // with .9999 < rate < 5; the latter group is rendered in parentheses followed by "> 1%".
 // NOTE(review): this copy is truncated - the receivers of the `.filter` calls, the bodies
 // of the `.map` calls and the braces around the method body are missing.
 def formatClusterTitle(clusterCentroid: List[(Long,Double)], clusterIdx:Int):String =
   val mainTitle = 
   "Group "+clusterIdx.toString+" - "+
   .filter({case (port,rate) =>
            rate > 4.999
   .map({case (port,rate) =>
        }).mkString(", ")
   val onePercentList=
   .filter({case (port,rate) =>
            .9999 < rate & rate < 5
     mainTitle+", "+{case (port,rate) =>
        }).mkString("(",", ",")"+"> 1%")
 // Builds an HBase Delete keyed by the cluster index rendered as a string.
 // NOTE(review): the call that submits the Delete to a table is not visible in this copy.
 def deleteCluster(clusterIdx:Int)=
     val del = new Delete(Bytes.toBytes(clusterIdx.toString))
 // Builds an HBase Delete keyed by the member's IP string.
 // NOTE(review): the call that submits the Delete to a table is not visible in this copy.
 def deleteClusterMember(memberIP:String)=
     val del = new Delete(Bytes.toBytes(memberIP))
 // Builds an HBase Put for one cluster, keyed by the cluster index as a string.
 // All cells go to the "info" column family: title, size, centroid (formatted as
 // "[a,b,...]") and members (comma-separated).
 // NOTE(review): `put.add` is deprecated in newer HBase client versions, and the call that
 // submits the Put plus the closing brace are not visible in this copy.
 def saveCluster(clusterIdx:Int, clusterCentroid:List[(Long,Double)], clusterSize: Long, members:Array[String]) = {
     val memberString = members.mkString(",")
     val put = new Put(Bytes.toBytes(clusterIdx.toString))
     put.add(Bytes.toBytes("info"), Bytes.toBytes("title"), Bytes.toBytes(formatClusterTitle(clusterCentroid,clusterIdx)))
     put.add(Bytes.toBytes("info"), Bytes.toBytes("size"), Bytes.toBytes(clusterSize.toString))
     put.add(Bytes.toBytes("info"), Bytes.toBytes("centroid"), Bytes.toBytes(clusterCentroid.mkString("[",",","]")))
     put.add(Bytes.toBytes("info"), Bytes.toBytes("members"), Bytes.toBytes(memberString))
 // Builds an HBase Put for one cluster member, keyed by the member's IP.
 // Cells are spread over three column families: "info" (title), "cluster" (size,
 // centroid, idx) and "member" (ports, frequencies, ip, distance formatted as "%.2f").
 // NOTE(review): the "frequencies" expression is truncated in this copy (its receiver and
 // the tail of the `.map` are missing), as are the submit call and the closing brace.
 def saveClusterMember(clusterMember:HogClusterMember) = {
     val put = new Put(Bytes.toBytes(clusterMember.memberIP.toString))
     put.add(Bytes.toBytes("info"),   Bytes.toBytes("title"),      Bytes.toBytes(clusterMember.formatTitle))
     put.add(Bytes.toBytes("cluster"),Bytes.toBytes("size"),       Bytes.toBytes(clusterMember.clusterSize.toString))
     put.add(Bytes.toBytes("cluster"),Bytes.toBytes("centroid"),   Bytes.toBytes(clusterMember.centroid.mkString("[",",","]")))
     put.add(Bytes.toBytes("cluster"),Bytes.toBytes("idx"),        Bytes.toBytes(clusterMember.clusterIdx.toString))
     put.add(Bytes.toBytes("member"), Bytes.toBytes("ports"),      Bytes.toBytes("TCP: "+clusterMember.ports.mkString(""," ","")))
     put.add(Bytes.toBytes("member"), Bytes.toBytes("frequencies"),Bytes.toBytes("TCP: "+
                                                                           .filter({case (port,freq) => clusterMember.ports.contains(port)})
                                                                           .map({case (port,freq) => port.toString+"="+
                                                                           .mkString(""," ","")
     put.add(Bytes.toBytes("member"), Bytes.toBytes("ip"),         Bytes.toBytes(clusterMember.memberIP))
     put.add(Bytes.toBytes("member"), Bytes.toBytes("distance"),   Bytes.toBytes("%.2f".format(clusterMember.distance)))

Example 5
Source File: Embedding.scala    From Mastering-Spark-for-Data-Science   with MIT License 5 votes vote down vote up
package io.gzet.story.linalg

import org.apache.spark.mllib.linalg.{Vector, Vectors}

/**
 * Embedding that copies the input's values into a new dense vector.
 *
 * NOTE(review): the closing brace of this object is not present in this copy of the file.
 */
case object DenseEmbedding extends Embedding {
  def embed(v: Vector): Vector = Vectors.dense(v.toArray)

/**
 * Factory for [[Embedding]] implementations selected by name.
 *
 * NOTE(review): the name constants matched in `apply` (IDENTITY_EMBEDDING, ...) and the
 * closing braces are not visible in this copy of the file.
 */
object Embedding {

  // Target dimensions for the three random-index embedding presets.
  val lowDimension = 64
  val mediumDimension = 256
  val highDimension = 1024
  // Second constructor argument of every RandomIndexEmbedding built below.
  val epsilon = 0.01

  /**
   * Returns the embedding registered under `embeddingName`.
   *
   * Throws a RuntimeException when the name matches none of the known constants.
   */
  def apply(embeddingName: String): Embedding = {
    embeddingName match {
      case IDENTITY_EMBEDDING => IdentityEmbedding
      case DENSE_EMBEDDING => DenseEmbedding
      case LOW_DIMENSIONAL_RI => new RandomIndexEmbedding(lowDimension, epsilon, seed = 0)
      case MEDIUM_DIMENSIONAL_RI => new RandomIndexEmbedding(mediumDimension, epsilon, seed = 0)
      case HIGH_DIMENSIONAL_RI => new RandomIndexEmbedding(highDimension, epsilon, seed = 0)
      case _ => throw new RuntimeException(s"unknown embedding name $embeddingName")
Example 6
package com.chapter16.SparkTesting

import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.mllib.linalg.{Vector, Vectors}

  /**
   * Sums the distances from `centroid` to every point in `vPoints`.
   *
   * NOTE(review): this copy is truncated - the per-partition result expression, the
   * method's return expression and several closing braces are missing.
   *
   * @param sc       context used to broadcast the centroid
   * @param vPoints  points to measure
   * @param centroid reference point
   */
  def calcDistance(sc: SparkContext, vPoints: RDD[Vector], centroid: Vector): Double = {

    // Broadcast centroid to all partitions
    val bcCentroid = sc.broadcast(centroid)

    // For each partition, calculate the sum of distances from centroid to each of the points in
    // that partition. Then, sum up the partial sums from all the partitions.

    val accmDistance = vPoints.mapPartitions{ points => {
      var sum = 0.0
      points.foreach { point => {
        sum += EuclideanVectorSpace.distance(point, bcCentroid.value)
    }}.reduce(_ + _)
Example 7
Source File: LogisticRegressionRecommender.scala    From wordpress-posts-recommender   with Apache License 2.0 5 votes vote down vote up
package wordpressworkshop

import{LogisticRegression, LogisticRegressionModel}
import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.DataFrame

/**
 * Logistic-regression based recommender built from a training DataFrame.
 *
 * NOTE(review): this copy is truncated - the right-hand side of `model`, the receiver of
 * the `.map` in `likeScores` and the closing braces are missing.
 */
case class LogisticRegressionRecommender(training: DataFrame) {

  val lr = new LogisticRegression()
  // Training parameters: 20 iterations, regularisation 0.01, probabilities written to
  // the "probability" column.
  val paramMap = ParamMap(lr.maxIter -> 20)
                 .put(lr.regParam -> 0.01)
                 .put(lr.probabilityCol -> "probability")

  val model: LogisticRegressionModel =, paramMap)

  /** Binary-classification metrics over (probability at index 1, label) pairs of `testData`. */
  def metrics(testData: DataFrame) = {
    val predictionAndLabels: RDD[(Double, Double)] =
      model.transform(testData).map(row => row.getAs[Vector]("probability")(1) -> row.getAs[Double]("label"))

    new BinaryClassificationMetrics(predictionAndLabels)

  /** Yields (userId, postId, probability at index 1) for every row. */
  def likeScores(testData: DataFrame): RDD[(Long, Long, Double)] =
    .map(row => (row.getAs[Long]("userId"), row.getAs[Long]("postId"), row.getAs[Vector]("probability")(1)))
Example 8
Source File: Utils.scala    From awesome-recommendation-engine   with Apache License 2.0 5 votes vote down vote up
package com.databricks.apps.twitter_classifier

import org.apache.commons.cli.{Options, ParseException, PosixParser}
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.mllib.feature.HashingTF
import twitter4j.auth.OAuthAuthorization
import twitter4j.conf.ConfigurationBuilder

object Utils {

  val numFeatures = 1000
  val tf = new HashingTF(numFeatures)

  val CONSUMER_KEY = "consumerKey"
  val CONSUMER_SECRET = "consumerSecret"
  val ACCESS_TOKEN = "accessToken"
  val ACCESS_TOKEN_SECRET = "accessTokenSecret"

  // Command-line option set: one argument-taking option per Twitter OAuth credential.
  // NOTE(review): the block's result expression and closing brace are missing in this copy.
  val THE_OPTIONS = {
    val options = new Options()
    options.addOption(CONSUMER_KEY, true, "Twitter OAuth Consumer Key")
    options.addOption(CONSUMER_SECRET, true, "Twitter OAuth Consumer Secret")
    options.addOption(ACCESS_TOKEN, true, "Twitter OAuth Access Token")
    options.addOption(ACCESS_TOKEN_SECRET, true, "Twitter OAuth Access Token Secret")

  /**
   * Parses `args` against `THE_OPTIONS` and publishes the Twitter OAuth credentials found
   * there as `twitter4j.oauth.*` system properties.
   *
   * Credentials are taken from the command line only. They used to be hard-coded here,
   * which leaks secrets through source control and ignores what the caller passed in.
   * SECURITY: the previously committed keys must be treated as compromised and revoked.
   *
   * An option that is absent leaves its system property untouched, so a value supplied
   * by other means is not overwritten and `System.setProperty` is never given null.
   *
   * NOTE(review): the tail of this method (anything after the error message and the
   * closing braces) is not present in this copy of the file.
   *
   * @param args raw command-line arguments
   */
  def parseCommandLineWithTwitterCredentials(args: Array[String]) = {
    val parser = new PosixParser
    try {
      val cl = parser.parse(THE_OPTIONS, args)
      Seq(
        "twitter4j.oauth.consumerKey" -> CONSUMER_KEY,
        "twitter4j.oauth.consumerSecret" -> CONSUMER_SECRET,
        "twitter4j.oauth.accessToken" -> ACCESS_TOKEN,
        "twitter4j.oauth.accessTokenSecret" -> ACCESS_TOKEN_SECRET
      ).foreach { case (property, option) =>
        // getOptionValue returns null for an option that was not supplied.
        Option(cl.getOptionValue(option)).foreach(value => System.setProperty(property, value))
      }
    } catch {
      case e: ParseException =>
        System.err.println("Parsing failed.  Reason: " + e.getMessage)

  // Wraps an OAuthAuthorization built from a default ConfigurationBuilder in Some.
  // presumably the builder picks up the twitter4j.oauth.* system properties - verify
  // against the twitter4j documentation.
  // NOTE(review): the closing brace is missing in this copy.
  def getAuth = {
    Some(new OAuthAuthorization(new ConfigurationBuilder().build()))

  // Turns a string into a feature vector.
  // NOTE(review): the body is missing in this copy; presumably it hashes the text with
  // the `tf` HashingTF declared in this object - confirm against the original source.
  def featurize(s: String): Vector = {

  // Extractor for integer strings: yields None when a NumberFormatException is thrown.
  // NOTE(review): the body of the `try` (the actual conversion) and the closing braces
  // are missing in this copy.
  object IntParam {
    def unapply(str: String): Option[Int] = {
      try {
      } catch {
        case e: NumberFormatException => None
Example 9
Source File: Gradient.scala    From zen   with Apache License 2.0 5 votes vote down vote up

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.mllib.linalg.{DenseVector, Vector, Vectors}

  /**
   * Folds a whole iterator of examples through the per-example `compute` overload.
   *
   * Each element is passed as `compute(t._2, t._1, weights, cumGradient)`, i.e. the
   * vector first and the double second; the returned losses are summed.
   * presumably `cumGradient` is updated in place by that overload - it is defined
   * outside this view, confirm there.
   * NOTE(review): the closing braces of the loop and the method are missing in this copy.
   *
   * @return (number of examples consumed, summed loss)
   */
  def compute(
    iter: Iterator[(Double, Vector)],
    weights: Vector,
    cumGradient: Vector): (Long, Double) = {
    var loss = 0D
    var count = 0L
    iter.foreach { t =>
      loss += compute(t._2, t._1, weights, cumGradient)
      count += 1
    (count, loss)
Example 10
Source File: LDADataGenerator.scala    From Swallow   with Apache License 2.0 5 votes vote down vote up


import java.util.Random

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.annotation.{DeveloperApi, Since}
import org.apache.spark.mllib.linalg.{Vector, Vectors}
import scala.collection.mutable.{HashMap => MHashMap}
import org.apache.spark.rdd.RDD

  /**
   * Generates a synthetic corpus of `numDocs` documents as (docIndex, sparse term-count
   * vector of size `numVocab`) pairs.
   *
   * Each document draws a length uniformly from [docLenMin, docLenMax] and then that many
   * random vocabulary indices, counting repeats. The random generator is seeded with
   * `seed ^ partitionIndex`.
   *
   * NOTE(review): this copy is truncated - the per-partition `map` call, the expression
   * building `iter`, the method's result expression and the closing braces are missing.
   */
  def generateLDARDD(
    sc: SparkContext,
    numDocs: Long,
    numVocab: Int,
    docLenMin: Int,
    docLenMax: Int,
    numParts: Int = 3,
    seed: Long = System.currentTimeMillis()): RDD[(Long, Vector)] = {
    val data = sc.parallelize(0L until numDocs, numParts).mapPartitionsWithIndex { 
      (idx, part) =>
        val rng = new Random(seed ^ idx) { case docIndex =>
          var currentSize = 0
          val entries = MHashMap[Int, Int]()
          val docLength = rng.nextInt(docLenMax - docLenMin + 1) + docLenMin
          // Draw docLength term occurrences, accumulating a count per vocabulary index.
          while (currentSize < docLength) {
            val index = rng.nextInt(numVocab)
            entries(index) = entries.getOrElse(index, 0) + 1
            currentSize += 1

          val iter = => (v._1, v._2.toDouble))
          (docIndex, Vectors.sparse(numVocab, iter))

  /**
   * Entry point of the LDA data generator.
   *
   * Expects exactly five arguments: output path, number of documents, vocabulary size,
   * minimum document length and maximum document length.
   *
   * NOTE(review): this copy is truncated - the `else` branch, the code that writes `data`
   * to `outputPath` and the closing braces are missing.
   *
   * @param args raw command-line arguments
   */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("LDADataGenerator")
    val sc = new SparkContext(conf)

    var outputPath = ""
    var numDocs: Long = 500L
    var numVocab: Int = 1000
    var docLenMin: Int = 50
    var docLenMax: Int = 10000
    val parallel = sc.getConf.getInt("spark.default.parallelism", sc.defaultParallelism)
    // Partition count: the HiBench property when set, otherwise half the default parallelism.
    val numPartitions = IOCommon.getProperty("hibench.default.shuffle.parallelism")
      .getOrElse((parallel / 2).toString).toInt

    if (args.length == 5) {
      outputPath = args(0)
      // numDocs is a Long: parse it as one so counts above Int.MaxValue are accepted.
      numDocs = args(1).toLong
      numVocab = args(2).toInt
      docLenMin = args(3).toInt
      docLenMax = args(4).toInt
      println(s"Output Path: $outputPath")
      println(s"Num of Documents: $numDocs")
      println(s"Vocabulary size: $numVocab")
    } else {

    val data = generateLDARDD(sc, numDocs, numVocab, docLenMin, docLenMax, numPartitions)


Example 11
Source File: RatingDataGenerator.scala    From Swallow   with Apache License 2.0 5 votes vote down vote up


import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.recommendation.Rating
import org.apache.spark.rdd.RDD
import org.apache.spark.SparkContext
import org.apache.spark.mllib.random._
import org.apache.spark.rdd.{PairRDDFunctions, RDD}
import org.apache.spark.mllib.linalg.{Vectors, Vector}

object RatingDataGenerator {

  /**
   * Entry point of the rating data generator.
   *
   * Expects exactly five arguments: output path, number of users, number of products,
   * sparsity and the implicit-preferences flag. Starts from a random normal matrix of
   * numUsers x numProducts and keeps each entry with probability `sparsity`.
   *
   * NOTE(review): this copy is truncated - the `else` branch, the receiver of the
   * transformation assigned to `data`, everything after the sparsifying loop and the
   * closing braces are missing. `implicitPrefs` is not used in the visible lines.
   */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("RatingDataGeneration")
    val sc = new SparkContext(conf)

    var outputPath = ""
    var numUsers: Int = 100
    var numProducts: Int = 100
    var sparsity: Double = 0.05
    var implicitPrefs: Boolean = false
    val parallel = sc.getConf.getInt("spark.default.parallelism", sc.defaultParallelism)
    // Partition count: the HiBench property when set, otherwise half the default parallelism.
    val numPartitions = IOCommon.getProperty("hibench.default.shuffle.parallelism")
      .getOrElse((parallel / 2).toString).toInt

    if (args.length == 5) {
      outputPath = args(0)
      numUsers = args(1).toInt
      numProducts = args(2).toInt
      sparsity = args(3).toDouble
      implicitPrefs = args(4).toBoolean

      println(s"Output Path: $outputPath")
      println(s"Num of Users: $numUsers")
      println(s"Num of Products: $numProducts")
      println(s"Sparsity: $sparsity")
      println(s"Implicit Prefs: $implicitPrefs")
    } else {

    val rawData: RDD[Vector] = RandomRDDs.normalVectorRDD(sc, numUsers, numProducts, numPartitions)
    val rng = new java.util.Random()
    val data ={v =>
      // Copy an entry into the zero-filled array only when the random draw is <= sparsity.
      val a = Array.fill[Double](v.size)(0.0)
      v.foreachActive{(i,vi) =>
         if(rng.nextDouble <= sparsity){
           a(i) = vi


Example 12
Source File: ProbabilisticClassifierSuite.scala    From BigDatalog   with Apache License 2.0 5 votes vote down vote up

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.linalg.{Vector, Vectors}

/**
 * Minimal ProbabilisticClassificationModel used by the tests in this file.
 *
 * NOTE(review): this copy is truncated - the `copy` signature is garbled and the bodies
 * of `predictRaw`, `raw2probabilityInPlace` and `friendlyPredict` plus all closing braces
 * are missing.
 */
final class TestProbabilisticClassificationModel(
    override val uid: String,
    override val numFeatures: Int,
    override val numClasses: Int)
  extends ProbabilisticClassificationModel[Vector, TestProbabilisticClassificationModel] {

  override def copy(extra: this.type = defaultCopy(extra)

  override protected def predictRaw(input: Vector): Vector = {

  override protected def raw2probabilityInPlace(rawPrediction: Vector): Vector = {

  // Public prediction entry point used by the tests.
  def friendlyPredict(input: Vector): Double = {

/**
 * Tests prediction through `friendlyPredict` with and without thresholds.
 *
 * NOTE(review): this copy is truncated - `thresholds` is declared but never applied to the
 * model in the visible lines, and the closing braces are missing.
 */
class ProbabilisticClassifierSuite extends SparkFunSuite {

  test("test thresholding") {
    val thresholds = Array(0.5, 0.2)
    val testModel = new TestProbabilisticClassificationModel("myuid", 2, 2)
    assert(testModel.friendlyPredict(Vectors.dense(Array(1.0, 1.0))) === 1.0)
    assert(testModel.friendlyPredict(Vectors.dense(Array(1.0, 0.2))) === 0.0)

  test("test thresholding not required") {
    val testModel = new TestProbabilisticClassificationModel("myuid", 2, 2)
    assert(testModel.friendlyPredict(Vectors.dense(Array(1.0, 2.0))) === 1.0)

/**
 * Shared parameter settings for probabilistic classifier tests.
 *
 * NOTE(review): the closing parenthesis of the Map and the closing brace of the object
 * are missing in this copy.
 */
object ProbabilisticClassifierSuite {

  // Extends ClassifierSuite.allParamSettings with the probability column and thresholds.
  val allParamSettings: Map[String, Any] = ClassifierSuite.allParamSettings ++ Map(
    "probabilityCol" -> "myProbability",
    "thresholds" -> Array(0.4, 0.6)

Example 13
Source File: SVDDataGenerator.scala    From Swallow   with Apache License 2.0 5 votes vote down vote up


import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.linalg.{Vectors,Vector}
import org.apache.spark.rdd.RDD
import org.apache.spark.mllib.linalg.distributed.RowMatrix
import org.apache.spark.mllib.random.RandomRDDs

object SVDDataGenerator {
  /**
   * Generates the rows of a random matrix via `RandomRDDs.normalVectorRDD`, passing
   * `m`, `n`, `numPartitions` and `seed` straight through.
   *
   * NOTE(review): the result expression and the closing brace are missing in this copy.
   */
  def generateDistributedRowMatrix(
      sc: SparkContext,
      m: Long,
      n: Int,
      numPartitions: Int,
      seed: Long = System.currentTimeMillis()): RDD[Vector] = {
    val data: RDD[Vector] = RandomRDDs.normalVectorRDD(sc, m, n, numPartitions, seed)

  /**
   * Entry point of the SVD data generator.
   *
   * Expects exactly three arguments: output path, number of examples and number of
   * features.
   *
   * NOTE(review): this copy is truncated - the statement that prints the usage string,
   * the code that writes `data` to `outputPath` and the closing braces are missing.
   * `$SVDDataGenerator` in the usage text interpolates the object itself, not its name.
   */
  def main(args: Array[String]) {
    val conf = new SparkConf().setAppName("SVDDataGenerator")
    val sc = new SparkContext(conf)

    var outputPath = ""
    var numExamples: Int = 200000
    var numFeatures: Int = 20
    val parallel = sc.getConf.getInt("spark.default.parallelism", sc.defaultParallelism)
    // Partition count: the HiBench property when set, otherwise half the default parallelism.
    val numPartitions = IOCommon.getProperty("hibench.default.shuffle.parallelism")
      .getOrElse((parallel / 2).toString).toInt

    if (args.length == 3) {
      outputPath = args(0)
      numExamples = args(1).toInt
      numFeatures = args(2).toInt
      println(s"Output Path: $outputPath")
      println(s"Num of Examples: $numExamples")
      println(s"Num of Features: $numFeatures")
    } else {
        s"Usage: $SVDDataGenerator <OUTPUT_PATH> <NUM_EXAMPLES> <NUM_FEATURES>"

    val data = generateDistributedRowMatrix(sc, numExamples, numFeatures, numPartitions)


Example 14
Source File: SVDExample.scala    From Swallow   with Apache License 2.0 5 votes vote down vote up

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.mllib.linalg.Matrix
import org.apache.spark.mllib.linalg.SingularValueDecomposition
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.mllib.linalg.distributed.RowMatrix
import org.apache.spark.rdd.RDD

import scopt.OptionParser

object SVDExample {

  /**
   * Command-line settings for the SVD example job.
   *
   * `numSingularValues` and `computeU` are passed to `RowMatrix.computeSVD`,
   * `maxResultSize` is written to `spark.driver.maxResultSize` and `dataPath` is read
   * with `sc.objectFile`. `numFeatures` is copied into a local in `run` but not used by
   * the visible code.
   *
   * NOTE(review): the closing parenthesis of the parameter list is missing in this copy.
   */
  case class Params(
    numFeatures: Int = 0,
    numSingularValues: Int = 0,
    computeU: Boolean = true,
    maxResultSize: String = "1g",
    dataPath: String = null

  /**
   * Entry point: parses the command line into a `Params` value and hands it to `run`,
   * exiting with status 1 when parsing fails.
   *
   * NOTE(review): the `opt[...]("name")` declarations that each `.text(...)`/`.action(...)`
   * pair should hang off are not present in this copy of the file, and neither are the
   * closing braces of the parser, the match and the method.
   */
  def main(args: Array[String]): Unit = {
    val defaultParams = Params()
    val parser = new OptionParser[Params]("SVD") {
      head("SVD: an example of SVD for matrix decomposition.")
        .text(s"numFeatures, default: ${defaultParams.numFeatures}")
        .action((x,c) => c.copy(numFeatures = x))
        .text(s"numSingularValues, default: ${defaultParams.numSingularValues}")
        .action((x,c) => c.copy(numSingularValues = x))
        .text(s"computeU, default: ${defaultParams.computeU}")
        .action((x,c) => c.copy(computeU = x))
        .text(s"maxResultSize, default: ${defaultParams.maxResultSize}")
        .action((x,c) => c.copy(maxResultSize = x))
        .text("data path of SVD")
        .action((x,c) => c.copy(dataPath = x))
     parser.parse(args, defaultParams) match {
       case Some(params) => run(params)
       case _ => sys.exit(1)

  /**
   * Loads the row vectors stored at `params.dataPath`, wraps them in a RowMatrix and
   * computes its singular value decomposition.
   *
   * NOTE(review): the closing brace is missing in this copy; `numFeatures`, `U`, `s` and
   * `V` are not used by the visible code.
   */
  def run(params: Params): Unit = {

    val conf = new SparkConf()
        .setAppName(s"SVD with $params")
        .set("spark.driver.maxResultSize", params.maxResultSize)
        .set("spark.shuffle.compress", "false")
        // NOTE(review): empty key and value - looks like a setting whose text was lost; confirm.
        .set("", "")
        .set("spark.smartCompress", "false")
    val sc = new SparkContext(conf)

    val dataPath = params.dataPath
    val numFeatures = params.numFeatures
    val numSingularValues = params.numSingularValues
    val computeU = params.computeU

    val data: RDD[Vector] = sc.objectFile(dataPath) 
    val mat: RowMatrix = new RowMatrix(data)

    val svd: SingularValueDecomposition[RowMatrix, Matrix] = mat.computeSVD(numSingularValues, computeU)
    val U: RowMatrix = svd.U  // The U factor is a RowMatrix.
    val s: Vector = svd.s  // The singular values are stored in a local dense vector.
    val V: Matrix = svd.V  // The V factor is a local dense matrix.

Example 15
Source File: Featurizer.scala    From lexrank-summarizer   with MIT License 5 votes vote down vote up
package io.github.karlhigley.lexrank

import org.apache.spark.SparkContext

import org.apache.spark.rdd.RDD

import org.apache.spark.mllib.feature.{HashingTF, IDF}
import org.apache.spark.mllib.linalg.{SparseVector, Vector}

case class SentenceFeatures(id: Long, docId: String, features: SparseVector)

/**
 * Turns tokenised sentences into sparse TF-IDF feature vectors, dropping stopword
 * indices and discarding sentences left with no features.
 *
 * NOTE(review): this copy is heavily truncated - the receivers of several calls in
 * `apply`, the bodies of `indexOf`, `identifyStopwords` and `featureTuples`, part of
 * `removeStopwords` and all closing braces are missing.
 */
class Featurizer(numStopwords: Int = 0) extends Serializable {
  private val hashingTF = new HashingTF()
  // Orders (index, value) pairs by their second component.
  private val byIDF = Ordering[Double].on[(Int,Double)](_._2)

  def apply(tokens: RDD[SentenceTokens]) : RDD[SentenceFeatures] = {
    val idf = new IDF(minDocFreq = 2)

    val termFrequencies = => {
        (, t.docId, hashingTF.transform(t.tokens))
    val idfModel ={ case (_, _, tf) => tf }))

    val stopwordIndices = identifyStopwords(idfModel.idf.toSparse, numStopwords)

        case (id, docId, tf) =>
          val tfidf = idfModel.transform(tf).toSparse
          val features = removeStopwords(tfidf, stopwordIndices)
          SentenceFeatures(id, docId, features)
      // Drop sentences whose feature vector ended up empty.
      .filter(_.features.indices.size > 0)

  def indexOf(token: String): Int = {

  private def identifyStopwords(idf: SparseVector, numStopwords: Int) = {

  // Rebuilds the vector (same size) without the entries whose index is a stopword index.
  private def removeStopwords(tf: SparseVector, stopwordIndices: Array[Int]) = {
    val (indices, values) =
          .filter(p => !stopwordIndices.contains(p._1))
    new SparseVector(tf.size, indices.toArray, values.toArray)

  private def featureTuples(featureVector: SparseVector) = {
Example 16
Source File: MlLibOnKudu.scala    From Taxi360   with Apache License 2.0 5 votes vote down vote up

import{NyTaxiYellowTrip, NyTaxiYellowTripBuilder}
import org.apache.spark.mllib.clustering.KMeans
import org.apache.spark.mllib.linalg.{Matrix, Vector, Vectors}
import org.apache.spark.mllib.stat.Statistics
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SQLContext
import org.apache.spark.{SparkConf, SparkContext}

object MlLibOnKudu {
  /**
   * Runs KMeans, Pearson correlation and column statistics over taxi-trip vectors.
   *
   * Arguments: runLocal ("l" for a local context), kuduMaster, taxiTable, numOfCenters,
   * numOfIterations.
   *
   * NOTE(review): this copy is truncated - no early exit is visible after the usage
   * message (so with no arguments the code falls through to `args(0)`), the Kudu load
   * statement and the row-to-vector mapping are garbled, and closing braces are missing.
   */
  def main(args: Array[String]): Unit = {

    if (args.length == 0) {
      println("Args: <runLocal> " +
        "<kuduMaster> " +
        "<taxiTable> " +
        "<numOfCenters> " +
        "<numOfIterations> ")

    val runLocal = args(0).equalsIgnoreCase("l")
    val kuduMaster = args(1)
    val taxiTable = args(2)
    val numOfCenters = args(3).toInt
    val numOfIterations = args(4).toInt

    // Local mode turns off broadcast/shuffle compression; otherwise a default conf is used.
    val sc: SparkContext = if (runLocal) {
      val sparkConfig = new SparkConf()
      sparkConfig.set("spark.broadcast.compress", "false")
      sparkConfig.set("spark.shuffle.compress", "false")
      sparkConfig.set("spark.shuffle.spill.compress", "false")
      new SparkContext("local", "TableStatsSinglePathMain", sparkConfig)
    } else {
      val sparkConfig = new SparkConf().setAppName("TableStatsSinglePathMain")
      new SparkContext(sparkConfig)

    val sqlContext = new SQLContext(sc)

    val kuduOptions = Map(
      "kudu.table" -> taxiTable,
      "kudu.master" -> kuduMaster)"org.apache.kudu.spark.kudu").load.

    val vectorRDD:RDD[Vector] = sqlContext.sql("select * from ny_taxi_trip_tmp").map(r => {
      val taxiTrip =

    println("--Running KMeans")
    val clusters = KMeans.train(vectorRDD, numOfCenters, numOfIterations)
    println(" > vector centers:")
    clusters.clusterCenters.foreach(v => println(" >> " + v))

    println("--Running corr")
    val correlMatrix: Matrix = Statistics.corr(vectorRDD, "pearson")
    println(" > corr: " + correlMatrix.toString)

    println("--Running colStats")
    val colStats = Statistics.colStats(vectorRDD)
    println(" > max: " + colStats.max)
    println(" > count: " + colStats.count)
    println(" > mean: " + colStats.mean)
    println(" > min: " + colStats.min)
    println(" > normL1: " + colStats.normL1)
    println(" > normL2: " + colStats.normL2)
    println(" > numNonZeros: " + colStats.numNonzeros)
    println(" > variance: " + colStats.variance)

    //Labeled Points
Example 17
Source File: VectorDistance.scala    From cosine-lsh-join-spark   with MIT License 5 votes vote down vote up
package com.soundcloud.lsh

import com.github.fommil.netlib.BLAS.{getInstance => blas}
import org.apache.spark.mllib.linalg.Vector

object Cosine extends VectorDistance {

  // Cosine similarity of two MLlib vectors; delegates to another `apply` overload.
  // NOTE(review): the right-hand sides of `v1` and `v2` are missing in this copy -
  // presumably each vector is converted to an Array[Float]; confirm against the original.
  // The closing brace is missing as well.
  def apply(vecA: Vector, vecB: Vector): Double = {
    val v1 =
    val v2 =
    apply(v1, v2)

  /**
   * Cosine similarity of two float arrays, computed with BLAS level-1 routines.
   *
   * The element count is taken from `vecA` for both arrays. When either norm is zero
   * the result is 0.0 instead of a division by zero.
   */
  def apply(vecA: Array[Float], vecB: Array[Float]): Double = {
    val size = vecA.length
    val normA = blas.snrm2(size, vecA, 1)
    val normB = blas.snrm2(size, vecB, 1)
    val hasZeroNorm = normA == 0 || normB == 0
    if (hasZeroNorm) 0.0
    else blas.sdot(size, vecA, 1, vecB, 1) / normA / normB

Example 18
Source File: NearestNeighboursTest.scala    From cosine-lsh-join-spark   with MIT License 5 votes vote down vote up
package com.soundcloud.lsh

import org.apache.spark.mllib.linalg.distributed.{IndexedRow, IndexedRowMatrix, MatrixEntry}
import org.apache.spark.mllib.linalg.{Vector, Vectors}
import org.scalatest.{FunSuite, Matchers}

class NearestNeighboursTest
  extends FunSuite
  with SparkLocalContext
  with Matchers {

  // Test helper that builds a Vector from the given doubles.
  // NOTE(review): the body is missing in this copy; presumably it wraps Vectors.dense.
  def denseVector(input: Double*): Vector = {

  // Joins a four-row matrix with itself through NearestNeighbours(Cosine, 0.0, 1.0) and
  // compares the collected entries with a fixed expected list.
  // NOTE(review): the closing parentheses of `rows` and `expected` and the closing brace
  // of the test are missing in this copy.
  test("nearest neighbours cosine") {
    // vecA and vecD are identical, vecC points the opposite way, vecB is orthogonal.
    val vecA = denseVector(1.0, 0.0)
    val vecB = denseVector(0.0, 1.0)
    val vecC = denseVector(-1.0, 0.0)
    val vecD = denseVector(1.0, 0.0)

    val rows = Seq(
      IndexedRow(0, vecA),
      IndexedRow(1, vecB),
      IndexedRow(2, vecC),
      IndexedRow(3, vecD)
    val indexedMatrix = new IndexedRowMatrix(sc.parallelize(rows))

    val nearestNeighbour = new NearestNeighbours(Cosine, 0.0, 1.0)
    val got = nearestNeighbour.join(indexedMatrix)

    val expected = Seq(
      MatrixEntry(0, 1, 0.0),
      MatrixEntry(0, 3, 1.0),
      MatrixEntry(1, 2, 0.0),
      MatrixEntry(1, 3, 0.0)
    val gotEntries = got.entries.collect().toSeq
    gotEntries should be(expected)

Example 19
Source File: QueryHammingTest.scala    From cosine-lsh-join-spark   with MIT License 5 votes vote down vote up
package com.soundcloud.lsh

import com.soundcloud.TestHelper
import org.apache.spark.mllib.linalg.distributed.{IndexedRow, IndexedRowMatrix, MatrixEntry}
import org.apache.spark.mllib.linalg.{Vector, Vectors}
import org.scalatest.{FunSuite, Matchers}

/**
 * Checks the QueryHamming query-to-catalog join in both broadcast modes against
 * exact cosine similarities.
 */
class QueryHammingTest
  extends FunSuite
    with SparkLocalContext
    with Matchers
    with TestHelper {

  /** Builds a dense MLlib vector from the given components. */
  def denseVector(input: Double*): Vector = {
    Vectors.dense(input.toArray)
  }

  val queryVectorA = denseVector(1.0, 1.0)
  val queryVectorB = denseVector(-1.0, 1.0)

  val catalogVectorA = denseVector(1.0, 1.0)
  val catalogVectorB = denseVector(-1.0, 1.0)
  val catalogVectorC = denseVector(-1.0, 0.5)
  val catalogVectorD = denseVector(1.0, 0.5)

  val queryRows = Seq(
    IndexedRow(0, queryVectorA),
    IndexedRow(1, queryVectorB)
  )

  val catalogRows = Seq(
    IndexedRow(0, catalogVectorA),
    IndexedRow(1, catalogVectorB),
    IndexedRow(2, catalogVectorC),
    IndexedRow(3, catalogVectorD)
  )

  // Expected (query, catalog) pairs, ordered by (i, j) to match the sorted result.
  val expected = Array(
    MatrixEntry(0, 0, Cosine(queryVectorA, catalogVectorA)),
    MatrixEntry(0, 3, Cosine(queryVectorA, catalogVectorD)),
    MatrixEntry(1, 1, Cosine(queryVectorB, catalogVectorB)),
    MatrixEntry(1, 2, Cosine(queryVectorB, catalogVectorC))
  )

  test("broadcast catalog") {
    val queryMatrix = new IndexedRowMatrix(sc.parallelize(queryRows))
    val catalogMatrix = new IndexedRowMatrix(sc.parallelize(catalogRows))

    val queryNearestNeighbour = new QueryHamming(0.1, 10000, 2, true)
    val got = queryNearestNeighbour.join(queryMatrix, catalogMatrix).entries.collect

    // Compare entries through the tolerance-based equality built with 0.02.
    implicit val equality = new MatrixEquality(0.02)
    got.sortBy(t => (t.i, t.j)) should equal(expected)
  }

  test("broadcast query") {
    val queryMatrix = new IndexedRowMatrix(sc.parallelize(queryRows))
    val catalogMatrix = new IndexedRowMatrix(sc.parallelize(catalogRows))

    val queryNearestNeighbour = new QueryHamming(0.1, 10000, 2, false)
    val got = queryNearestNeighbour.join(queryMatrix, catalogMatrix).entries.collect

    implicit val equality = new MatrixEquality(0.02)
    got.sortBy(t => (t.i, t.j)) should equal(expected)
  }
}

Example 20
Source File: QueryNearestNeighboursTest.scala    From cosine-lsh-join-spark   with MIT License 5 votes vote down vote up
package com.soundcloud.lsh

import org.apache.spark.mllib.linalg.distributed.{IndexedRow, IndexedRowMatrix, MatrixEntry}
import org.apache.spark.mllib.linalg.{Vector, Vectors}
import org.scalatest.{FunSuite, Matchers}

/** Checks the brute-force query-to-catalog cosine join against exact similarities. */
class QueryNearestNeighboursTest
  extends FunSuite
    with SparkLocalContext
    with Matchers {

  /** Builds a dense MLlib vector from the given components. */
  def denseVector(input: Double*): Vector = {
    Vectors.dense(input.toArray)
  }

  test("nearest neighbours cosine") {
    val queryVectorA = denseVector(1.0, 1.0)
    val queryVectorB = denseVector(-1.0, 1.0)

    val catalogVectorA = denseVector(1.0, 1.0)
    val catalogVectorB = denseVector(-1.0, 1.0)
    val catalogVectorC = denseVector(-1.0, 0.5)
    val catalogVectorD = denseVector(1.0, 0.5)

    val queryRows = Seq(
      IndexedRow(0, queryVectorA),
      IndexedRow(1, queryVectorB)
    )

    val catalogRows = Seq(
      IndexedRow(0, catalogVectorA),
      IndexedRow(1, catalogVectorB),
      IndexedRow(2, catalogVectorC),
      IndexedRow(3, catalogVectorD)
    )

    val queryMatrix = new IndexedRowMatrix(sc.parallelize(queryRows))
    val catalogMatrix = new IndexedRowMatrix(sc.parallelize(catalogRows))

    val queryNearestNeighbour = new QueryNearestNeighbours(Cosine, 0.4, 1.0, 1.0)
    val expected = Seq(
      MatrixEntry(0, 0, Cosine(queryVectorA, catalogVectorA)),
      MatrixEntry(0, 3, Cosine(queryVectorA, catalogVectorD)),
      MatrixEntry(1, 1, Cosine(queryVectorB, catalogVectorB)),
      MatrixEntry(1, 2, Cosine(queryVectorB, catalogVectorC))
    )

    val got = queryNearestNeighbour.join(queryMatrix, catalogMatrix).entries.collect
    got should be(expected)
  }
}
Example 21
Source File: VectorDistanceTest.scala    From cosine-lsh-join-spark   with MIT License 5 votes vote down vote up
package com.soundcloud.lsh

import org.apache.spark.mllib.linalg.{Vector, Vectors}
import org.scalatest.{FunSuite, Matchers}

/** Unit tests for the `Cosine` distance on small hand-picked vectors. */
class VectorDistanceTest
  extends FunSuite
  with SparkLocalContext
  with Matchers {

  /** Builds a dense MLlib vector from the given components. */
  def denseVector(input: Double*): Vector = {
    Vectors.dense(input.toArray)
  }

  test("cosine similarity") {
    val vecA = denseVector(1.0, 0.0)
    val vecB = denseVector(0.0, 1.0)
    val vecC = denseVector(-1.0, 0.0)
    val vecD = denseVector(1.0, 0.0)

    val perpendicular = Cosine(vecA, vecB)
    perpendicular should be(0.0)

    val opposite = Cosine(vecA, vecC)
    opposite should be(-1.0)

    val same = Cosine(vecA, vecD)
    same should be(1.0)
  }

  test("similarities") {
    val vec1 = Vectors.dense(1.0, 2.0, 3.0)
    val vec2 = Vectors.dense(1.0, 2.0, 4.0)
    val vec3 = Vectors.dense(7.0, 7.0, 9.0)

    // Each pair listed first is expected to be at least as similar as the second pair.
    Cosine(vec1, vec2) should be >= Cosine(vec1, vec3)
    Cosine(vec2, vec1) should be >= Cosine(vec2, vec3)
    Cosine(vec3, vec1) should be >= Cosine(vec3, vec2)
  }
}
Example 22
Source File: LabeledPoint.scala    From BigDatalog   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.regression

import scala.beans.BeanInfo

import org.apache.spark.annotation.Since
import org.apache.spark.mllib.linalg.{Vectors, Vector}
import org.apache.spark.mllib.util.NumericParser
import org.apache.spark.SparkException

  /**
   * Parses a string into a [[LabeledPoint]].
   *
   * Two formats are accepted:
   *  - input starting with `(`, which is handed to `NumericParser` and must yield a
   *    (label, features) pair;
   *  - the dense format used before v1.0: `label,f0 f1 f2 ...`.
   *
   * @param s the text to parse
   * @return the parsed labeled point
   * @throws SparkException if parenthesised input does not parse to a (label, features) pair
   */
  def parse(s: String): LabeledPoint = {
    if (s.startsWith("(")) {
      NumericParser.parse(s) match {
        case Seq(label: Double, numeric: Any) =>
          LabeledPoint(label, Vectors.parseNumeric(numeric))
        case other =>
          throw new SparkException(s"Cannot parse $other.")
      }
    } else { // dense format used before v1.0
      val parts = s.split(',')
      val label = java.lang.Double.parseDouble(parts(0))
      val features = Vectors.dense(parts(1).trim().split(' ').map(java.lang.Double.parseDouble))
      LabeledPoint(label, features)
    }
  }
Example 23
Source File: BisectingKMeansExample.scala    From BigDatalog   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.examples.mllib

// scalastyle:off println
// $example on$
import org.apache.spark.mllib.clustering.BisectingKMeans
import org.apache.spark.mllib.linalg.{Vector, Vectors}
// $example off$
import org.apache.spark.{SparkConf, SparkContext}

/**
 * Example application: clusters the points in `data/mllib/kmeans_data.txt` into six
 * clusters with bisecting k-means, then prints the compute cost and cluster centers.
 */
object BisectingKMeansExample {

  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf().setAppName("mllib.BisectingKMeansExample")
    val sc = new SparkContext(sparkConf)

    // $example on$
    // Loads and parses data
    def parse(line: String): Vector = Vectors.dense(line.split(" ").map(_.toDouble))
    val data = sc.textFile("data/mllib/kmeans_data.txt").map(parse).cache()

    // Clustering the data into 6 clusters by BisectingKMeans.
    val bkm = new BisectingKMeans().setK(6)
    val model = bkm.run(data)

    // Show the compute cost and the cluster centers
    println(s"Compute Cost: ${model.computeCost(data)}")
    model.clusterCenters.zipWithIndex.foreach { case (center, idx) =>
      println(s"Cluster Center ${idx}: ${center}")
    }
    // $example off$

    // Release the context once the example has finished.
    sc.stop()
  }
}

// scalastyle:on println 
Example 24
Source File: Normalizer.scala    From BigDatalog   with Apache License 2.0 5 votes vote down vote up

import org.apache.spark.annotation.{Since, Experimental}
import{DoubleParam, ParamValidators}
import org.apache.spark.mllib.feature
import org.apache.spark.mllib.linalg.{Vector, VectorUDT}
import org.apache.spark.sql.types.DataType

  /**
   * Sets the `p` param to `value`.
   *
   * @param value the value to store in `p`
   * @return the result of `set(p, value)`, typed as `this.type`
   */
  def setP(value: Double): this.type = set(p, value)

  /**
   * Builds the per-row function: each input vector is passed through an
   * `mllib.feature.Normalizer` configured with the current value of `p`.
   */
  override protected def createTransformFunc: Vector => Vector = {
    val normalizer = new feature.Normalizer($(p))
    // Explicit lambda so the single-vector overload of `transform` is selected.
    vector => normalizer.transform(vector)
  }

  /** Output column type: a new `VectorUDT` instance. */
  override protected def outputDataType: DataType = new VectorUDT()

/** Companion object; loading is inherited from `DefaultParamsReadable`. */
object Normalizer extends DefaultParamsReadable[Normalizer] {

  /** Loads a `Normalizer` from `path` by delegating to the inherited `load`. */
  override def load(path: String): Normalizer = super.load(path)
}
Example 25
Source File: DCT.scala    From BigDatalog   with Apache License 2.0 5 votes vote down vote up

import edu.emory.mathcs.jtransforms.dct._

import org.apache.spark.annotation.{Since, Experimental}
import org.apache.spark.mllib.linalg.{Vector, VectorUDT, Vectors}
import org.apache.spark.sql.types.DataType

  /** Returns the current value of the `inverse` param. */
  def getInverse: Boolean = $(inverse)

  // `inverse` defaults to false.
  setDefault(inverse -> false)

  /**
   * Builds the per-row function: applies a scaled 1-D DCT to the vector's values,
   * or the inverse transform when `inverse` is set, and returns the result as a
   * dense vector.
   */
  override protected def createTransformFunc: Vector => Vector = { vec =>
    // jTransforms works in place. Copy first so the caller's vector is never mutated
    // (`toArray` may hand back the vector's own backing array).
    val result = vec.toArray.clone()
    val jTransformer = new DoubleDCT_1D(result.length)
    if ($(inverse)) jTransformer.inverse(result, true) else jTransformer.forward(result, true)
    Vectors.dense(result)
  }

  /**
   * Rejects any input column type other than `VectorUDT`.
   *
   * @param inputType the data type of the input column
   * @throws IllegalArgumentException (via `require`) if `inputType` is not a `VectorUDT`
   */
  override protected def validateInputType(inputType: DataType): Unit = {
    require(inputType.isInstanceOf[VectorUDT], s"Input type must be VectorUDT but got $inputType.")
  }

  /** Output column type: a new `VectorUDT` instance. */
  override protected def outputDataType: DataType = new VectorUDT

/** Companion object; loading is inherited from `DefaultParamsReadable`. */
object DCT extends DefaultParamsReadable[DCT] {

  /** Loads a `DCT` from `path` by delegating to the inherited `load`. */
  override def load(path: String): DCT = super.load(path)
}
Example 26
Source File: BinaryClassificationEvaluator.scala    From BigDatalog   with Apache License 2.0 5 votes vote down vote up

import org.apache.spark.annotation.{Experimental, Since}
import{DefaultParamsReadable, DefaultParamsWritable, Identifiable, SchemaUtils}
import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
import org.apache.spark.mllib.linalg.{Vector, VectorUDT}
import org.apache.spark.sql.{DataFrame, Row}
import org.apache.spark.sql.types.DoubleType

  /**
   * Sets the `labelCol` param to `value`.
   *
   * @param value the value to store in `labelCol`
   * @return the result of `set(labelCol, value)`, typed as `this.type`
   */
  def setLabelCol(value: String): this.type = set(labelCol, value)

  // `metricName` defaults to "areaUnderROC".
  setDefault(metricName -> "areaUnderROC")

  /**
   * Computes the metric selected by `metricName` over `dataset`.
   *
   * The raw-prediction column must be a vector column and the label column a double
   * column; element 1 of each raw prediction is used as the score.
   *
   * @param dataset data containing the raw-prediction and label columns
   * @return the area under the ROC curve or under the precision-recall curve
   */
  override def evaluate(dataset: DataFrame): Double = {
    val schema = dataset.schema
    SchemaUtils.checkColumnType(schema, $(rawPredictionCol), new VectorUDT)
    SchemaUtils.checkColumnType(schema, $(labelCol), DoubleType)

    // TODO: When dataset metadata has been implemented, check rawPredictionCol vector length = 2.
    val scoreAndLabels = dataset.select($(rawPredictionCol), $(labelCol))
      .map { case Row(rawPrediction: Vector, label: Double) =>
        (rawPrediction(1), label)
      }
    val metrics = new BinaryClassificationMetrics(scoreAndLabels)
    val metric = $(metricName) match {
      case "areaUnderROC" => metrics.areaUnderROC()
      case "areaUnderPR" => metrics.areaUnderPR()
    }
    // Drop the intermediate data cached by the metrics object before returning.
    metrics.unpersist()
    metric
  }

  /** Both supported metrics are areas under a curve, so larger values are better. */
  override def isLargerBetter: Boolean = $(metricName) match {
    case "areaUnderROC" => true
    case "areaUnderPR" => true
  }

  /** Copies this evaluator by delegating to `defaultCopy` with the extra params. */
  override def copy(extra: ParamMap): BinaryClassificationEvaluator = defaultCopy(extra)

/** Companion object; loading is inherited from `DefaultParamsReadable`. */
object BinaryClassificationEvaluator extends DefaultParamsReadable[BinaryClassificationEvaluator] {

  /** Loads an evaluator from `path` by delegating to the inherited `load`. */
  override def load(path: String): BinaryClassificationEvaluator = super.load(path)
}
Example 27
Source File: LibSVMRelation.scala    From BigDatalog   with Apache License 2.0 5 votes vote down vote up


import org.apache.spark.Logging
import org.apache.spark.annotation.Since
import org.apache.spark.mllib.linalg.{Vector, VectorUDT}
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrameReader, DataFrame, Row, SQLContext}
import org.apache.spark.sql.sources._
import org.apache.spark.sql.types.{DoubleType, StructField, StructType}

/**
 * Data source provider registered under the short name "libsvm".
 *
 * Recognised options:
 *  - `path` (required): location of the data;
 *  - `numFeatures` (default "-1"): parsed as an integer;
 *  - `vectorType` (default "sparse").
 */
class DefaultSource extends RelationProvider with DataSourceRegister {

  override def shortName(): String = "libsvm"

  /**
   * Creates a [[LibSVMRelation]] from the supplied options.
   *
   * @throws IllegalArgumentException if `path` is not specified
   */
  override def createRelation(sqlContext: SQLContext, parameters: Map[String, String])
    : BaseRelation = {
    val path = parameters.getOrElse("path",
      throw new IllegalArgumentException("'path' must be specified"))
    val numFeatures = parameters.getOrElse("numFeatures", "-1").toInt
    val vectorType = parameters.getOrElse("vectorType", "sparse")
    new LibSVMRelation(path, numFeatures, vectorType)(sqlContext)
  }
}
Example 28
Source File: Normalizer.scala    From BigDatalog   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.feature

import org.apache.spark.annotation.Since
import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors}

  /**
   * Divides every stored value of `vector` by its p-norm.
   *
   * @param vector the vector to normalize
   * @return a new vector with normalized values, or `vector` itself when its norm is zero
   * @throws IllegalArgumentException if the vector is neither dense nor sparse
   */
  override def transform(vector: Vector): Vector = {
    val norm = Vectors.norm(vector, p)

    if (norm != 0.0) {
      // For dense vector, we've to allocate new memory for new output vector.
      // However, for sparse vector, the `index` array will not be changed,
      // so we can re-use it to save memory.
      vector match {
        case DenseVector(vs) =>
          val values = vs.clone()
          val size = values.size
          var i = 0
          while (i < size) {
            values(i) /= norm
            i += 1
          }
          Vectors.dense(values)
        case SparseVector(size, ids, vs) =>
          val values = vs.clone()
          val nnz = values.size
          var i = 0
          while (i < nnz) {
            values(i) /= norm
            i += 1
          }
          Vectors.sparse(size, ids, values)
        case v => throw new IllegalArgumentException("Do not support vector type " + v.getClass)
      }
    } else {
      // Since the norm is zero, return the input vector object itself.
      // Note that it's safe since we always assume that the data in RDD
      // should be immutable.
      vector
    }
  }

Example 29
Source File: Word2VecModelWrapper.scala    From BigDatalog   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.api.python

import java.util.{ArrayList => JArrayList, List => JList, Map => JMap}
import scala.collection.JavaConverters._

import org.apache.spark.SparkContext
import org.apache.spark.mllib.feature.Word2VecModel
import org.apache.spark.mllib.linalg.{Vector, Vectors}

  /**
   * Transforms every word in `rdd` into its vector using the wrapped model.
   *
   * @param rdd the words to look up
   * @return one vector per input word
   */
  def transform(rdd: JavaRDD[String]): JavaRDD[Vector] = {
    rdd.rdd.map(model.transform)
  }

  /**
   * Finds synonyms of `word`: the word is first transformed into a vector, then the
   * vector overload is used.
   *
   * @param word the query word
   * @param num  passed through to the vector overload
   * @return the same two-element list as the vector overload
   */
  def findSynonyms(word: String, num: Int): JList[Object] = {
    val vec = transform(word)
    findSynonyms(vec, num)
  }

  /**
   * Finds synonyms of `vector` through the wrapped model.
   *
   * @param vector the query vector
   * @param num    passed to the model's `findSynonyms`
   * @return a two-element Java list: the matched words, then their similarities as a
   *         dense vector, in matching order
   */
  def findSynonyms(vector: Vector, num: Int): JList[Object] = {
    val result = model.findSynonyms(vector, num)
    val similarity = Vectors.dense(result.map(_._2))
    val words = result.map(_._1)
    List(words, similarity).map(_.asInstanceOf[Object]).asJava
  }

  /** Returns the model's word-to-vector map with each vector converted to a Java list. */
  def getVectors: JMap[String, JList[Float]] = {
    model.getVectors.map { case (k, v) => (k, v.toList.asJava) }.asJava
  }

  /** Saves the wrapped model to `path` by delegating to the model's own `save`. */
  def save(sc: SparkContext, path: String): Unit = model.save(sc, path)
Example 30
Source File: PearsonCorrelation.scala    From BigDatalog   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.stat.correlation

import breeze.linalg.{DenseMatrix => BDM}

import org.apache.spark.Logging
import org.apache.spark.mllib.linalg.{Matrices, Matrix, Vector}
import org.apache.spark.mllib.linalg.distributed.RowMatrix
import org.apache.spark.rdd.RDD

  /**
   * Turns a covariance matrix into a Pearson correlation matrix.
   *
   * The work is done on the Breeze matrix obtained from `covarianceMatrix`, which is
   * updated in place. Entries involving a column whose variance is (close to) zero
   * become NaN, and a warning is logged when that happens.
   *
   * @param covarianceMatrix square covariance matrix
   * @return the correlation matrix, with 1.0 on the diagonal
   */
  def computeCorrelationMatrixFromCovariance(covarianceMatrix: Matrix): Matrix = {
    val cov = covarianceMatrix.toBreeze.asInstanceOf[BDM[Double]]
    val n = cov.cols

    // Compute the standard deviation on the diagonals first
    var i = 0
    while (i < n) {
      // TODO remove once covariance numerical issue resolved.
      cov(i, i) = if (closeToZero(cov(i, i))) 0.0 else math.sqrt(cov(i, i))
      i += 1
    }

    // Loop through columns since cov is column major
    var j = 0
    var sigma = 0.0
    var containNaN = false
    while (j < n) {
      sigma = cov(j, j)
      i = 0
      while (i < j) {
        val corr = if (sigma == 0.0 || cov(i, i) == 0.0) {
          // Correlation is undefined when either standard deviation is zero.
          containNaN = true
          Double.NaN
        } else {
          cov(i, j) / (sigma * cov(i, i))
        }
        // The matrix is symmetric: write both triangles.
        cov(i, j) = corr
        cov(j, i) = corr
        i += 1
      }
      j += 1
    }

    // put 1.0 on the diagonals
    i = 0
    while (i < n) {
      cov(i, i) = 1.0
      i += 1
    }

    if (containNaN) {
      logWarning("Pearson correlation matrix contains NaN values.")
    }

    Matrices.fromBreeze(cov)
  }


  /**
   * Tests whether `value` lies within `threshold` of zero (inclusive).
   *
   * @param value     the number to test
   * @param threshold the largest absolute value still treated as zero
   * @return true if `|value| <= threshold`
   */
  private def closeToZero(value: Double, threshold: Double = 1e-12): Boolean = {
    math.abs(value) <= threshold
  }
Example 31
Source File: SpearmanCorrelation.scala    From BigDatalog   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.stat.correlation

import scala.collection.mutable.ArrayBuffer

import org.apache.spark.Logging
import org.apache.spark.SparkContext._
import org.apache.spark.mllib.linalg.{Matrix, Vector, Vectors}
import org.apache.spark.rdd.RDD

  /**
   * Computes a correlation matrix from the rows of `X` by first replacing each value
   * with its rank within its column (average rank for ties).
   *
   * NOTE(review): this excerpt is incomplete. Several expressions and all closing
   * braces are missing, including whatever turns `groupedRanks` into the returned
   * matrix. Restore them from the upstream source before relying on this code.
   *
   * @param X input rows, one vector per observation
   */
  override def computeCorrelationMatrix(X: RDD[Vector]): Matrix = {
    // ((columnIndex, value), rowUid)
    // NOTE(review): the expression that enumerates each vector's entries is missing
    // between `=>` and `{ case (v, j) =>`.
    val colBased = X.zipWithUniqueId().flatMap { case (vec, uid) => { case (v, j) =>
        ((j, v), uid)
    // global sort by (columnIndex, value)
    val sorted = colBased.sortByKey()
    // assign global ranks (using average ranks for tied values)
    val globalRanks = sorted.zipWithIndex().mapPartitions { iter =>
      // State carried across the sorted entries of one partition: the column and value
      // of the current run of ties, the rank at which that run started, and the row
      // ids collected for it so far.
      var preCol = -1
      var preVal = Double.NaN
      var startRank = -1.0
      var cachedUids = ArrayBuffer.empty[Long]
      // Emits (rowUid, (column, averageRank)) for the cached row ids.
      val flush: () => Iterable[(Long, (Int, Double))] = () => {
        // The middle of the rank range occupied by the cached ids.
        val averageRank = startRank + (cachedUids.size - 1) / 2.0
        // NOTE(review): the receiver of this mapping and the rest of `flush` are missing.
        val output = { uid =>
          (uid, (preCol, averageRank))
      iter.flatMap { case (((j, v), uid), rank) =>
        // If we see a new value or cachedUids is too big, we flush ids with their average rank.
        if (j != preCol || v != preVal || cachedUids.size >= 10000000) {
          val output = flush()
          preCol = j
          preVal = v
          startRank = rank
          cachedUids += uid
          // NOTE(review): the value yielded by this branch is missing.
        } else {
          cachedUids += uid
          // NOTE(review): the value yielded by this branch is missing.
      } ++ flush()
    // Replace values in the input matrix by their ranks compared with values in the same column.
    // Note that shifting all ranks in a column by a constant value doesn't affect result.
    val groupedRanks = globalRanks.groupByKey().map { case (uid, iter) =>
      // sort by column index and then convert values to a vector
Example 32
Source File: GLMClassificationModel.scala    From BigDatalog   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.classification.impl

import org.json4s.JsonDSL._
import org.json4s.jackson.JsonMethods._

import org.apache.spark.SparkContext
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.mllib.util.Loader
import org.apache.spark.sql.{Row, SQLContext}

    /**
     * Loads the weights, intercept and optional threshold saved under `path`.
     *
     * @param sc         context used to obtain the `SQLContext`
     * @param path       model directory; the data lives under `Loader.dataPath(path)`
     * @param modelClass model class name, used only in error messages
     * @return the loaded data; the threshold is `None` when the stored value is null
     */
    def loadData(sc: SparkContext, path: String, modelClass: String): Data = {
      val datapath = Loader.dataPath(path)
      val sqlContext = SQLContext.getOrCreate(sc)
      val dataRDD = sqlContext.read.parquet(datapath)
      val dataArray = dataRDD.select("weights", "intercept", "threshold").take(1)
      assert(dataArray.size == 1, s"Unable to load $modelClass data from: $datapath")
      val data = dataArray(0)
      assert(data.size == 3, s"Unable to load $modelClass data from: $datapath")
      val (weights, intercept) = data match {
        case Row(weights: Vector, intercept: Double, _) =>
          (weights, intercept)
      }
      // The threshold column is nullable; map null to None.
      val threshold = if (data.isNullAt(2)) {
        None
      } else {
        Some(data.getDouble(2))
      }
      Data(weights, intercept, threshold)
    }

Example 33
Source File: SimpleTextClassificationPipeline.scala    From BigDatalog   with Apache License 2.0 5 votes vote down vote up
// scalastyle:off println

import scala.beans.BeanInfo

import org.apache.spark.{SparkConf, SparkContext}
import{HashingTF, Tokenizer}
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.sql.{Row, SQLContext}

/** A training document: an id, its text and a numeric label. */
case class LabeledDocument(id: Long, text: String, label: Double)

/** An unlabeled document: an id and its text. */
case class Document(id: Long, text: String)

/**
 * Example application: fits a tokenizer -> hashing TF -> logistic regression pipeline
 * on four labeled documents and prints predictions for four unlabeled ones.
 */
object SimpleTextClassificationPipeline {

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("SimpleTextClassificationPipeline")
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)
    import sqlContext.implicits._

    // Prepare training documents, which are labeled.
    val training = sc.parallelize(Seq(
      LabeledDocument(0L, "a b c d e spark", 1.0),
      LabeledDocument(1L, "b d", 0.0),
      LabeledDocument(2L, "spark f g h", 1.0),
      LabeledDocument(3L, "hadoop mapreduce", 0.0)))

    // Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr.
    val tokenizer = new Tokenizer()
      .setInputCol("text")
      .setOutputCol("words")
    val hashingTF = new HashingTF()
      .setNumFeatures(1000)
      .setInputCol(tokenizer.getOutputCol)
      .setOutputCol("features")
    val lr = new LogisticRegression()
      .setMaxIter(10)
      .setRegParam(0.001)
    val pipeline = new Pipeline()
      .setStages(Array(tokenizer, hashingTF, lr))

    // Fit the pipeline to training documents.
    val model = pipeline.fit(training.toDF())

    // Prepare test documents, which are unlabeled.
    val test = sc.parallelize(Seq(
      Document(4L, "spark i j k"),
      Document(5L, "l m n"),
      Document(6L, "spark hadoop spark"),
      Document(7L, "apache hadoop")))

    // Make predictions on test documents.
    model.transform(test.toDF())
      .select("id", "text", "probability", "prediction")
      .collect()
      .foreach { case Row(id: Long, text: String, prob: Vector, prediction: Double) =>
        println(s"($id, $text) --> prob=$prob, prediction=$prediction")
      }

    // Release the context once the example has finished.
    sc.stop()
  }
}

// scalastyle:on println 
Example 34
Source File: GLMRegressionModel.scala    From BigDatalog   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.regression.impl

import org.json4s.JsonDSL._
import org.json4s.jackson.JsonMethods._

import org.apache.spark.SparkContext
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.mllib.util.Loader
import org.apache.spark.sql.{DataFrame, Row, SQLContext}

    /**
     * Loads the weights and intercept saved under `path`.
     *
     * @param sc          context used to obtain the `SQLContext`
     * @param path        model directory; the data lives under `Loader.dataPath(path)`
     * @param modelClass  model class name, used only in error messages
     * @param numFeatures number of features the loaded weights must have
     * @return the loaded weights and intercept
     */
    def loadData(sc: SparkContext, path: String, modelClass: String, numFeatures: Int): Data = {
      val datapath = Loader.dataPath(path)
      val sqlContext = SQLContext.getOrCreate(sc)
      val dataRDD = sqlContext.read.parquet(datapath)
      val dataArray = dataRDD.select("weights", "intercept").take(1)
      assert(dataArray.size == 1, s"Unable to load $modelClass data from: $datapath")
      val data = dataArray(0)
      assert(data.size == 2, s"Unable to load $modelClass data from: $datapath")
      data match {
        case Row(weights: Vector, intercept: Double) =>
          assert(weights.size == numFeatures, s"Expected $numFeatures features, but" +
            s" found ${weights.size} features when loading $modelClass weights from $datapath")
          Data(weights, intercept)
      }
    }

Example 35
Source File: OneHotEncoderSuite.scala    From BigDatalog   with Apache License 2.0 5 votes vote down vote up

import org.apache.spark.SparkFunSuite
import{AttributeGroup, BinaryAttribute, NominalAttribute}
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions.col

/** Tests for `OneHotEncoder`: encoding with/without the last category, and output attributes. */
class OneHotEncoderSuite
  extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest {

  /** Six (id, label) rows with the string label indexed into the "labelIndex" column. */
  def stringIndexed(): DataFrame = {
    val data = sc.parallelize(Seq((0, "a"), (1, "b"), (2, "c"), (3, "a"), (4, "a"), (5, "c")), 2)
    val df = sqlContext.createDataFrame(data).toDF("id", "label")
    val indexer = new StringIndexer()
      .setInputCol("label")
      .setOutputCol("labelIndex")
      .fit(df)
    indexer.transform(df)
  }

  test("params") {
    ParamsSuite.checkParams(new OneHotEncoder)
  }

  test("OneHotEncoder dropLast = false") {
    val transformed = stringIndexed()
    val encoder = new OneHotEncoder()
      .setInputCol("labelIndex")
      .setOutputCol("labelVec")
      .setDropLast(false)
    val encoded = encoder.transform(transformed)

    val output = encoded.select("id", "labelVec").map { r =>
      val vec = r.getAs[Vector](1)
      (r.getInt(0), vec(0), vec(1), vec(2))
    }.collect().toSet
    // a -> 0, b -> 2, c -> 1
    val expected = Set((0, 1.0, 0.0, 0.0), (1, 0.0, 0.0, 1.0), (2, 0.0, 1.0, 0.0),
      (3, 1.0, 0.0, 0.0), (4, 1.0, 0.0, 0.0), (5, 0.0, 1.0, 0.0))
    assert(output === expected)
  }

  test("OneHotEncoder dropLast = true") {
    val transformed = stringIndexed()
    // dropLast is left at its default, so the last category gets no slot.
    val encoder = new OneHotEncoder()
      .setInputCol("labelIndex")
      .setOutputCol("labelVec")
    val encoded = encoder.transform(transformed)

    val output = encoded.select("id", "labelVec").map { r =>
      val vec = r.getAs[Vector](1)
      (r.getInt(0), vec(0), vec(1))
    }.collect().toSet
    // a -> 0, b -> 2, c -> 1
    val expected = Set((0, 1.0, 0.0), (1, 0.0, 0.0), (2, 0.0, 1.0),
      (3, 1.0, 0.0), (4, 1.0, 0.0), (5, 0.0, 1.0))
    assert(output === expected)
  }

  test("input column with ML attribute") {
    val attr = NominalAttribute.defaultAttr.withValues("small", "medium", "large")
    val df = sqlContext.createDataFrame(Seq(0.0, 1.0, 2.0, 1.0).map(Tuple1.apply)).toDF("size")
      .select(col("size").as("size", attr.toMetadata()))
    val encoder = new OneHotEncoder()
      .setInputCol("size")
      .setOutputCol("encoded")
    val output = encoder.transform(df)
    val group = AttributeGroup.fromStructField(output.schema("encoded"))
    assert(group.size === 2)
    assert(group.getAttr(0) === BinaryAttribute.defaultAttr.withName("small").withIndex(0))
    assert(group.getAttr(1) === BinaryAttribute.defaultAttr.withName("medium").withIndex(1))
  }

  test("input column without ML attribute") {
    val df = sqlContext.createDataFrame(Seq(0.0, 1.0, 2.0, 1.0).map(Tuple1.apply)).toDF("index")
    val encoder = new OneHotEncoder()
      .setInputCol("index")
      .setOutputCol("encoded")
    val output = encoder.transform(df)
    val group = AttributeGroup.fromStructField(output.schema("encoded"))
    assert(group.size === 2)
    assert(group.getAttr(0) === BinaryAttribute.defaultAttr.withName("0").withIndex(0))
    assert(group.getAttr(1) === BinaryAttribute.defaultAttr.withName("1").withIndex(1))
  }

  test("read/write") {
    val t = new OneHotEncoder()
      .setInputCol("myInputCol")
      .setOutputCol("myOutputCol")
      .setDropLast(false)
    testDefaultReadWrite(t)
  }
}
Example 36
Source File: ChiSqSelectorSuite.scala    From BigDatalog   with Apache License 2.0 5 votes vote down vote up

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.feature
import org.apache.spark.mllib.linalg.{Vector, Vectors}
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.mllib.util.TestingUtils._
import org.apache.spark.sql.{Row, SQLContext}

/**
 * Tests for the ML `ChiSqSelector` and its model.
 *
 * NOTE(review): this excerpt is incomplete. Closing braces/parentheses are missing
 * throughout, and so are the contents of `preFilteredData`, the argument of
 * `sc.parallelize`, most of the selector configuration and the fit/transform call
 * that should precede the `"filtered", "preFilteredData"` selection. Restore them
 * from the upstream source.
 */
class ChiSqSelectorSuite extends SparkFunSuite with MLlibTestSparkContext
  with DefaultReadWriteTest {

  test("Test Chi-Square selector") {
    val sqlContext = SQLContext.getOrCreate(sc)
    import sqlContext.implicits._

    // Four labeled points with three features each, in mixed sparse and dense form.
    val data = Seq(
      LabeledPoint(0.0, Vectors.sparse(3, Array((0, 8.0), (1, 7.0)))),
      LabeledPoint(1.0, Vectors.sparse(3, Array((1, 9.0), (2, 6.0)))),
      LabeledPoint(1.0, Vectors.dense(Array(0.0, 9.0, 8.0))),
      LabeledPoint(2.0, Vectors.dense(Array(8.0, 9.0, 5.0)))

    // NOTE(review): the expected filtered vectors are missing here.
    val preFilteredData = Seq(

    // NOTE(review): the collection passed to `parallelize` is missing; the mapping
    // below expects pairs of (labeled point, expected vector).
    val df = sc.parallelize(
      .map(x => (x._1.label, x._1.features, x._2))
      .toDF("label", "data", "preFilteredData")

    // NOTE(review): the line below fuses two statements; part of the selector setup
    // and the start of the result selection are missing.
    val model = new ChiSqSelector()
      .setOutputCol("filtered")"filtered", "preFilteredData").collect().foreach {
      case Row(vec1: Vector, vec2: Vector) =>
        // Selected features must match the expected vector within an absolute tolerance of 0.1.
        assert(vec1 ~== vec2 absTol 1e-1)

  test("ChiSqSelector read/write") {
    // NOTE(review): the rest of this test is missing.
    val t = new ChiSqSelector()

  test("ChiSqSelectorModel read/write") {
    val oldModel = new feature.ChiSqSelectorModel(Array(1, 3))
    val instance = new ChiSqSelectorModel("myChiSqSelectorModel", oldModel)
    val newInstance = testDefaultReadWrite(instance)
    assert(newInstance.selectedFeatures === instance.selectedFeatures)
Example 37
Source File: DCTSuite.scala    From BigDatalog   with Apache License 2.0 5 votes vote down vote up

import scala.beans.BeanInfo

import edu.emory.mathcs.jtransforms.dct.DoubleDCT_1D

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.linalg.{Vector, Vectors}
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.sql.{DataFrame, Row}

/** One test row: an input vector and the vector the transform is expected to produce. */
case class DCTTestData(vec: Vector, wantedVec: Vector)

/** Tests that the ML `DCT` transformer agrees with jTransforms in both directions. */
class DCTSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest {

  test("forward transform of discrete cosine matches jTransforms result") {
    val data = Vectors.dense((0 until 128).map(_ => 2D * math.random - 1D).toArray)
    val inverse = false

    testDCT(data, inverse)
  }

  test("inverse transform of discrete cosine matches jTransforms result") {
    val data = Vectors.dense((0 until 128).map(_ => 2D * math.random - 1D).toArray)
    val inverse = true

    testDCT(data, inverse)
  }

  test("read/write") {
    val t = new DCT()
      .setInputCol("myInputCol")
      .setOutputCol("myOutputCol")
      .setInverse(true)
    testDefaultReadWrite(t)
  }

  /**
   * Runs `DCT` on `data` and compares the result with jTransforms applied directly.
   *
   * @param data    the vector to transform
   * @param inverse whether to apply the inverse transform
   */
  private def testDCT(data: Vector, inverse: Boolean): Unit = {
    // jTransforms works in place, so compute the reference on a copy.
    val expectedResultBuffer = data.toArray.clone()
    if (inverse) {
      (new DoubleDCT_1D(data.size)).inverse(expectedResultBuffer, true)
    } else {
      (new DoubleDCT_1D(data.size)).forward(expectedResultBuffer, true)
    }
    val expectedResult = Vectors.dense(expectedResultBuffer)

    val dataset = sqlContext.createDataFrame(Seq(
      DCTTestData(data, expectedResult)
    ))

    val transformer = new DCT()
      .setInputCol("vec")
      .setOutputCol("resultVec")
      .setInverse(inverse)

    transformer.transform(dataset)
      .select("resultVec", "wantedVec")
      .collect()
      .foreach { case Row(resultVec: Vector, wantedVec: Vector) =>
        assert(Vectors.sqdist(resultVec, wantedVec) < 1e-6)
      }
  }
}
Example 38
Source File: MinMaxScalerSuite.scala    From BigDatalog   with Apache License 2.0 5 votes vote down vote up

import org.apache.spark.SparkFunSuite
import{DefaultReadWriteTest, MLTestingUtils}
import org.apache.spark.mllib.linalg.{Vector, Vectors}
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.sql.{Row, SQLContext}

/** Tests for `MinMaxScaler`: basic fitting, parameter validation and persistence. */
class MinMaxScalerSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest {

  test("MinMaxScaler fit basic case") {
    val sqlContext = new SQLContext(sc)

    val data = Array(
      Vectors.dense(1, 0, Long.MinValue),
      Vectors.dense(2, 0, 0),
      Vectors.sparse(3, Array(0, 2), Array(3, Long.MaxValue)),
      Vectors.sparse(3, Array(0), Array(1.5)))

    // The same rows rescaled to the range [-5, 5].
    val expected: Array[Vector] = Array(
      Vectors.dense(-5, 0, -5),
      Vectors.dense(0, 0, 0),
      Vectors.sparse(3, Array(0, 2), Array(5, 5)),
      Vectors.sparse(3, Array(0), Array(-2.5)))

    val df = sqlContext.createDataFrame(data.zip(expected)).toDF("features", "expected")
    val scaler = new MinMaxScaler()
      .setInputCol("features")
      .setOutputCol("scaled")
      .setMin(-5)
      .setMax(5)

    val model = scaler.fit(df)
    model.transform(df).select("expected", "scaled").collect()
      .foreach { case Row(vector1: Vector, vector2: Vector) =>
        assert(vector1.equals(vector2), "Transformed vector is different with expected.")
      }

    // copied model must have the same parent.
    MLTestingUtils.checkCopy(model)
  }

  test("MinMaxScaler arguments max must be larger than min") {
    withClue("arguments max must be larger than min") {
      intercept[IllegalArgumentException] {
        val scaler = new MinMaxScaler().setMin(10).setMax(0)
        scaler.validateParams()
      }
      intercept[IllegalArgumentException] {
        val scaler = new MinMaxScaler().setMin(0).setMax(0)
        scaler.validateParams()
      }
    }
  }

  test("MinMaxScaler read/write") {
    val t = new MinMaxScaler()
      .setInputCol("myInputCol")
      .setOutputCol("myOutputCol")
      .setMax(1.0)
      .setMin(-1.0)
    testDefaultReadWrite(t)
  }

  test("MinMaxScalerModel read/write") {
    val instance = new MinMaxScalerModel(
        "myMinMaxScalerModel", Vectors.dense(-1.0, 0.0), Vectors.dense(1.0, 10.0))
    val newInstance = testDefaultReadWrite(instance)
    assert(newInstance.originalMin === instance.originalMin)
    assert(newInstance.originalMax === instance.originalMax)
  }
}
Example 39
Source File: PolynomialExpansionSuite.scala    From BigDatalog   with Apache License 2.0 5 votes vote down vote up

import org.scalatest.exceptions.TestFailedException

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors}
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.mllib.util.TestingUtils._
import org.apache.spark.sql.Row

/** Tests for `PolynomialExpansion` with the default degree and with degree 3. */
class PolynomialExpansionSuite
  extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest {

  test("params") {
    ParamsSuite.checkParams(new PolynomialExpansion)
  }

  test("Polynomial expansion with default parameter") {
    val data = Array(
      Vectors.sparse(3, Seq((0, -2.0), (1, 2.3))),
      Vectors.dense(-2.0, 2.3),
      Vectors.dense(0.0, 0.0, 0.0),
      Vectors.dense(0.6, -1.1, -3.0),
      Vectors.sparse(3, Seq())
    )

    val twoDegreeExpansion: Array[Vector] = Array(
      Vectors.sparse(9, Array(0, 1, 2, 3, 4), Array(-2.0, 4.0, 2.3, -4.6, 5.29)),
      Vectors.dense(-2.0, 4.0, 2.3, -4.6, 5.29),
      Vectors.dense(new Array[Double](9)),
      Vectors.dense(0.6, 0.36, -1.1, -0.66, 1.21, -3.0, -1.8, 3.3, 9.0),
      Vectors.sparse(9, Array.empty, Array.empty))

    val df = sqlContext.createDataFrame(data.zip(twoDegreeExpansion)).toDF("features", "expected")

    val polynomialExpansion = new PolynomialExpansion()
      .setInputCol("features")
      .setOutputCol("polyFeatures")

    // Dense input must expand to dense output and sparse to sparse; anything else fails.
    polynomialExpansion.transform(df).select("polyFeatures", "expected").collect().foreach {
      case Row(expanded: DenseVector, expected: DenseVector) =>
        assert(expanded ~== expected absTol 1e-1)
      case Row(expanded: SparseVector, expected: SparseVector) =>
        assert(expanded ~== expected absTol 1e-1)
      case _ =>
        throw new TestFailedException("Unmatched data types after polynomial expansion", 0)
    }
  }

  test("Polynomial expansion with setter") {
    val data = Array(
      Vectors.sparse(3, Seq((0, -2.0), (1, 2.3))),
      Vectors.dense(-2.0, 2.3),
      Vectors.dense(0.0, 0.0, 0.0),
      Vectors.dense(0.6, -1.1, -3.0),
      Vectors.sparse(3, Seq())
    )

    val threeDegreeExpansion: Array[Vector] = Array(
      Vectors.sparse(19, Array(0, 1, 2, 3, 4, 5, 6, 7, 8),
        Array(-2.0, 4.0, -8.0, 2.3, -4.6, 9.2, 5.29, -10.58, 12.17)),
      Vectors.dense(-2.0, 4.0, -8.0, 2.3, -4.6, 9.2, 5.29, -10.58, 12.17),
      Vectors.dense(new Array[Double](19)),
      Vectors.dense(0.6, 0.36, 0.216, -1.1, -0.66, -0.396, 1.21, 0.726, -1.331, -3.0, -1.8,
        -1.08, 3.3, 1.98, -3.63, 9.0, 5.4, -9.9, -27.0),
      Vectors.sparse(19, Array.empty, Array.empty))

    val df = sqlContext.createDataFrame(data.zip(threeDegreeExpansion)).toDF("features", "expected")

    val polynomialExpansion = new PolynomialExpansion()
      .setInputCol("features")
      .setOutputCol("polyFeatures")
      .setDegree(3)

    polynomialExpansion.transform(df).select("polyFeatures", "expected").collect().foreach {
      case Row(expanded: DenseVector, expected: DenseVector) =>
        assert(expanded ~== expected absTol 1e-1)
      case Row(expanded: SparseVector, expected: SparseVector) =>
        assert(expanded ~== expected absTol 1e-1)
      case _ =>
        throw new TestFailedException("Unmatched data types after polynomial expansion", 0)
    }
  }

  test("read/write") {
    val t = new PolynomialExpansion()
      .setInputCol("myInputCol")
      .setOutputCol("myOutputCol")
      .setDegree(3)
    testDefaultReadWrite(t)
  }
}
Example 40
Source File: IDFSuite.scala    From BigDatalog   with Apache License 2.0 5 votes vote down vote up

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.feature.{IDFModel => OldIDFModel}
import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors}
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.mllib.util.TestingUtils._
import org.apache.spark.sql.Row

/**
 * Tests for the IDF estimator and IDFModel: parameter checks, transformation results and
 * read/write round trips.
 *
 * NOTE(review): this listing is incomplete -- closing delimiters and several method-call
 * chains (e.g. the data passed to createDataFrame, setters and the fit call on IDF) are
 * not visible here; confirm against the original suite before relying on it.
 */
class IDFSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest {

  // Builds the expected output by scaling each input vector with the IDF weights in `model`:
  // dense values are multiplied pairwise, sparse values are multiplied by `model(id)`.
  // NOTE(review): the collection operations that drive these partial functions are missing
  // from this listing.
  def scaleDataWithIDF(dataSet: Array[Vector], model: Vector): Array[Vector] = { {
      case data: DenseVector =>
        val res = { case (x, y) => x * y }
      case data: SparseVector =>
        val res = { case (id, value) =>
          (id, value * model(id))
        Vectors.sparse(data.size, res)

  // Generic parameter checks for IDF; an IDFModel is also constructed here.
  test("params") {
    ParamsSuite.checkParams(new IDF)
    val model = new IDFModel("idf", new OldIDFModel(Vectors.dense(1.0)))

  test("compute IDF with default parameter") {
    val numOfFeatures = 4
    val data = Array(
      Vectors.sparse(numOfFeatures, Array(1, 3), Array(1.0, 2.0)),
      Vectors.dense(0.0, 1.0, 2.0, 3.0),
      Vectors.sparse(numOfFeatures, Array(1), Array(1.0))
    val numOfData = data.size
    // Array(0, 3, 1, 2) is the number of vectors in `data` with a non-zero entry for each
    // of the four features; the weight is log((numDocs + 1) / (docFreq + 1)).
    val idf = Vectors.dense(Array(0, 3, 1, 2).map { x =>
      math.log((numOfData + 1.0) / (x + 1.0))
    val expected = scaleDataWithIDF(data, idf)

    val df = sqlContext.createDataFrame("features", "expected")

    val idfModel = new IDF()

    // Every transformed vector must match its expected counterpart within 1e-5.
    idfModel.transform(df).select("idfValue", "expected").collect().foreach {
      case Row(x: Vector, y: Vector) =>
        assert(x ~== y absTol 1e-5, "Transformed vector is different with expected vector.")

  test("compute IDF with setter") {
    val numOfFeatures = 4
    val data = Array(
      Vectors.sparse(numOfFeatures, Array(1, 3), Array(1.0, 2.0)),
      Vectors.dense(0.0, 1.0, 2.0, 3.0),
      Vectors.sparse(numOfFeatures, Array(1), Array(1.0))
    val numOfData = data.size
    // Same per-feature counts as above, but a feature with a count of 0 gets weight 0
    // instead of the log formula.
    val idf = Vectors.dense(Array(0, 3, 1, 2).map { x =>
      if (x > 0) math.log((numOfData + 1.0) / (x + 1.0)) else 0
    val expected = scaleDataWithIDF(data, idf)

    val df = sqlContext.createDataFrame("features", "expected")

    // NOTE(review): the setter call(s) this test is named after are not visible here.
    val idfModel = new IDF()

    idfModel.transform(df).select("idfValue", "expected").collect().foreach {
      case Row(x: Vector, y: Vector) =>
        assert(x ~== y absTol 1e-5, "Transformed vector is different with expected vector.")

  // NOTE(review): only the construction of the estimator is visible for this test.
  test("IDF read/write") {
    val t = new IDF()

  // Round-trips an IDFModel through testDefaultReadWrite and compares the idf vectors.
  test("IDFModel read/write") {
    val instance = new IDFModel("myIDFModel", new OldIDFModel(Vectors.dense(1.0, 2.0)))
    val newInstance = testDefaultReadWrite(instance)
    assert(newInstance.idf === instance.idf)
Example 41
Source File: HashingTFSuite.scala    From BigDatalog   with Apache License 2.0 5 votes vote down vote up

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.linalg.{Vector, Vectors}
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.mllib.util.TestingUtils._
import org.apache.spark.util.Utils

/**
 * Tests for the HashingTF transformer: parameter checks, term-frequency output and
 * read/write.
 *
 * NOTE(review): this listing is incomplete -- closing braces and some call chains
 * (setters on HashingTF, the receiver of the `select`) are not visible here.
 */
class HashingTFSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest {

  test("params") {
    ParamsSuite.checkParams(new HashingTF)

  test("hashingTF") {
    // Single document with the terms a (x2), b (x2), c (x1) and d (x1).
    val df = sqlContext.createDataFrame(Seq(
      (0, "a a b b c d".split(" ").toSeq)
    )).toDF("id", "words")
    val n = 100
    // NOTE(review): no setter calls (input/output columns, number of features) are visible;
    // the assertions below expect a "features" column with `n` attributes.
    val hashingTF = new HashingTF()
    val output = hashingTF.transform(df)
    val attrGroup = AttributeGroup.fromStructField(output.schema("features"))
    require(attrGroup.numAttributes === Some(n))
    // NOTE(review): the receiver of this call chain was lost in the listing.
    val features ="features").first().getAs[Vector](0)
    // Assume perfect hash on "a", "b", "c", and "d".
    // Expected bucket of a term: its hash code reduced to a non-negative index below n.
    def idx(any: Any): Int = Utils.nonNegativeMod(any.##, n)
    val expected = Vectors.sparse(n,
      Seq((idx("a"), 2.0), (idx("b"), 2.0), (idx("c"), 1.0), (idx("d"), 1.0)))
    assert(features ~== expected absTol 1e-14)

  // NOTE(review): only the construction of the transformer is visible for this test.
  test("read/write") {
    val t = new HashingTF()
Example 42
Source File: RandomForestSuite.scala    From BigDatalog   with Apache License 2.0 5 votes vote down vote up

import org.apache.spark.SparkFunSuite
import{ContinuousSplit, DecisionTreeModel, LeafNode, Node}
import org.apache.spark.mllib.linalg.{Vector, Vectors}
import org.apache.spark.mllib.tree.impurity.GiniCalculator
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.mllib.util.TestingUtils._
import org.apache.spark.util.collection.OpenHashMap

    // Body of a feature-importance test built on a small hand-made tree.
    // NOTE(review): the enclosing class declaration and the `test(...)` header for these
    // statements are not visible in this listing, nor are the closing braces.

    // Two leaves under `parent`, which splits on feature 0.
    val leftImp = new GiniCalculator(Array(3.0, 2.0, 1.0))
    val left = new LeafNode(0.0, leftImp.calculate(), leftImp)

    val rightImp = new GiniCalculator(Array(1.0, 2.0, 5.0))
    val right = new LeafNode(2.0, rightImp.calculate(), rightImp)

    val parent = TreeTests.buildParentNode(left, right, new ContinuousSplit(0, 0.5))
    val parentImp = parent.impurityStats

    // A third leaf joined with `parent` under `grandParent`, which splits on feature 1.
    val left2Imp = new GiniCalculator(Array(1.0, 6.0, 1.0))
    val left2 = new LeafNode(0.0, left2Imp.calculate(), left2Imp)

    val grandParent = TreeTests.buildParentNode(left2, parent, new ContinuousSplit(1, 1.0))
    val grandImp = grandParent.impurityStats

    // Test feature importance computed at different subtrees.
    // Accumulates importances for the subtree rooted at `node` into a fresh map and compares
    // them with `expected` within a relative tolerance of 0.01.
    def testNode(node: Node, expected: Map[Int, Double]): Unit = {
      val map = new OpenHashMap[Int, Double]()
      RandomForest.computeFeatureImportance(node, map)
      assert(mapToVec(map.toMap) ~== mapToVec(expected) relTol 0.01)

    // Leaf node
    testNode(left, Map.empty[Int, Double])

    // Internal node with 2 leaf children
    // Expected importance: count-weighted impurity of the node minus that of its children.
    val feature0importance = parentImp.calculate() * parentImp.count -
      (leftImp.calculate() * leftImp.count + rightImp.calculate() * rightImp.count)
    testNode(parent, Map(0 -> feature0importance))

    // Full tree
    val feature1importance = grandImp.calculate() * grandImp.count -
      (left2Imp.calculate() * left2Imp.count + parentImp.calculate() * parentImp.count)
    testNode(grandParent, Map(0 -> feature0importance, 1 -> feature1importance))

    // Forest consisting of (full tree) + (internal node with 2 leafs)
    val trees = Array(parent, grandParent).map { root =>
      new DecisionTreeClassificationModel(root, numFeatures = 2, numClasses = 3)
    val importances: Vector = RandomForest.featureImportances(trees, 2)
    // Expected forest importances: each tree's importances divided by that tree's total,
    // then averaged over the two trees (the single-split tree contributes 1.0 to feature 0).
    val tree2norm = feature0importance + feature1importance
    val expected = Vectors.dense((1.0 + feature0importance / tree2norm) / 2.0,
      (feature1importance / tree2norm) / 2.0)
    assert(importances ~== expected relTol 0.01)

  // Checks that map values are rescaled to sum to one (1.0 and 2.0 -> 1/3 and 2/3).
  // NOTE(review): no call that normalizes `map` is visible between populating it and the
  // assertion; as listed, the raw values would be compared with the normalized expectation.
  // Presumably a normalization call was dropped from this listing -- confirm.
  test("normalizeMapValues") {
    val map = new OpenHashMap[Int, Double]()
    map(0) = 1.0
    map(2) = 2.0
    val expected = Map(0 -> 1.0 / 3.0, 2 -> 2.0 / 3.0)
    assert(mapToVec(map.toMap) ~== mapToVec(expected) relTol 0.01)


/** Helpers shared by the RandomForest tests. */
private object RandomForestSuite {

  /**
   * Converts an index -> value map into a sparse vector so that two maps can be compared
   * with the approximate vector comparison operators.
   *
   * @param map entries keyed by index
   * @return a sparse vector whose size is the largest key plus one (size 1 for an empty
   *         map, because 0 is always included among the candidate keys)
   */
  def mapToVec(map: Map[Int, Double]): Vector = {
    // Appending 0 keeps `max` defined when the map is empty.
    val size = (map.keys.toSeq :+ 0).max + 1
    // Sort the entries by index before handing them to the sparse vector factory.
    val (indices, values) = map.toSeq.sortBy(_._1).unzip
    Vectors.sparse(size, indices.toArray, values.toArray)
  }
}
Example 43
Source File: SVDExample.scala    From drizzle-spark   with Apache License 2.0 5 votes vote down vote up
// scalastyle:off println
package org.apache.spark.examples.mllib

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
// $example on$
import org.apache.spark.mllib.linalg.Matrix
import org.apache.spark.mllib.linalg.SingularValueDecomposition
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.linalg.distributed.RowMatrix
// $example off$

/**
 * Example application: computes a singular value decomposition of a small row matrix
 * and prints the U factor, the singular values and the V factor.
 */
object SVDExample {

  /**
   * Entry point.
   *
   * @param args command-line arguments (not used)
   */
  def main(args: Array[String]): Unit = {

    val conf = new SparkConf().setAppName("SVDExample")
    val sc = new SparkContext(conf)

    // $example on$
    val data = Array(
      Vectors.sparse(5, Seq((1, 1.0), (3, 7.0))),
      Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0),
      Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0))

    val dataRDD = sc.parallelize(data, 2)

    val mat: RowMatrix = new RowMatrix(dataRDD)

    // Compute the top 5 singular values and corresponding singular vectors.
    val svd: SingularValueDecomposition[RowMatrix, Matrix] = mat.computeSVD(5, computeU = true)
    val U: RowMatrix = svd.U  // The U factor is a RowMatrix.
    val s: Vector = svd.s  // The singular values are stored in a local dense vector.
    val V: Matrix = svd.V  // The V factor is a local dense matrix.
    // $example off$
    val collect = U.rows.collect()
    println("U factor is:")
    collect.foreach { vector => println(vector) }
    println(s"Singular values are: $s")
    println(s"V factor is:\n$V")

    // Release the resources held by the SparkContext once the example has finished.
    sc.stop()
  }
}
// scalastyle:on println 
Example 44
Source File: BisectingKMeansExample.scala    From drizzle-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.examples.mllib

// scalastyle:off println
import org.apache.spark.{SparkConf, SparkContext}
// $example on$
import org.apache.spark.mllib.clustering.BisectingKMeans
import org.apache.spark.mllib.linalg.{Vector, Vectors}
// $example off$

/**
 * Example application: clusters the points in data/mllib/kmeans_data.txt with
 * BisectingKMeans and prints the compute cost and the cluster centers.
 *
 * NOTE(review): this listing is incomplete -- the right-hand side of `val model` and the
 * closing braces are not visible here.
 */
object BisectingKMeansExample {

  def main(args: Array[String]) {
    val sparkConf = new SparkConf().setAppName("mllib.BisectingKMeansExample")
    val sc = new SparkContext(sparkConf)

    // $example on$
    // Loads and parses data
    // Each input line is a space-separated list of doubles.
    def parse(line: String): Vector = Vectors.dense(line.split(" ").map(_.toDouble))
    val data = sc.textFile("data/mllib/kmeans_data.txt").map(parse).cache()

    // Clustering the data into 6 clusters by BisectingKMeans.
    val bkm = new BisectingKMeans().setK(6)
    // NOTE(review): the expression that trains the model (presumably on `data` via `bkm`)
    // was lost in this listing.
    val model =

    // Show the compute cost and the cluster centers
    println(s"Compute Cost: ${model.computeCost(data)}")
    model.clusterCenters.zipWithIndex.foreach { case (center, idx) =>
      println(s"Cluster Center ${idx}: ${center}")
    // $example off$

// scalastyle:on println 
Example 45
Source File: HashingTF.scala    From drizzle-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.feature

import java.lang.{Iterable => JavaIterable}

import scala.collection.JavaConverters._
import scala.collection.mutable

import org.apache.spark.SparkException
import org.apache.spark.annotation.Since
import org.apache.spark.mllib.linalg.{Vector, Vectors}
import org.apache.spark.rdd.RDD
import org.apache.spark.unsafe.hash.Murmur3_x86_32._
import org.apache.spark.unsafe.types.UTF8String
import org.apache.spark.util.Utils

  /**
   * Hashes a term with the Murmur3 functions imported from `Murmur3_x86_32`, using `seed`.
   *
   * Dispatches on the runtime type of the term: integral and boolean values are hashed as
   * ints or longs, floating-point values by their bit patterns, and strings by their
   * UTF-8 bytes.
   *
   * @param term the value to hash; `null` hashes to `seed`
   * @return the hash code of the term
   * @throws SparkException if the term has a type that is not handled below
   */
  private[spark] def murmur3Hash(term: Any): Int = {
    term match {
      case null => seed
      case b: Boolean => hashInt(if (b) 1 else 0, seed)
      case b: Byte => hashInt(b, seed)
      case s: Short => hashInt(s, seed)
      case i: Int => hashInt(i, seed)
      case l: Long => hashLong(l, seed)
      case f: Float => hashInt(java.lang.Float.floatToIntBits(f), seed)
      case d: Double => hashLong(java.lang.Double.doubleToLongBits(d), seed)
      case s: String =>
        // Hash the UTF-8 encoded bytes of the string rather than its UTF-16 chars.
        val utf8 = UTF8String.fromString(s)
        hashUnsafeBytes(utf8.getBaseObject, utf8.getBaseOffset, utf8.numBytes(), seed)
      case _ => throw new SparkException("HashingTF with murmur3 algorithm does not " +
        s"support type ${term.getClass.getCanonicalName} of input data.")
    }
  }
Example 46
Source File: Normalizer.scala    From drizzle-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.feature

import org.apache.spark.annotation.Since
import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors}

  /**
   * Scales a vector to unit p-norm.
   *
   * @param vector the vector to normalize
   * @return a new vector of the same kind (dense or sparse) divided by its p-norm, or the
   *         input vector itself when its norm is zero
   * @throws IllegalArgumentException if the vector is neither dense nor sparse
   */
  override def transform(vector: Vector): Vector = {
    val norm = Vectors.norm(vector, p)

    if (norm != 0.0) {
      // For dense vector, we've to allocate new memory for new output vector.
      // However, for sparse vector, the `index` array will not be changed,
      // so we can re-use it to save memory.
      vector match {
        case DenseVector(vs) =>
          val values = vs.clone()
          val size = values.length
          var i = 0
          while (i < size) {
            values(i) /= norm
            i += 1
          }
          Vectors.dense(values)
        case SparseVector(size, ids, vs) =>
          val values = vs.clone()
          val nnz = values.length
          var i = 0
          while (i < nnz) {
            values(i) /= norm
            i += 1
          }
          Vectors.sparse(size, ids, values)
        case v => throw new IllegalArgumentException("Do not support vector type " + v.getClass)
      }
    } else {
      // Since the norm is zero, return the input vector object itself.
      // Note that it's safe since we always assume that the data in RDD
      // should be immutable.
      vector
    }
  }

Example 47
Source File: KMeansModel.scala    From drizzle-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.clustering

import scala.collection.JavaConverters._

import org.json4s._
import org.json4s.JsonDSL._
import org.json4s.jackson.JsonMethods._

import org.apache.spark.SparkContext
import org.apache.spark.annotation.Since
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.mllib.pmml.PMMLExportable
import org.apache.spark.mllib.util.{Loader, Saveable}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{Row, SparkSession}

  /**
   * Computes the clustering cost of `data`: the sum of `KMeans.pointCost` over the points,
   * evaluated against the broadcast cluster centers.
   *
   * NOTE(review): the statement below is garbled in this listing -- the broadcast
   * assignment and the per-point cost computation appear to have been fused onto one line
   * and the closing brace is missing; confirm against the original source.
   */
  def computeCost(data: RDD[Vector]): Double = {
    val centersWithNorm = clusterCentersWithNorm
    val bcCentersWithNorm = data.context.broadcast(centersWithNorm) => KMeans.pointCost(bcCentersWithNorm.value, new VectorWithNorm(p))).sum()

  // Cluster centers wrapped as VectorWithNorm values, as consumed by computeCost.
  // NOTE(review): the expression is truncated in this listing (the collection being mapped
  // is missing); presumably it wraps each cluster center -- confirm.
  private def clusterCentersWithNorm: Iterable[VectorWithNorm] = VectorWithNorm(_))

  // Saves this model under `path`.
  // NOTE(review): the callee is missing from this listing; only the trailing arguments
  // `this, path` survive, so the actual save routine must be confirmed.
  override def save(sc: SparkContext, path: String): Unit = {, this, path)

  // Format version string reported for this model.
  override protected def formatVersion: String = "1.0"

/**
 * Companion object providing load support for KMeansModel and the version 1.0
 * save/load implementation.
 *
 * NOTE(review): this listing is incomplete -- closing braces and several right-hand
 * sides (writing the cluster data, reading the centroids) are not visible here.
 */
object KMeansModel extends Loader[KMeansModel] {

  // Delegates to the version 1.0 loader.
  override def load(sc: SparkContext, path: String): KMeansModel = {
    KMeansModel.SaveLoadV1_0.load(sc, path)

  // One cluster center together with its index.
  private case class Cluster(id: Int, point: Vector)

  private object Cluster {
    // Builds a Cluster from a row holding the id in column 0 and the center in column 1.
    def apply(r: Row): Cluster = {
      Cluster(r.getInt(0), r.getAs[Vector](1))

  object SaveLoadV1_0 {

    private val thisFormatVersion = "1.0"

    val thisClassName = "org.apache.spark.mllib.clustering.KMeansModel"

    // Writes JSON metadata (class, version, k) as a single-partition text file under the
    // metadata path, then builds one Cluster record per cluster center.
    // NOTE(review): the step that persists `dataRDD` is not visible in this listing.
    def save(sc: SparkContext, model: KMeansModel, path: String): Unit = {
      val spark = SparkSession.builder().sparkContext(sc).getOrCreate()
      val metadata = compact(render(
        ("class" -> thisClassName) ~ ("version" -> thisFormatVersion) ~ ("k" -> model.k)))
      sc.parallelize(Seq(metadata), 1).saveAsTextFile(Loader.metadataPath(path))
      val dataRDD = sc.parallelize(model.clusterCenters.zipWithIndex).map { case (point, id) =>
        Cluster(id, point)

    // Loads the metadata, checks class name and format version, and verifies that the
    // number of loaded centroids equals the stored `k`.
    // NOTE(review): the expressions that read `centroids` / `localCentroids` and the tail
    // of the final constructor call are missing from this listing.
    def load(sc: SparkContext, path: String): KMeansModel = {
      implicit val formats = DefaultFormats
      val spark = SparkSession.builder().sparkContext(sc).getOrCreate()
      val (className, formatVersion, metadata) = Loader.loadMetadata(sc, path)
      assert(className == thisClassName)
      assert(formatVersion == thisFormatVersion)
      val k = (metadata \ "k").extract[Int]
      val centroids =
      val localCentroids =
      assert(k == localCentroids.length)
      new KMeansModel(localCentroids.sortBy(
Example 48
Source File: GaussianMixtureModelWrapper.scala    From drizzle-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.api.python

import scala.collection.JavaConverters

import org.apache.spark.SparkContext
import org.apache.spark.mllib.clustering.GaussianMixtureModel
import org.apache.spark.mllib.linalg.{Vector, Vectors}

  // Serialized (Array[Byte]) form of the model's Gaussian components; each component is
  // turned into an Array[Any] that includes its `sigma`.
  // NOTE(review): this listing is garbled -- the collection being mapped, the first array
  // element and the serialization step are not visible; confirm against the original.
  val gaussians: Array[Byte] = {
    val modelGaussians = { gaussian =>
      Array[Any](, gaussian.sigma)

  // Soft prediction for a single point, returned as a Vector.
  // NOTE(review): the body of this method is missing from this listing.
  def predictSoft(point: Vector): Vector = {

  // Saves the wrapped model under `path`.
  // NOTE(review): the callee is missing from this listing; only `, path)` survives.
  def save(sc: SparkContext, path: String): Unit =, path)
Example 49
Source File: Word2VecModelWrapper.scala    From drizzle-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.api.python

import java.util.{List => JList, Map => JMap}

import scala.collection.JavaConverters._

import org.apache.spark.SparkContext
import org.apache.spark.mllib.feature.Word2VecModel
import org.apache.spark.mllib.linalg.{Vector, Vectors}

  /**
   * Finds the words closest to the given vector.
   *
   * @param vector the query vector
   * @param num the number of synonyms requested from the model
   * @return the model's result converted by `prepareResult` into a Java list
   */
  def findSynonyms(vector: Vector, num: Int): JList[Object] = {
    prepareResult(model.findSynonyms(vector, num))
  }

  // Converts (word, similarity) pairs into a two-element Java list: the words first, then
  // the similarities as a dense vector.
  // NOTE(review): the right-hand sides of `similarity` and `words` are truncated in this
  // listing; presumably they project the second and first tuple members -- confirm.
  private def prepareResult(result: Array[(String, Double)]) = {
    val similarity = Vectors.dense(
    val words =
    List(words, similarity).map(_.asInstanceOf[Object]).asJava

  // Returns a Java map from String keys to Java lists of floats.
  // NOTE(review): the source collection and the conversion to a Java map are missing from
  // this listing; only the per-entry conversion of the value to a Java list is visible.
  def getVectors: JMap[String, JList[Float]] = { { case (k, v) =>
      (k, v.toList.asJava)

  // Saves the wrapped model under `path`.
  // NOTE(review): the callee is missing from this listing; only `, path)` survives.
  def save(sc: SparkContext, path: String): Unit =, path)
Example 50
Source File: MultivariateGaussian.scala    From drizzle-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.stat.distribution

import breeze.linalg.{diag, eigSym, max, DenseMatrix => DBM, DenseVector => DBV, Vector => BV}

import org.apache.spark.annotation.{DeveloperApi, Since}
import org.apache.spark.mllib.linalg.{Matrices, Matrix, Vector, Vectors}
import org.apache.spark.mllib.util.MLUtils

  /**
   * Derives two constants from the covariance matrix `sigma` via its symmetric
   * eigendecomposition.
   *
   * @return a pair of (root pseudo-inverse of the eigenvalue matrix times `u.t`,
   *         -0.5 * (mu.size * log(2 * pi) + log pseudo-determinant of sigma))
   *
   * NOTE(review): the `pinvS` line is garbled in this listing (the receiver of the
   * mapping over the eigenvalues is missing) and the closing braces are not visible.
   */
  private def calculateCovarianceConstants: (DBM[Double], Double) = {
    val eigSym.EigSym(d, u) = eigSym(sigma.asBreeze.toDenseMatrix) // sigma = u * diag(d) * u.t

    // For numerical stability, values are considered to be non-zero only if they exceed tol.
    // This prevents any inverted value from exceeding (eps * n * max(d))^-1
    val tol = MLUtils.EPSILON * max(d) * d.length

    try {
      // log(pseudo-determinant) is sum of the logs of all non-zero singular values
      val logPseudoDetSigma = d.activeValuesIterator.filter(_ > tol).map(math.log).sum

      // calculate the root-pseudo-inverse of the diagonal matrix of singular values
      // by inverting the square root of all non-zero values
      val pinvS = diag(new DBV( => if (v > tol) math.sqrt(1.0 / v) else 0.0).toArray))

      (pinvS * u.t, -0.5 * (mu.size * math.log(2.0 * math.Pi) + logPseudoDetSigma))
    } catch {
      // Rethrown as an argument error with a covariance-specific message.
      case uex: UnsupportedOperationException =>
        throw new IllegalArgumentException("Covariance matrix has no non-zero singular values")
Example 51
Source File: PearsonCorrelation.scala    From drizzle-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.stat.correlation

import breeze.linalg.{DenseMatrix => BDM}

import org.apache.spark.internal.Logging
import org.apache.spark.mllib.linalg.{Matrices, Matrix, Vector}
import org.apache.spark.mllib.linalg.distributed.RowMatrix
import org.apache.spark.rdd.RDD

  /**
   * Converts a covariance matrix into a Pearson correlation matrix. The Breeze view of
   * the input is overwritten in place: diagonals are first replaced by standard
   * deviations, off-diagonals by cov(i, j) / (sigma_i * sigma_j), and finally the
   * diagonal is set to 1.0.
   *
   * NOTE(review): this listing is incomplete -- the value produced in the zero-variance
   * branch (presumably NaN, given `containNaN` and the warning text), the final return
   * expression and all closing braces are not visible; confirm against the original.
   */
  def computeCorrelationMatrixFromCovariance(covarianceMatrix: Matrix): Matrix = {
    val cov = covarianceMatrix.asBreeze.asInstanceOf[BDM[Double]]
    val n = cov.cols

    // Compute the standard deviation on the diagonals first
    var i = 0
    while (i < n) {
      // TODO remove once covariance numerical issue resolved.
      // Variances within the closeToZero threshold are treated as exactly zero.
      cov(i, i) = if (closeToZero(cov(i, i))) 0.0 else math.sqrt(cov(i, i))
      i +=1

    // Loop through columns since cov is column major
    var j = 0
    var sigma = 0.0
    var containNaN = false
    while (j < n) {
      sigma = cov(j, j)
      i = 0
      // Only the entries with i < j are computed; each result is mirrored to (j, i).
      while (i < j) {
        val corr = if (sigma == 0.0 || cov(i, i) == 0.0) {
          containNaN = true
        } else {
          cov(i, j) / (sigma * cov(i, i))
        cov(i, j) = corr
        cov(j, i) = corr
        i += 1
      j += 1

    // put 1.0 on the diagonals
    i = 0
    while (i < n) {
      cov(i, i) = 1.0
      i +=1

    if (containNaN) {
      logWarning("Pearson correlation matrix contains NaN values.")


  /**
   * Tells whether a value is close enough to zero to be treated as zero.
   *
   * @param value the value to test
   * @param threshold the largest absolute value still considered zero (inclusive)
   * @return true if `|value| <= threshold`; false otherwise, including for NaN
   */
  private def closeToZero(value: Double, threshold: Double = 1e-12): Boolean = {
    math.abs(value) <= threshold
  }
Example 52
Source File: SpearmanCorrelation.scala    From drizzle-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.stat.correlation

import scala.collection.mutable.ArrayBuffer

import org.apache.spark.internal.Logging
import org.apache.spark.mllib.linalg.{Matrix, Vector, Vectors}
import org.apache.spark.rdd.RDD

  /**
   * Computes a rank-based correlation matrix for the rows in `X`: values are replaced by
   * their per-column ranks (ties receive the average rank) before the correlation is
   * computed.
   *
   * NOTE(review): this listing is incomplete -- several collection operations, the values
   * returned from the flatMap branches, the closing braces and the tail of the method
   * after `groupedRanks` are not visible; confirm against the original source.
   */
  override def computeCorrelationMatrix(X: RDD[Vector]): Matrix = {
    // ((columnIndex, value), rowUid)
    val colBased = X.zipWithUniqueId().flatMap { case (vec, uid) => { case (v, j) =>
        ((j, v), uid)
    // global sort by (columnIndex, value)
    val sorted = colBased.sortByKey()
    // assign global ranks (using average ranks for tied values)
    val globalRanks = sorted.zipWithIndex().mapPartitions { iter =>
      // State of the current run of tied values: its column, its value, the rank of its
      // first element and the row ids collected so far.
      var preCol = -1
      var preVal = Double.NaN
      var startRank = -1.0
      var cachedUids = ArrayBuffer.empty[Long]
      // Emits (rowUid, (column, averageRank)) for every cached row id of the current run.
      val flush: () => Iterable[(Long, (Int, Double))] = () => {
        val averageRank = startRank + (cachedUids.size - 1) / 2.0
        val output = { uid =>
          (uid, (preCol, averageRank))
      iter.flatMap { case (((j, v), uid), rank) =>
        // If we see a new value or cachedUids is too big, we flush ids with their average rank.
        if (j != preCol || v != preVal || cachedUids.size >= 10000000) {
          val output = flush()
          preCol = j
          preVal = v
          startRank = rank
          cachedUids += uid
        } else {
          cachedUids += uid
      } ++ flush()
    // Replace values in the input matrix by their ranks compared with values in the same column.
    // Note that shifting all ranks in a column by a constant value doesn't affect result.
    val groupedRanks = globalRanks.groupByKey().map { case (uid, iter) =>
      // sort by column index and then convert values to a vector
Example 53
Source File: Updater.scala    From drizzle-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.optimization

import scala.math._

import breeze.linalg.{axpy => brzAxpy, norm => brzNorm, Vector => BV}

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.mllib.linalg.{Vector, Vectors}

/**
 * Updater that applies a gradient step together with squared-L2 regularization and
 * reports the regularization value of the updated weights.
 */
class SquaredL2Updater extends Updater {
  /**
   * Computes the updated weights for one iteration.
   *
   * @param weightsOld the weights before the update
   * @param gradient the gradient of the loss
   * @param stepSize the base step size, scaled by 1 / sqrt(iter) for this iteration
   * @param iter the iteration number
   * @param regParam the regularization parameter
   * @return the updated weights and the regularization value
   *         0.5 * regParam * ||weights||^2 evaluated at the updated weights
   */
  override def compute(
      weightsOld: Vector,
      gradient: Vector,
      stepSize: Double,
      iter: Int,
      regParam: Double): (Vector, Double) = {
    // add up both updates from the gradient of the loss (= step) as well as
    // the gradient of the regularizer (= regParam * weightsOld)
    // w' = w - thisIterStepSize * (gradient + regParam * w)
    // w' = (1 - thisIterStepSize * regParam) * w - thisIterStepSize * gradient
    val thisIterStepSize = stepSize / math.sqrt(iter)
    val brzWeights: BV[Double] = weightsOld.asBreeze.toDenseVector
    brzWeights :*= (1.0 - thisIterStepSize * regParam)
    brzAxpy(-thisIterStepSize, gradient.asBreeze, brzWeights)
    val norm = brzNorm(brzWeights, 2.0)

    (Vectors.fromBreeze(brzWeights), 0.5 * regParam * norm * norm)
  }
}
Example 54
Source File: GLMClassificationModel.scala    From drizzle-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.classification.impl

import org.json4s.JsonDSL._
import org.json4s.jackson.JsonMethods._

import org.apache.spark.SparkContext
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.mllib.util.Loader
import org.apache.spark.sql.{Row, SparkSession}

    /**
     * Loads the weights, intercept and optional threshold of a classification model from
     * the data path under `path`. Asserts that exactly one row with three columns is found.
     *
     * @param modelClass model class name, used only in the assertion messages
     *
     * NOTE(review): this listing is garbled -- the expression that reads `dataRDD`, the
     * receiver of the `select`, both branches of the `threshold` computation and the
     * closing braces are not visible; confirm against the original source.
     */
    def loadData(sc: SparkContext, path: String, modelClass: String): Data = {
      val dataPath = Loader.dataPath(path)
      val spark = SparkSession.builder().sparkContext(sc).getOrCreate()
      val dataRDD =
      val dataArray ="weights", "intercept", "threshold").take(1)
      assert(dataArray.length == 1, s"Unable to load $modelClass data from: $dataPath")
      val data = dataArray(0)
      assert(data.size == 3, s"Unable to load $modelClass data from: $dataPath")
      val (weights, intercept) = data match {
        case Row(weights: Vector, intercept: Double, _) =>
          (weights, intercept)
      // Column 2 (the threshold) may be null.
      val threshold = if (data.isNullAt(2)) {
      } else {
      Data(weights, intercept, threshold)
Example 55
Source File: LabeledPoint.scala    From drizzle-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.regression

import scala.beans.BeanInfo

import org.apache.spark.annotation.Since
import{LabeledPoint => NewLabeledPoint}
import org.apache.spark.mllib.linalg.{Vector, Vectors}
import org.apache.spark.mllib.util.NumericParser
import org.apache.spark.SparkException

  /**
   * Parses a string into a LabeledPoint.
   *
   * Two formats are accepted: the parenthesized form handled by `NumericParser`
   * (a label followed by a vector), and the older dense form `label,f1 f2 ...`
   * with a comma after the label and space-separated feature values.
   *
   * @param s the text to parse
   * @return the parsed labeled point
   * @throws SparkException if a parenthesized input does not have the
   *                        (label, vector) shape
   */
  def parse(s: String): LabeledPoint = {
    if (s.startsWith("(")) {
      NumericParser.parse(s) match {
        case Seq(label: Double, numeric: Any) =>
          LabeledPoint(label, Vectors.parseNumeric(numeric))
        case other =>
          throw new SparkException(s"Cannot parse $other.")
      }
    } else { // dense format used before v1.0
      val parts = s.split(',')
      val label = java.lang.Double.parseDouble(parts(0))
      val features = Vectors.dense(parts(1).trim().split(' ').map(java.lang.Double.parseDouble))
      LabeledPoint(label, features)
    }
  }

  /**
   * Converts a labeled point of the new ML API into this package's LabeledPoint.
   *
   * @param point the ML labeled point
   * @return a labeled point with the same label and the features converted by
   *         `Vectors.fromML`
   */
  private[spark] def fromML(point: NewLabeledPoint): LabeledPoint = {
    LabeledPoint(point.label, Vectors.fromML(point.features))
  }
Example 56
Source File: GLMRegressionModel.scala    From drizzle-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.regression.impl

import org.json4s.JsonDSL._
import org.json4s.jackson.JsonMethods._

import org.apache.spark.SparkContext
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.mllib.util.Loader
import org.apache.spark.sql.{Row, SparkSession}

    /**
     * Loads the weights and intercept of a regression model from the data path under
     * `path`. Asserts that exactly one row with two columns is found and that the number
     * of weights equals `numFeatures`.
     *
     * @param modelClass model class name, used only in the assertion messages
     * @param numFeatures expected number of weights
     *
     * NOTE(review): this listing is garbled -- the expression that reads `dataRDD`, the
     * receiver of the `select` and the closing braces are not visible; confirm against
     * the original source.
     */
    def loadData(sc: SparkContext, path: String, modelClass: String, numFeatures: Int): Data = {
      val dataPath = Loader.dataPath(path)
      val spark = SparkSession.builder().sparkContext(sc).getOrCreate()
      val dataRDD =
      val dataArray ="weights", "intercept").take(1)
      assert(dataArray.length == 1, s"Unable to load $modelClass data from: $dataPath")
      val data = dataArray(0)
      assert(data.size == 2, s"Unable to load $modelClass data from: $dataPath")
      data match {
        case Row(weights: Vector, intercept: Double) =>
          assert(weights.size == numFeatures, s"Expected $numFeatures features, but" +
            s" found ${weights.size} features when loading $modelClass weights from $dataPath")
          Data(weights, intercept)

Example 57
Source File: IDFSuite.scala    From drizzle-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.feature

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors}
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.mllib.util.TestingUtils._

/**
 * Tests for the RDD-based IDF: the fitted IDF weights and the TF-IDF vectors produced
 * from three small term-frequency vectors.
 *
 * NOTE(review): this listing is incomplete -- closing delimiters, the expressions that
 * fit the model (`val model =`), the local transform (`val localTfidf =`), the zero
 * branch of the minDocFreq expectation and the calls to `assertHelper` are not visible.
 */
class IDFSuite extends SparkFunSuite with MLlibTestSparkContext {

  test("idf") {
    val n = 4
    val localTermFrequencies = Seq(
      Vectors.sparse(n, Array(1, 3), Array(1.0, 2.0)),
      Vectors.dense(0.0, 1.0, 2.0, 3.0),
      Vectors.sparse(n, Array(1), Array(1.0))
    val m = localTermFrequencies.size
    val termFrequencies = sc.parallelize(localTermFrequencies, 2)
    val idf = new IDF
    val model =
    // Array(0, 3, 1, 2) is the number of input vectors with a non-zero entry for each of
    // the four terms; the expected weight is log((m + 1) / (docFreq + 1)).
    val expected = Vectors.dense(Array(0, 3, 1, 2).map { x =>
      math.log((m + 1.0) / (x + 1.0))
    assert(model.idf ~== expected absTol 1e-12)

    // Checks the three TF-IDF vectors: each keeps the representation (sparse/dense) and
    // indices of its input, with values scaled by the expected IDF weights.
    val assertHelper = (tfidf: Array[Vector]) => {
      assert(tfidf.size === 3)
      val tfidf0 = tfidf(0).asInstanceOf[SparseVector]
      assert(tfidf0.indices === Array(1, 3))
      assert(Vectors.dense(tfidf0.values) ~==
          Vectors.dense(1.0 * expected(1), 2.0 * expected(3)) absTol 1e-12)
      val tfidf1 = tfidf(1).asInstanceOf[DenseVector]
      assert(Vectors.dense(tfidf1.values) ~==
          Vectors.dense(0.0, 1.0 * expected(1), 2.0 * expected(2), 3.0 * expected(3)) absTol 1e-12)
      val tfidf2 = tfidf(2).asInstanceOf[SparseVector]
      assert(tfidf2.indices === Array(1))
      assert(tfidf2.values(0) ~== (1.0 * expected(1)) absTol 1e-12)
    // Transforms a RDD
    val tfidf = model.transform(termFrequencies).collect()
    // Transforms local vectors
    val localTfidf =

  test("idf minimum document frequency filtering") {
    val n = 4
    val localTermFrequencies = Seq(
      Vectors.sparse(n, Array(1, 3), Array(1.0, 2.0)),
      Vectors.dense(0.0, 1.0, 2.0, 3.0),
      Vectors.sparse(n, Array(1), Array(1.0))
    val m = localTermFrequencies.size
    val termFrequencies = sc.parallelize(localTermFrequencies, 2)
    val idf = new IDF(minDocFreq = 1)
    val model =
    // Same counts as in the previous test; terms with a count of zero take the `else`
    // branch instead of the log formula.
    // NOTE(review): the value of the `else` branch is missing from this listing.
    val expected = Vectors.dense(Array(0, 3, 1, 2).map { x =>
      if (x > 0) {
        math.log((m + 1.0) / (x + 1.0))
      } else {
    assert(model.idf ~== expected absTol 1e-12)

    val assertHelper = (tfidf: Array[Vector]) => {
      assert(tfidf.size === 3)
      val tfidf0 = tfidf(0).asInstanceOf[SparseVector]
      assert(tfidf0.indices === Array(1, 3))
      assert(Vectors.dense(tfidf0.values) ~==
          Vectors.dense(1.0 * expected(1), 2.0 * expected(3)) absTol 1e-12)
      val tfidf1 = tfidf(1).asInstanceOf[DenseVector]
      assert(Vectors.dense(tfidf1.values) ~==
          Vectors.dense(0.0, 1.0 * expected(1), 2.0 * expected(2), 3.0 * expected(3)) absTol 1e-12)
      val tfidf2 = tfidf(2).asInstanceOf[SparseVector]
      assert(tfidf2.indices === Array(1))
      assert(tfidf2.values(0) ~== (1.0 * expected(1)) absTol 1e-12)
    // Transforms a RDD
    val tfidf = model.transform(termFrequencies).collect()
    // Transforms local vectors
    val localTfidf =

Example 58
Source File: PCASuite.scala    From drizzle-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.feature

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.linalg.{Vector, Vectors}
import org.apache.spark.mllib.linalg.distributed.RowMatrix
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.mllib.util.TestingUtils._

/**
 * Test for the PCA wrapper: its projection and explained variance must agree with the
 * values computed directly through RowMatrix.
 *
 * NOTE(review): this listing is incomplete -- closing delimiters and the pairing of
 * `pca_transform` with `mat_multiply` ahead of the comparison are not visible here.
 */
class PCASuite extends SparkFunSuite with MLlibTestSparkContext {

  // Three 5-dimensional input rows: one sparse, two dense.
  private val data = Array(
    Vectors.sparse(5, Seq((1, 1.0), (3, 7.0))),
    Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0),
    Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0)

  private lazy val dataRDD = sc.parallelize(data, 2)

  test("Correct computing use a PCA wrapper") {
    // The number of principal components is set to the number of rows.
    val k = dataRDD.count().toInt
    val pca = new PCA(k).fit(dataRDD)

    // Reference result computed directly on the row matrix.
    val mat = new RowMatrix(dataRDD)
    val (pc, explainedVariance) = mat.computePrincipalComponentsAndExplainedVariance(k)

    val pca_transform = pca.transform(dataRDD).collect()
    // NOTE(review): the expression joining the two result arrays before the comparison
    // below was lost in this listing.
    val mat_multiply = mat.multiply(pc).rows.collect() { case (calculated, expected) =>
      assert(calculated ~== expected relTol 1e-8)
    assert(pca.explainedVariance ~== explainedVariance relTol 1e-8)
Example 59
Source File: DLEstimatorBase.scala    From BigDL   with Apache License 2.0 5 votes vote down vote up

import{HasFeaturesCol, HasLabelCol, HasPredictionCol}
import org.apache.spark.mllib.linalg.{Vector, VectorUDT}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.types._
import org.apache.spark.sql.{DataFrame, Row}

abstract class DLEstimatorBase[Learner <: DLEstimatorBase[Learner, M],
    M <: DLTransformerBase[M]]
  extends Estimator[M] with HasLabelCol {

  protected def internalFit(dataFrame: DataFrame): M

  override def fit(dataFrame: DataFrame): M = {
    transformSchema(dataFrame.schema, logging = true)

  override def copy(extra: ParamMap): Learner = defaultCopy(extra)
Example 60
Source File: TestMPSLinearProgramSolver.scala    From spark-lp   with Apache License 2.0 5 votes vote down vote up
/**
 * Command-line example that builds a linear program from an MPS file (path taken from
 * `args(0)`), hands it to `LP.solve` and prints the optimal value.
 *
 * NOTE(review): several statements below are truncated in this listing (the parser is
 * never applied to the file, and the `parallelize` expressions are incomplete).
 */
object TestMPSLinearProgramSolver {
  def main(args: Array[String]) {

    // Default SparkConf: no master or app name is set in the visible code.
    val conf = new SparkConf()

    val sc = new SparkContext(conf)

    // Parse the provided MPS file.
    val parser = new MPSParser()
    val mpsFile = new File(args(0))

    // Convert the parsed linear program to standard form.
    val converter = new LPStandardConverter(true)

    // Convert the parameters of the linear program to spark lp compatible formats.
    val numPartitions = 2
    // NOTE(review): truncated -- the step turning the parallelized array into DenseVector
    // partitions is missing here.
    val c: DVector = sc.parallelize(converter.getStandardC.toArray, numPartitions) DenseVector(_))
    // NOTE(review): truncated -- the source collection of matrix rows is missing here.
    val B: DMatrix = sc.parallelize(
      Vectors.dense(_).toSparse: Vector), numPartitions)
    val b = new DenseVector(converter.getStandardB.toArray)
    println("Start solving ... ")
    val (optimalVal, optimalX) = LP.solve(c, B, b, sc=sc)
    println("optimalVal: " + optimalVal)
    // Printing of the solution vector is disabled.
    //println("optimalX: " + optimalX.collectElements.mkString(", "))

Example 61
Source File: package.scala    From spark-lp   with Apache License 2.0 5 votes vote down vote up
  /**
   * `VectorSpace` instance for `DVector`, implemented by applying the corresponding
   * `DenseVectorSpace` operation to each pair of parts.
   *
   * NOTE(review): the expressions that pair up the parts of `a` and `b` (and the body of
   * `dot`) are missing from this listing, as are most closing delimiters -- restore from
   * the upstream file.
   */
  implicit object DVectorSpace extends VectorSpace[DVector] {

    /** Linear combination of `a` and `b` with coefficients `alpha` and `beta`. */
    override def combine(alpha: Double, a: DVector, beta: Double, b: DVector): DVector =
      // Special case for a plain sum: accumulate in place with BLAS.axpy.
      if (alpha == 1.0 && beta == 1.0) { {
          case (aPart, bPart) => {
            BLAS.axpy(1.0, aPart, bPart) // bPart += aPart
      } else { {
          case (aPart, bPart) =>
            // NOTE A DenseVector result is assumed here (not sparse safe).
            DenseVectorSpace.combine(alpha, aPart, beta, bPart).toDense

    // NOTE(review): body missing in this listing.
    override def dot(a: DVector, b: DVector): Double =

    /** Entrywise product, delegated per part to `DenseVectorSpace.entrywiseProd`. */
    override def entrywiseProd(a: DVector, b: DVector): DVector = { {
        case (aPart, bPart) =>
          DenseVectorSpace.entrywiseProd(aPart, bPart).toDense

    /** Delegated per part to `DenseVectorSpace.entrywiseNegDiv`. */
    override def entrywiseNegDiv(a: DVector, b: DVector): DVector = { {
        case (aPart, bPart) =>
            DenseVectorSpace.entrywiseNegDiv(aPart, bPart)

    /** Sum of all values: per-part sums are added up, starting from 0.0. */
    override def sum(a: DVector): Double = a.aggregate(0.0)(
      seqOp = (acc: Double, v: DenseVector) => acc + v.values.sum,
      combOp = (acc1: Double, acc2: Double) => acc1 + acc2

    /** Smallest value over all parts; the aggregate starts from positive infinity. */
    override def min(a: DVector): Double = a.aggregate(Double.PositiveInfinity)(
      (mi, x) => Math.min(mi, x.values.min), Math.min

    /** Largest value over all parts; the aggregate starts from negative infinity. */
    override def max(a: DVector): Double = a.aggregate(Double.NegativeInfinity)(
      (ma, x) => Math.max(ma, x.values.max), Math.max

    // Only acts when `a` has no storage level yet; the action itself is missing in this
    // listing.
    override def cache(a: DVector): Unit =
      if (a.getStorageLevel == StorageLevel.NONE) {
Example 62
Source File: SpLinopMatrix.scala    From spark-lp   with Apache License 2.0 5 votes vote down vote up
  override def apply(mat: DMatrix): DMatrix = {
    dvector.zipPartitions(mat)((vectorPartition, matPartition) => {
          case (a: Double, x: Vector) =>
            val xc = x.copy
              BLAS.scal(a, xc)
Example 63
Source File: InitializeSuite.scala    From spark-lp   with Apache License 2.0 5 votes vote down vote up

package org.apache.spark.mllib.optimization.lp

import org.scalatest.FunSuite

import org.apache.spark.SparkContext
import org.apache.spark.mllib.linalg.{DenseVector, Vector, Vectors}
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.mllib.util.TestingUtils._
import org.apache.spark.mllib.optimization.lp.VectorSpace._
import org.apache.spark.mllib.optimization.lp.vs.dvector.DVectorSpace
import org.apache.spark.mllib.optimization.lp.vs.vector.DenseVectorSpace
import breeze.linalg.{DenseMatrix => BDM, DenseVector => BDV, _}
import org.apache.spark.mllib.optimization.tfocs.VectorSpace.{DMatrix, DVector}

/**
 * Compares the starting point returned by `Initialize.init` with a reference computed
 * locally with Breeze on the same small linear program.
 *
 * NOTE(review): closing delimiters are missing from this listing and the definition of
 * `c` is truncated.
 */
class InitializeSuite extends FunSuite with MLlibTestSparkContext {

  val numPartitions = 2
  // Problem data: cost vector (7 entries), 7 rows of 5 coefficients, right-hand side (5).
  val cArray = Array(2.0, 1.5, 0.0, 0.0, 0.0, 0.0, 0.0)
  val BArray = Array(
    Array(12.0, 16.0, 30.0, 1.0, 0.0),
    Array(24.0, 16.0, 12.0, 0.0, 1.0),
    Array(-1.0, 0.0, 0.0, 0.0, 0.0),
    Array(0.0, -1.0, 0.0, 0.0, 0.0),
    Array(0.0, 0.0, -1.0, 0.0, 0.0),
    Array(0.0, 0.0, 0.0, 1.0, 0.0),
    Array(0.0, 0.0, 0.0, 0.0, 1.0))
  val bArray = Array(120.0, 120.0, 120.0, 15.0, 15.0)

  // Spark-side inputs are lazy so `sc` is only used once a test runs.
  // NOTE(review): the next line is truncated -- the step building DenseVector partitions
  // from the parallelized array is missing.
  lazy val c: DVector = sc.parallelize(cArray, numPartitions) DenseVector(_))
  lazy val rows: DMatrix = sc.parallelize(BArray, numPartitions).map(Vectors.dense(_))
  lazy val b: DenseVector = new DenseVector(bArray)

  // Breeze copies of the same data, used to compute the expected values locally.
  val cBrz = new BDV[Double](cArray)
  // 7 x 5 matrix over the flattened rows of BArray (stride 5, isTranspose = true).
  val BBrz = new BDM[Double](7, 5,
    BArray.flatMap(x => x),
    offset = 0,
    majorStride = 5,
    isTranspose = true)
  val bBrz = new BDV[Double](bArray)
  // (BT * B) ^(-1)
  val BTBInv = inv(BBrz.t * BBrz)
  // xTilda = B * BTBInv * b
  val xTilda: BDV[Double] = BBrz * (BTBInv * bBrz)
  // lambdaTilda = BTBInv * (B^T * c)
  val lambdaTilda: BDV[Double] = BTBInv * (BBrz.t * cBrz)
  // sTilda = c - B * lambdaTilda
  val sTilda = cBrz - BBrz * lambdaTilda
  // Non-negative shifts derived from the largest entry of each vector.
  val deltax = Math.max(1.5 * max(xTilda), 0)
  val deltas = Math.max(1.5 * max(sTilda), 0)
  // Shift every entry by the corresponding delta.
  val xHat = xTilda :+ deltax
  val sHat = sTilda :+ deltas
  // Second correction: half the inner product of xHat and sHat, scaled by the sum of the
  // other vector.
  val deltaxHat: Double = 0.5 * (xHat.t * sHat) / sum(sHat)
  val deltasHat: Double = 0.5 * (xHat.t * sHat) / sum(xHat)
  // x = xHat + deltaxHat * e
  val expectedx: BDV[Double] = xHat :+ deltaxHat
  // val expectedLambda = lambdaTilda
  val expecteds: BDV[Double] = sHat :+ deltasHat

  test("Initialize.init is implemented properly") {

    // result is read as a triple: _1 is compared with expectedx, _2 with lambdaTilda and
    // _3 with expecteds.
    val result = Initialize.init(c, rows, b)
    //println(LP.solve(c, rows, b, 1e-4, 1).collect())
    // NOTE(review): these assertions use `~=`, whereas LPSuite uses `~==` -- confirm which
    // comparison operator is intended.
    assert(Vectors.dense(expectedx.toArray) ~= Vectors.dense(result._1.flatMap(_.toArray).collect()) relTol 1e-6,
      "Initialize.init x0 is not computed correctly.")
    assert(Vectors.dense(lambdaTilda.toArray) ~= Vectors.dense(result._2.toArray) relTol 1e-6,
      "Initialize.init lambda0 is not computed correctly.")
    assert(Vectors.dense(expecteds.toArray) ~= Vectors.dense(result._3.flatMap(_.toArray).collect()) relTol 1e-6,
      "Initialize.init s0 should return the correct answer.")
Example 64
Source File: LPSuite.scala    From spark-lp   with Apache License 2.0 5 votes vote down vote up

package org.apache.spark.mllib.optimization.lp

import org.scalatest.FunSuite

import org.apache.spark.SparkContext
import org.apache.spark.mllib.linalg.{DenseVector, Vector, Vectors}
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.mllib.util.TestingUtils._
import org.apache.spark.mllib.optimization.lp.VectorSpace._
import org.apache.spark.mllib.optimization.lp.vs.dvector.DVectorSpace
import org.apache.spark.mllib.optimization.lp.vs.vector.DenseVectorSpace

/**
 * End-to-end check of `LP.solve` on a small linear program with a known solution.
 *
 * NOTE(review): closing delimiters are missing from this listing and the definition of
 * `c` is truncated.
 */
class LPSuite extends FunSuite with MLlibTestSparkContext {

  val numPartitions = 2
  // Problem data: cost vector (7 entries), 7 rows of 5 coefficients, right-hand side (5).
  val cArray = Array(2.0, 1.5, 0.0, 0.0, 0.0, 0.0, 0.0)
  val BArray = Array(
    Array(12.0, 16.0, 30.0, 1.0, 0.0),
    Array(24.0, 16.0, 12.0, 0.0, 1.0),
    Array(-1.0, 0.0, 0.0, 0.0, 0.0),
    Array(0.0, -1.0, 0.0, 0.0, 0.0),
    Array(0.0, 0.0, -1.0, 0.0, 0.0),
    Array(0.0, 0.0, 0.0, 1.0, 0.0),
    Array(0.0, 0.0, 0.0, 0.0, 1.0))
  val bArray = Array(120.0, 120.0, 120.0, 15.0, 15.0)

  // Lazy so `sc` is only used once a test runs.
  // NOTE(review): the next line is truncated -- the step building DenseVector partitions
  // from the parallelized array is missing.
  lazy val c = sc.parallelize(cArray, numPartitions) DenseVector(_))
  lazy val rows = sc.parallelize(BArray, numPartitions).map(Vectors.dense(_))
  lazy val b = new DenseVector(bArray)

  test("LP solve is implemented properly") {
    // v is printed as the optimal value; x is flattened into a single local vector below.
    val (v, x) = LP.solve(c, rows, b, sc=sc)
    // solution obtained from scipy.optimize.linprog and octave glgk lpsolver with fun_val = 12.083
    val expectedSol = Vectors.dense(
      Array(1.66666667, 5.83333333, 40.0, 0.0, 0.0, 13.33333333, 9.16666667))
    val xx = Vectors.dense(x.flatMap(_.toArray).collect())
    println("optimal min value: " + v)
    assert(xx ~== expectedSol absTol 1e-6, "LP.solve x should return the correct answer.")


Example 65
Source File: BisectingKMeansModel.scala    From bisecting-kmeans   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.bisectingkmeans

import breeze.linalg.{Vector => BV, norm => breezeNorm}

import org.apache.spark.Logging
import org.apache.spark.mllib.linalg.{Vector, Vectors}
import org.apache.spark.rdd.RDD

  def toJavaLinkageMatrix: java.util.ArrayList[java.util.ArrayList[java.lang.Double]] = {
    val javaList = new java.util.ArrayList[java.util.ArrayList[java.lang.Double]]()
    this.node.toLinkageMatrix.foreach {x =>
      val row = new java.util.ArrayList[java.lang.Double]()
Example 66
Source File: TestMPSLinearProgram.scala    From spark-tfocs   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.optimization.tfocs.examples


import com.joptimizer.optimizers.LPStandardConverter
import com.joptimizer.util.MPSParser

import org.apache.spark.mllib.linalg.{ DenseVector, Vector, Vectors }
import org.apache.spark.mllib.optimization.tfocs.DVectorFunctions._
import org.apache.spark.mllib.optimization.tfocs.SolverSLP
import org.apache.spark.{ SparkConf, SparkContext }

/**
 * Command-line example that builds a linear program from an MPS file (path taken from
 * `args(0)`) and prints the solution vector found by the solver.
 *
 * NOTE(review): several statements below are truncated in this listing (the parser is
 * never applied to the file, the `parallelize` expressions are incomplete, and the
 * solver call has lost its callee and first arguments).
 */
object TestMPSLinearProgram {
  def main(args: Array[String]) {

    // Runs locally with two worker threads.
    val sparkConf = new SparkConf().setMaster("local[2]").setAppName("TestMPSLinearProgram")
    val sc = new SparkContext(sparkConf)

    // Parse the provided MPS file.
    val parser = new MPSParser()
    // NOTE(review): declared `var` but never reassigned in the visible code.
    var mpsFile = new File(args(0))

    // Convert the parsed linear program to standard form.
    val converter = new LPStandardConverter(true)

    // Convert the parameters of the linear program to spark tfocs compatible formats.
    // NOTE(review): truncated -- the step building DenseVector partitions is missing.
    val c = sc.parallelize(converter.getStandardC.toArray) DenseVector(_))
    // NOTE(review): truncated -- the source collection of matrix rows is missing.
    val A = sc.parallelize(
      Vectors.dense(_).toSparse: Vector))
    val b = new DenseVector(converter.getStandardB.toArray)
    val n = converter.getStandardN

    // Passed as the last argument of the solver call below.
    val mu = 1e-2

    // Solve the linear program using SolverSLP, finding the optimal x vector 'optimalX'.
    // NOTE(review): the callee and leading arguments of this call are missing.
    val (optimalX, _) =, A, b, mu)
    println("optimalX: " + optimalX.collectElements.mkString(", "))

Example 67
Source File: LocalLDAModel.scala    From spark-ml-serving   with Apache License 2.0 5 votes vote down vote up
package io.hydrosphere.spark_ml_serving.clustering

import io.hydrosphere.spark_ml_serving.TypedTransformerConverter
import io.hydrosphere.spark_ml_serving.common._
import io.hydrosphere.spark_ml_serving.common.utils.{DataUtils, ParamUtils}
import{LocalLDAModel => SparkLocalLDA}
import org.apache.spark.mllib.clustering.{LocalLDAModel => OldSparkLocalLDA}
import org.apache.spark.mllib.linalg.{Matrices, Matrix, Vector, Vectors}
import org.apache.spark.sql.SparkSession
import DataUtils._
import scala.reflect.runtime.universe

class LocalLDAModel(override val sparkTransformer: SparkLocalLDA)
  extends LocalTransformer[SparkLocalLDA] {

  lazy val oldModel: OldSparkLocalLDA = {
    val mirror     = universe.runtimeMirror(sparkTransformer.getClass.getClassLoader)
    val parentTerm = universe.typeOf[SparkLocalLDA].decl(universe.TermName("oldLocalModel")).asTerm

  override def transform(localData: LocalData): LocalData = {
    localData.column(sparkTransformer.getFeaturesCol) match {
      case Some(column) =>
        val newData =
      case None => localData

object LocalLDAModel
  extends SimpleModelLoader[SparkLocalLDA]
  with TypedTransformerConverter[SparkLocalLDA] {

  override def build(metadata: Metadata, data: LocalData): SparkLocalLDA = {
    val topics = DataUtils.constructMatrix(
      data.column("topicsMatrix")[Map[String, Any]]
    val gammaShape = data.column("gammaShape")[java.lang.Double]
    val topicConcentration =
    val docConcentration = DataUtils.constructVector(
      data.column("docConcentration")[Map[String, Any]]
    val vocabSize = data.column("vocabSize")[java.lang.Integer]

    val oldLdaCtor = classOf[OldSparkLocalLDA].getDeclaredConstructor(
    val oldLDA = oldLdaCtor.newInstance(

    val ldaCtor = classOf[SparkLocalLDA].getDeclaredConstructor(

    val lda = ldaCtor.newInstance(metadata.uid, vocabSize, oldLDA, null)

    ParamUtils.set(lda, lda.optimizer, metadata)
    ParamUtils.set(lda, lda.keepLastCheckpoint, metadata)
    ParamUtils.set(lda, lda.seed, metadata)
    ParamUtils.set(lda, lda.featuresCol, metadata)
    ParamUtils.set(lda, lda.learningDecay, metadata)
    ParamUtils.set(lda, lda.checkpointInterval, metadata)
    ParamUtils.set(lda, lda.learningOffset, metadata)
    ParamUtils.set(lda, lda.maxIter, metadata)
    ParamUtils.set(lda, lda.k, metadata)

  override implicit def toLocal(sparkTransformer: SparkLocalLDA): LocalTransformer[SparkLocalLDA] =
    new LocalLDAModel(sparkTransformer)

Example 68
Source File: MlLibOnKudu.scala    From Taxi360   with Apache License 2.0 5 votes vote down vote up
package com.hadooparchitecturebook.taxi360.etl.machinelearning.kudu

import com.hadooparchitecturebook.taxi360.model.{NyTaxiYellowTrip, NyTaxiYellowTripBuilder}
import org.apache.spark.mllib.clustering.KMeans
import org.apache.spark.mllib.linalg.{Matrix, Vector, Vectors}
import org.apache.spark.mllib.stat.Statistics
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SQLContext
import org.apache.spark.{SparkConf, SparkContext}

/**
 * Example job that runs KMeans, a Pearson correlation matrix and column statistics over
 * an RDD of vectors built from the `ny_taxi_trip_tmp` table.
 *
 * NOTE(review): this listing has lost lines (closing delimiters, the Kudu load and
 * temp-table registration, and the row-to-vector mapping).
 */
object MlLibOnKudu {
  def main(args: Array[String]): Unit = {

    // Print the expected arguments when none are given.
    // NOTE(review): no early return is visible after the usage message; the positional
    // reads below would fail on an empty array -- confirm against the upstream file.
    if (args.length == 0) {
      println("Args: <runLocal> " +
        "<kuduMaster> " +
        "<taxiTable> " +
        "<numOfCenters> " +
        "<numOfIterations> ")

    // Positional arguments; "l" (any case) as the first argument selects local mode.
    val runLocal = args(0).equalsIgnoreCase("l")
    val kuduMaster = args(1)
    val taxiTable = args(2)
    val numOfCenters = args(3).toInt
    val numOfIterations = args(4).toInt

    // Local mode uses a "local" master with broadcast/shuffle compression switched off;
    // otherwise only the application name is set on the configuration.
    val sc: SparkContext = if (runLocal) {
      val sparkConfig = new SparkConf()
      sparkConfig.set("spark.broadcast.compress", "false")
      sparkConfig.set("spark.shuffle.compress", "false")
      sparkConfig.set("spark.shuffle.spill.compress", "false")
      new SparkContext("local", "TableStatsSinglePathMain", sparkConfig)
    } else {
      val sparkConfig = new SparkConf().setAppName("TableStatsSinglePathMain")
      new SparkContext(sparkConfig)

    val sqlContext = new SQLContext(sc)

    // Connection options for the Kudu data source.
    // NOTE(review): the statement that loads the table with these options is truncated.
    val kuduOptions = Map(
      "kudu.table" -> taxiTable,
      "kudu.master" -> kuduMaster)"org.apache.kudu.spark.kudu").load.

    // One vector per row of ny_taxi_trip_tmp.
    // NOTE(review): the mapping from row to vector is missing in this listing.
    val vectorRDD:RDD[Vector] = sqlContext.sql("select * from ny_taxi_trip_tmp").map(r => {
      val taxiTrip =

    println("--Running KMeans")
    val clusters = KMeans.train(vectorRDD, numOfCenters, numOfIterations)
    println(" > vector centers:")
    clusters.clusterCenters.foreach(v => println(" >> " + v))

    println("--Running corr")
    val correlMatrix: Matrix = Statistics.corr(vectorRDD, "pearson")
    println(" > corr: " + correlMatrix.toString)

    println("--Running colStats")
    val colStats = Statistics.colStats(vectorRDD)
    println(" > max: " + colStats.max)
    println(" > count: " + colStats.count)
    println(" > mean: " + colStats.mean)
    println(" > min: " + colStats.min)
    println(" > normL1: " + colStats.normL1)
    println(" > normL2: " + colStats.normL2)
    println(" > numNonZeros: " + colStats.numNonzeros)
    println(" > variance: " + colStats.variance)

    //Labeled Points
Example 69
Source File: Util.scala    From spark-twitter-sentiment   with Apache License 2.0 5 votes vote down vote up
package com.dhruv

import org.apache.commons.cli.{Options, ParseException, PosixParser}
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.mllib.feature.HashingTF
import twitter4j.auth.OAuthAuthorization
import twitter4j.conf.ConfigurationBuilder

/**
 * Helpers for the Twitter sentiment example: command-line parsing of OAuth credentials,
 * construction of the twitter4j authorization and a shared `HashingTF`.
 *
 * NOTE(review): several bodies are incomplete in this listing (closing delimiters, the
 * result of `THE_OPTIONS`, the bodies of `featurize` and of the `try` in `IntParam`).
 */
object Utils {

  // Shared term-frequency hasher with 1000 features.
  val numFeatures = 1000
  val tf = new HashingTF(numFeatures)

  // Names of the command-line options carrying the OAuth credentials.
  val CONSUMER_KEY = "consumerKey"
  val CONSUMER_SECRET = "consumerSecret"
  val ACCESS_TOKEN = "accessToken"
  val ACCESS_TOKEN_SECRET = "accessTokenSecret"

  // Option definitions; each option takes an argument (second parameter is `true`).
  // NOTE(review): the block's result expression is not visible -- presumably `options`.
  val THE_OPTIONS = {
    val options = new Options()
    options.addOption(CONSUMER_KEY, true, "Twitter OAuth Consumer Key")
    options.addOption(CONSUMER_SECRET, true, "Twitter OAuth Consumer Secret")
    options.addOption(ACCESS_TOKEN, true, "Twitter OAuth Access Token")
    options.addOption(ACCESS_TOKEN_SECRET, true, "Twitter OAuth Access Token Secret")

  // Parses `args` and copies the four credentials into the twitter4j.oauth.* system
  // properties. A ParseException is reported on stderr.
  def parseCommandLineWithTwitterCredentials(args: Array[String]) = {
    val parser = new PosixParser
    try {
      val cl = parser.parse(THE_OPTIONS, args)
      System.setProperty("twitter4j.oauth.consumerKey", cl.getOptionValue(CONSUMER_KEY))
      System.setProperty("twitter4j.oauth.consumerSecret", cl.getOptionValue(CONSUMER_SECRET))
      System.setProperty("twitter4j.oauth.accessToken", cl.getOptionValue(ACCESS_TOKEN))
      System.setProperty("twitter4j.oauth.accessTokenSecret", cl.getOptionValue(ACCESS_TOKEN_SECRET))
    } catch {
      case e: ParseException =>
        System.err.println("Parsing failed.  Reason: " + e.getMessage)

  // Wraps an OAuthAuthorization built from a default ConfigurationBuilder in `Some`.
  def getAuth = {
    Some(new OAuthAuthorization(new ConfigurationBuilder().build()))

  // NOTE(review): body missing in this listing.
  def featurize(s: String): Vector = {

  // Extractor from String to Int; yields None on NumberFormatException.
  // NOTE(review): the body of the `try` is missing in this listing.
  object IntParam {
    def unapply(str: String): Option[Int] = {
      try {
      } catch {
        case e: NumberFormatException => None
Example 70
Source File: Tabulizer.scala    From Mastering-Machine-Learning-with-Spark-2.x   with MIT License 5 votes vote down vote up
package com.packtpub.mmlwspark.utils

import org.apache.spark.mllib.linalg.Vector

  def table(vector: Vector, cols: Int, format: String = "%.3f"): String =
    table(, cols, None)

  def table(list: Seq[Any], cols: Int, header: Option[Seq[String]]): String =
    table(tblize( ++ list).getOrElse(list), cols), header.isDefined)

  // Renders rows of cells as a text table with "|" between cells and a separator line
  // made of "-" runs joined by "+".
  // NOTE(review): several receivers are missing in this listing (the per-row mapping in
  // `colSizes`, the source of `rowSeparator`, the zip of each row with the column sizes)
  // as well as the final assembly after `if (header)` -- restore before compiling.
  def table(cells: Seq[Seq[Any]], header: Boolean): String = {
    // Per column, the widest rendered cell; a null cell counts as width 1.
    val colSizes = cells
      .map( => if (v != null) v.toString.length else 1))
      .reduce((v1, v2) => { case (v1, v2) => if (v1 > v2) v1 else v2 })
    val rowSeparator ="-" * _).mkString("+", "+", "+")
    // Right-aligns a cell to `size` characters; null is rendered as "-".
    def valueFormatter(v: Any, size: Int): String =
      ("%" + size + "s").format(if (v != null) v else "-")
    val rows = cells
      .map(row =>
        .map { case (v, size) => valueFormatter(v, size) }.mkString("|", "|", "|"))
    if (header)

  /**
   * Lays out a list of products (tuples, case classes) as table rows, placing `cols`
   * products side by side in every row.
   *
   * Each product is flattened into its fields; the row width is the arity of the first
   * product multiplied by `cols`.
   *
   * @param list       products to lay out; an empty list yields an empty table
   * @param horizontal currently ignored by the implementation
   * @param cols       number of products per row
   * @return the rows of the table
   */
  def tblize(list: Seq[Product], horizontal: Boolean, cols: Int): Seq[Seq[Any]] =
    // headOption instead of head: an empty input has no arity to read and simply
    // produces no rows rather than throwing NoSuchElementException.
    list.headOption.fold(Seq.empty[Seq[Any]]) { first =>
      tblize(list.flatMap(_.productIterator.toList), cols = first.productArity * cols)
    }

  /**
   * Splits a flat list of cells into table rows of `cols` cells each.
   *
   * A short final row is padded with `null` up to `cols` cells. The one exception is a
   * list that fits entirely into a single short row, which is returned unpadded.
   *
   * @param list cell values in row-major order
   * @param cols number of cells per row; must be positive
   * @return the rows in input order; empty when `list` is empty
   * @throws IllegalArgumentException if `cols` is not positive
   */
  def tblize(list: Seq[Any], cols: Int = 4): Seq[Seq[Any]] = {
    // Fail with a clear message instead of the division error the old row-count
    // computation raised for cols == 0.
    require(cols > 0, s"cols must be positive, got $cols")
    list.grouped(cols)
      .map(row => if (row.length == cols || row.length == list.length) row else row.padTo(cols, null))
      .toList
  }
Example 71
Source File: X2PHelper.scala    From spark-tsne   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib

import breeze.linalg._
import breeze.numerics._
import org.apache.spark.mllib.linalg.{Vector, Vectors}
import org.apache.spark.mllib.util.MLUtils

/**
 * Helper functions placed in the `org.apache.spark.mllib` package: a wrapper around
 * `MLUtils.fastSquaredDistance` and the `Hbeta` computation.
 *
 * NOTE(review): closing braces are missing from this listing.
 */
object X2PHelper {

  // A vector paired with a norm value, as passed to MLUtils.fastSquaredDistance below.
  case class VectorWithNorm(vector: Vector, norm: Double)

  // Delegates to MLUtils.fastSquaredDistance using the stored norms.
  def fastSquaredDistance(v1: VectorWithNorm, v2: VectorWithNorm): Double = {
    MLUtils.fastSquaredDistance(v1.vector, v1.norm, v2.vector, v2.norm)

  // For the values D and the scale beta, computes P = exp(-D * beta) and returns
  // (H, P / sum(P)) where H = log(sum(P)) + beta * sum(D :* P) / sum(P).
  // When sum(P) is zero the result is (0.0, all-zero vector) to avoid dividing by zero.
  def Hbeta(D: DenseVector[Double], beta: Double = 1.0) : (Double, DenseVector[Double]) = {
    val P: DenseVector[Double] = exp(- D * beta)
    val sumP = sum(P)
    if(sumP == 0) {
      (0.0, DenseVector.zeros(D.size))
    }else {
      val H = log(sumP) + (beta * sum(D :* P) / sumP)
      (H, P / sumP)
package com.github.saurfang.spark.tsne

import org.apache.spark.mllib.linalg.{Vectors, Vector}
import org.apache.spark.mllib.stat.{MultivariateStatisticalSummary, Statistics}
import org.apache.spark.sql.SparkSession
import org.scalatest.{BeforeAndAfterAll, FunSuite, Matchers}

/**
 * Checks `Statistics.colStats` (mean, variance, non-zero counts) on three small dense
 * vectors, using a local SparkSession created in `beforeAll`.
 *
 * NOTE(review): closing delimiters are missing from this listing, the body of `afterAll`
 * is empty here, and the collection passed to `parallelize` has lost its opening.
 */
class BugDemonstrationTest extends FunSuite with Matchers with BeforeAndAfterAll {
  // Assigned in beforeAll.
  private var sparkSession : SparkSession = _
  override def beforeAll(): Unit = {
    sparkSession = SparkSession.builder().appName("BugTests").master("local[2]").getOrCreate()

  // NOTE(review): body missing -- presumably the session is stopped here; confirm.
  override def afterAll(): Unit = {

  test("This demonstrates a bug was fixed in tsne-spark 2.1") {
    val sc = sparkSession.sparkContext

    // Three rows with columns (1, 2, 3), (10, 20, 30) and (100, 200, 300).
    val observations = sc.parallelize(
        Vectors.dense(1.0, 10.0, 100.0),
        Vectors.dense(2.0, 20.0, 200.0),
        Vectors.dense(3.0, 30.0, 300.0)

    // Compute column summary statistics.
    val summary: MultivariateStatisticalSummary = Statistics.colStats(observations)
    val expectedMean = Vectors.dense(2.0,20.0,200.0)
    val resultMean = summary.mean
    assertEqualEnough(resultMean, expectedMean)
    val expectedVariance = Vectors.dense(1.0,100.0,10000.0)
    assertEqualEnough(summary.variance, expectedVariance)
    val expectedNumNonZeros = Vectors.dense(3.0, 3.0, 3.0)
    assertEqualEnough(summary.numNonzeros, expectedNumNonZeros)

  // Element-wise comparison with an absolute tolerance of 1E-12. Iterates over the
  // indices of `expected`, so extra trailing elements of `sample` are not checked.
  private def assertEqualEnough(sample: Vector, expected: Vector): Unit = {
    expected.toArray.zipWithIndex.foreach{ case(d: Double, i: Int) =>
      sample(i) should be (d +- 1E-12)
import java.util.Random

import scala.language.implicitConversions

import org.apache.hadoop.fs.{FileSystem, Path}

import org.apache.spark.{SparkConf, SparkContext}
import{Oracle, VectorSpace}
import org.apache.spark.mllib.linalg.{BLAS, Vector, Vectors}
import org.apache.spark.mllib.random.RandomRDDs
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.{RDD, UnionRDD}

  private def gradient(data: RDD[Array[LabeledPoint]], dx: RDD[Vector]): RDD[Vector] = {
    data.cartesian(dx).map { case (points, x) =>
      val g = Vectors.zeros(x.size)
      points.foreach { case LabeledPoint(b, a) =>
        val err =, x) - b
        BLAS.axpy(err, a, g)

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("VLBFGS").setMaster("local[*]")
    val sc = new SparkContext(conf)
    val n = 1000
    val p = 100
    val random = new Random(0L)
    val xExact = Vectors.dense(Array.fill(p)(random.nextDouble()))
    val data = RandomRDDs.normalVectorRDD(sc, n, p, 4, 11L).mapPartitionsWithIndex { (idx, part) =>
      val random = new Random(100 + idx) { v =>
        val target =, xExact) + 0.1 * random.nextGaussian()
        LabeledPoint(target, v)

    val x = solve(data).first()

    println(s"x_exact = $xExact")
    println(s"x_vlbfgs = $x")

package org.apache.spark.sql.hive

import org.apache.spark.mllib.linalg.{BLAS, Vector, Vectors}
import org.apache.spark.sql.catalyst.expressions.Literal
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._
import org.apache.spark.sql.{Column, DataFrame, Row, UserDefinedFunction}

/**
 * Builds a UDF that turns a sequence of "index:value" strings into an MLlib `Vector`.
 *
 * NOTE(review): closing delimiters, the loops over the input sequence and the result of
 * the dense branch are missing from this listing.
 */
object HivemallUtils {

  // # of maximum dimensions for feature vectors
  val maxDims = 100000000

  // Wraps funcVectorizerImpl in a Spark SQL UDF. By default produces sparse vectors of
  // `maxDims` dimensions.
  def funcVectorizer(dense: Boolean = false, dims: Int = maxDims)
    : UserDefinedFunction = {
    udf(funcVectorizerImpl(dense, dims))

  // Returns the conversion function for the requested representation.
  private def funcVectorizerImpl(dense: Boolean, dims: Int)
    : Seq[String] => Vector = {
    if (dense) {
      // Dense features
      i: Seq[String] => {
        // Allocates a full array of `dims` doubles per input; each "index:value" entry
        // must split into exactly two parts (enforced by `ensuring`).
        val features = new Array[Double](dims) { ft =>
          val s = ft.split(":").ensuring(_.size == 2)
          features(s(0).toInt) = s(1).toDouble
    } else {
      // Sparse features
      i: Seq[String] => {
        // The two-part check is disabled on this path, so an entry without ":" fails on
        // s(1) instead.
        val features = { ft =>
          // val s = ft.split(":").ensuring(_.size == 2)
          val s = ft.split(":")
          (s(0).toInt, s(1).toDouble)
        Vectors.sparse(dims, features)
package se.uu.farmbio.cp.liblinear

import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.rdd.RDD
import se.uu.farmbio.cp.ICPClassifierModel
import org.apache.commons.lang.NotImplementedException
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.SparkContext

object AggregatedICPClassifier {

  def load(path: String, sc: SparkContext) = {
    val icps = sc.textFile(path)
      .map(ICPClassifierModel.deserialize(_, LibLinAlgDeserializer))
    new AggregatedICPClassifier(icps)


class AggregatedICPClassifier(
  private val icps: RDD[ICPClassifierModel[LibLinAlg]])
  extends ICPClassifierModel[LibLinAlg] {

  val cachedICPs = icps.cache

  override def mondrianPv(features: Vector) = {
      .flatMap { icp =>
      .collect //we expect to aggregate up to 100 ICPs
      .map {
        case (index, seq) =>
          val sortedSeq =
          val n = sortedSeq.length
          val median = if (n % 2 == 0) {
            (sortedSeq(n / 2 - 1) + sortedSeq(n / 2)) / 2
          } else {
            sortedSeq(n / 2)

  def save(path: String, coalesce: Int = 0) = {
    var serialICPs =
    if (coalesce > 0) {
      serialICPs = serialICPs.coalesce(coalesce)

package se.uu.farmbio.cp.liblinear

import org.apache.spark.mllib.classification.SVMModel
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import de.bwaldvogel.liblinear.Feature
import de.bwaldvogel.liblinear.FeatureNode
import de.bwaldvogel.liblinear.Linear
import de.bwaldvogel.liblinear.Parameter
import de.bwaldvogel.liblinear.Problem
import de.bwaldvogel.liblinear.SolverType
import se.uu.farmbio.cp.UnderlyingAlgorithm
import se.uu.farmbio.cp.Deserializer

object LibLinAlg {

  private def vectorToFeatures(v: Vector) = {
    val indices = v.toSparse.indices
    val values = v.toSparse.values
      .sortBy {
        case (i, v) => i
      .map {
        case (i, v) => new FeatureNode(i + 1, v)

  private def train(
    input: Array[LabeledPoint],
    solverType: SolverType,
    c: Double,
    tol: Double) = {

    //configure problem
    val problem = new Problem
    problem.l = input.length
    problem.n = input(0).features.size
    problem.x = { p =>
    problem.y = + 1.0)
    problem.bias = -1.0

    val parameter = new Parameter(solverType, c, tol)
    val libLinModel = Linear.train(problem, parameter)

    //convert to Spark SVMModel
    val weights = libLinModel.getFeatureWeights
    val intercept = libLinModel.getBias
    val svmModel = new SVMModel(Vectors.dense(weights).toSparse, intercept)



/**
 * Rebuilds a [[LibLinAlg]] from its textual form `"<intercept>,<weights>"`.
 *
 * Only the first comma separates the two parts, so the weights part may itself contain
 * commas; it is parsed with `Vectors.parse`.
 */
object LibLinAlgDeserializer extends Deserializer[LibLinAlg] {
  /**
   * @param alg serialized model, intercept first, then the weight vector
   * @return the algorithm wrapping the reconstructed `SVMModel`
   * @throws IllegalArgumentException if `alg` contains no comma separator
   */
  override def deserialize(alg: String): LibLinAlg = {
    // Limit 2: split on the first comma only.
    alg.split(",", 2) match {
      case Array(intercept, weights) =>
        val model = new SVMModel(Vectors.parse(weights).toSparse, intercept.toDouble)
        new LibLinAlg(model)
      case _ =>
        // Previously surfaced as a bare ArrayIndexOutOfBoundsException.
        throw new IllegalArgumentException(
          s"Malformed LibLinAlg serialization, expected '<intercept>,<weights>': $alg")
    }
  }
}

class LibLinAlg(
  val svmModel: SVMModel)
  extends UnderlyingAlgorithm(
    (features: Vector) => svmModel.predict(features)) {

  def this(
    training: Array[LabeledPoint],
    solverType: SolverType,
    regParam: Double,
    tol: Double) = {
    this(LibLinAlg.train(training, solverType, regParam, tol))

  override def nonConformityMeasure(newSample: LabeledPoint) = {
    val score = predictor(newSample.features)
    if (newSample.label == 1.0) {
    } else {

  override def toString = {
    this.svmModel.intercept + "," +

package se.uu.farmbio.cp.alg

import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.tree.GradientBoostedTrees
import org.apache.spark.mllib.tree.configuration.BoostingStrategy
import org.apache.spark.mllib.tree.loss.LogLoss
import org.apache.spark.rdd.RDD

import se.uu.farmbio.cp.UnderlyingAlgorithm

//Define a GBTs UnderlyingAlgorithm
/**
 * Training routine for the gradient-boosted-trees underlying algorithm.
 *
 * NOTE(review): closing delimiters, the receiver of the label remapping and the returned
 * predictor function are missing from this listing.
 */
private object GBT {
  // Trains boosted trees for `numIterations` iterations and is declared to return a
  // Vector => Double scoring function.
  def trainingProcedure(
      input: RDD[LabeledPoint], 
      numIterations: Int): (Vector => Double) = {
    // Starts from the default "Regression" parameters, then switches the loss to LogLoss.
    val boostingStrategy = BoostingStrategy.defaultParams("Regression")
    boostingStrategy.numIterations = numIterations
    boostingStrategy.treeStrategy.maxDepth = 5
    // Empty map: no feature is declared categorical.
    boostingStrategy.treeStrategy.categoricalFeaturesInfo = Map[Int, Int]()
    boostingStrategy.loss = LogLoss
    // Labels are remapped as label * 2 - 1 (0 becomes -1, 1 stays 1).
    // NOTE(review): the receiver of this mapping is missing in this listing.
    val remappedInput = => new LabeledPoint((x.label * 2) - 1, x.features))
    val model = new GradientBoostedTrees(boostingStrategy)
      .run(input = remappedInput)

class GBT(
  private val input: RDD[LabeledPoint],
  private val numIterations: Int)
  extends UnderlyingAlgorithm(
      GBT.trainingProcedure(input,numIterations)) {
  override def nonConformityMeasure(newSample: LabeledPoint) = {
    val score = predictor(newSample.features)
    if (newSample.label == 1.0) {
    } else {
Example 78
import org.apache.spark.mllib.classification.LogisticRegressionModel
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.optimization.LBFGS
import org.apache.spark.mllib.optimization.LogisticGradient
import org.apache.spark.mllib.optimization.SquaredL2Updater
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.rdd.RDD

import se.uu.farmbio.cp.UnderlyingAlgorithm

//Define a LogisticRegression UnderlyingAlgorithm
/**
 * Training routine for the logistic-regression underlying algorithm, fitted with LBFGS.
 *
 * NOTE(review): closing delimiters, the receiver of the training-set mapping, most LBFGS
 * arguments and the returned predictor are missing from this listing.
 */
private object LogisticRegression {
  // Declared to return a Vector => Double scoring function.
  def trainingProcedure(
    input: RDD[LabeledPoint],
    maxNumItearations: Int,
    regParam: Double,
    numCorrections: Int,
    convergenceTol: Double): (Vector => Double) = {

    //Train Logistic Regression with LBFGS
    // Feature count is read from the first record; fails on an empty input RDD.
    val numFeatures = input.take(1)(0).features.size
    // Each record becomes (label, features with a bias term appended); cached.
    // NOTE(review): the receiver of this mapping is missing in this listing.
    val training = => (x.label, MLUtils.appendBias(x.features))).cache()
    // All-zero starting point with one extra slot for the intercept.
    val initialWeightsWithIntercept = Vectors.dense(new Array[Double](numFeatures + 1))
    val (weightsWithIntercept, _) = LBFGS.runLBFGS(
      new LogisticGradient(),
      new SquaredL2Updater(),

    //Create the model using the weights
    // The last entry of the fitted vector is the intercept; the rest are the weights.
    val model = new LogisticRegressionModel(
      Vectors.dense(weightsWithIntercept.toArray.slice(0, weightsWithIntercept.size - 1)),
      weightsWithIntercept(weightsWithIntercept.size - 1))

    //Return raw score predictor


class LogisticRegression(
  private val input: RDD[LabeledPoint],
  private val maxNumItearations: Int = 100,
  private val regParam: Double = 0.1,
  private val numCorrections: Int = 10,
  private val convergenceTol: Double = 1e-4)
  extends UnderlyingAlgorithm(
      convergenceTol)) {
  override def nonConformityMeasure(newSample: LabeledPoint) = {
    val score = predictor(newSample.features)
    if (newSample.label == 1.0) {
    } else {
Example 79
import breeze.linalg.{DenseMatrix => BDM, DenseVector => BDV}
import org.apache.spark.mllib.feature.{PCA => SPCA}
import org.apache.spark.mllib.linalg.{Vector, Vectors}
import org.apache.spark.rdd.RDD
import org.dizhang.seqspark.ds.{DenseCounter, Genotype, SparseCounter}
import org.dizhang.seqspark.util.General._
import org.dizhang.seqspark.worker.Data
import org.slf4j.LoggerFactory

  def pc(n: Int): BDM[Double] = {
    val model = new SPCA(n)
    val data = this.prepare
    if (data.isEmpty()) {
      new BDM[Double](0, 0)
    } else {
      val res =
      new BDM(res.length/n, n, res)
Example 80
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.sql._
import org.apache.spark.sql.types.{StringType, StructField, StructType}

object Util {
  val PATH = "/home/ubuntu/work/spark-2.0.0-bin-hadoop2.7/"
  val DATA_PATH= "../../../data/ml-100k"
  val PATH_MOVIES = DATA_PATH + "/u.item"

  /**
   * Collapses a vector into two numbers by summing its first and second halves.
   *
   * The split point is `size / 2` (integer division), so for an odd-sized vector the
   * second half holds the extra element. The second half starts exactly at the split
   * point so that every element contributes to exactly one of the two sums.
   *
   * @param x vector to reduce
   * @return the two sums formatted as `"<firstHalfSum>,<secondHalfSum>"`
   */
  def reduceDimension2(x: Vector) : String= {
    val values = x.toArray
    val (firstHalf, secondHalf) = values.splitAt(values.length / 2)
    s"${firstHalf.sum},${secondHalf.sum}"
  }

  def getMovieDataDF(spark : SparkSession) : DataFrame = {

    //1|Toy Story (1995)|01-Jan-1995||
    // |0|0|0|1|1|1|0|0|0|0|0|0|0|0|0|0|0|0|0
    val customSchema = StructType(Array(
      StructField("id", StringType, true),
      StructField("name", StringType, true),
      StructField("date", StringType, true),
      StructField("url", StringType, true)));
    val movieDf ="com.databricks.spark.csv")
      .option("delimiter", "|").schema(customSchema)
    return movieDf

Example 81
import scala.collection.mutable
import org.apache.spark.mllib.clustering.LDA
import org.apache.spark.mllib.linalg.{Vector, Vectors}
import org.apache.spark.rdd.RDD
import org.apache.spark.SparkContext

object LDATextExample {
  val PATH = "/home/ubuntu/work/spark-src/spark/"
  val sc = new SparkContext("local[2]", "First Spark App")
  def main(args: Array[String]): Unit = {
    // Load documents from text files, 1 document per file
    val corpus: RDD[String] = sc.wholeTextFiles(PATH + "docs/*.md").map(_._2)
    // Split each document into a sequence of terms (words)
    val tokenized: RDD[Seq[String]] ="\\s")).map(_.filter(_.length > 3).
    // Choose the vocabulary.
    // termCounts: Sorted list of (term, termCount) pairs
    val termCounts: Array[(String, Long)] =
      tokenized.flatMap( -> 1L)).reduceByKey(_ + _).collect().sortBy(-_._2)
    // vocabArray: Chosen vocab (removing common terms)
    val numStopwords = 20
    val vocabArray: Array[String] =
      termCounts.takeRight(termCounts.size - numStopwords).map(_._1)
    // vocab: Map term -> term index
    val vocab: Map[String, Int] = vocabArray.zipWithIndex.toMap
    // Convert documents into term count vectors
    val documents: RDD[(Long, Vector)] = { case (tokens, id) =>
        val counts = new mutable.HashMap[Int, Double]()
        tokens.foreach { term =>
          if (vocab.contains(term)) {
            val idx = vocab(term)
            counts(idx) = counts.getOrElse(idx, 0.0) + 1.0
        (id, Vectors.sparse(vocab.size, counts.toSeq))
    // Set LDA parameters
    val numTopics = 10
    val lda = new LDA().setK(numTopics).setMaxIterations(10)
    val ldaModel =
    //val avgLogLikelihood = ldaModel. / documents.count()
    // Print topics, showing top-weighted 10 terms for each topic.
    val topicIndices = ldaModel.describeTopics(maxTermsPerTopic = 10)
    topicIndices.foreach { case (terms, termWeights) =>
      println("TOPIC:") { case (term, weight) =>
Example 82
import org.apache.spark.mllib.linalg.distributed.RowMatrix
import org.apache.spark.mllib.linalg.{Matrix, SingularValueDecomposition, Vector, Vectors}
object SparkSVDExampleOne {

  def main(args: Array[String]) {
    val denseData = Seq(
      Vectors.dense(0.0, 1.0, 2.0, 1.0, 5.0, 3.3, 2.1),
      Vectors.dense(3.0, 4.0, 5.0, 3.1, 4.5, 5.1, 3.3),
      Vectors.dense(6.0, 7.0, 8.0, 2.1, 6.0, 6.7, 6.8),
      Vectors.dense(9.0, 0.0, 1.0, 3.4, 4.3, 1.0, 1.0)
    val spConfig = (new SparkConf).setMaster("local").setAppName("SparkSVDDemo")
    val sc = new SparkContext(spConfig)
    val mat: RowMatrix = new RowMatrix(sc.parallelize(denseData, 2))

    // Compute the top 7 singular values and corresponding singular vectors.
    val svd: SingularValueDecomposition[RowMatrix, Matrix] = mat.computeSVD(7, computeU = true)
    val U: RowMatrix = svd.U // The U factor is a RowMatrix.
    val s: Vector = svd.s // The singular values are stored in a local dense vector.
    val V: Matrix = svd.V // The V factor is a local dense matrix.
    println("U:" + U)
    println("s:" + s)
    println("V:" + V)
Example 83
import org.apache.spark.mllib.linalg.Vector

/**
 * Serializable contract for computing a weight update from the previous weights and a gradient.
 *
 * NOTE(review): the parameter names (alpha, beta, l1, l2, n, z) match the usual
 * FTRL-Proximal notation, so this is presumably a per-coordinate FTRL-style update —
 * confirm against the concrete implementations.
 */
abstract class PerCoordinateUpdater extends Serializable {
  /**
   * Computes one update step.
   *
   * @param weightsOld weights before the update
   * @param gradient gradient for the current step
   * @param alpha scalar hyper-parameter (presumably the learning-rate scale — TODO confirm)
   * @param beta scalar hyper-parameter (presumably the learning-rate smoothing term — TODO confirm)
   * @param l1 scalar hyper-parameter (presumably the L1 regularization strength — TODO confirm)
   * @param l2 scalar hyper-parameter (presumably the L2 regularization strength — TODO confirm)
   * @param n per-coordinate state vector (meaning not visible here — verify against implementations)
   * @param z per-coordinate state vector (meaning not visible here — verify against implementations)
   * @return a tuple of (Vector, Double, Vector, Vector); the role of each element is defined
   *         by the implementing class
   */
  def compute(
      weightsOld: Vector,
      gradient: Vector,
      alpha: Double,
      beta: Double,
      l1: Double,
      l2: Double,
      n: Vector,
      z: Vector): (Vector, Double, Vector, Vector)
Example 84
package org.apache.spark.examples.mllib

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
// $example on$
import org.apache.spark.mllib.linalg.Matrix
import org.apache.spark.mllib.linalg.SingularValueDecomposition
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.linalg.distributed.RowMatrix
// $example off$

object SVDExample {

  def main(args: Array[String]): Unit = {

    val conf = new SparkConf().setAppName("SVDExample")
    val sc = new SparkContext(conf)

    // $example on$
    val data = Array(
      Vectors.sparse(5, Seq((1, 1.0), (3, 7.0))),
      Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0),
      Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0))

    val dataRDD = sc.parallelize(data, 2)

    val mat: RowMatrix = new RowMatrix(dataRDD)

    // Compute the top 5 singular values and corresponding singular vectors.
    val svd: SingularValueDecomposition[RowMatrix, Matrix] = mat.computeSVD(5, computeU = true)
    val U: RowMatrix = svd.U  // The U factor is a RowMatrix.
    val s: Vector = svd.s  // The singular values are stored in a local dense vector.
    val V: Matrix = svd.V  // The V factor is a local dense matrix.
    // $example off$
    val collect = U.rows.collect()
    println("U factor is:")
    collect.foreach { vector => println(vector) }
    println(s"Singular values are: $s")
    println(s"V factor is:\n$V")
// scalastyle:on println 
// scalastyle:off println
import org.apache.spark.{SparkConf, SparkContext}
// $example on$
import org.apache.spark.mllib.clustering.BisectingKMeans
import org.apache.spark.mllib.linalg.{Vector, Vectors}
// $example off$

/**
 * Example application that clusters the points in `data/mllib/kmeans_data.txt`
 * into 6 clusters with bisecting k-means and prints the cost and the cluster centers.
 */
object BisectingKMeansExample {

  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf().setAppName("mllib.BisectingKMeansExample")
    val sc = new SparkContext(sparkConf)

    // $example on$
    // Loads and parses data
    def parse(line: String): Vector = Vectors.dense(line.split(" ").map(_.toDouble))
    val data = sc.textFile("data/mllib/kmeans_data.txt").map(parse).cache()

    // Clustering the data into 6 clusters by BisectingKMeans.
    val bkm = new BisectingKMeans().setK(6)
    val model = bkm.run(data)

    // Show the compute cost and the cluster centers
    println(s"Compute Cost: ${model.computeCost(data)}")
    model.clusterCenters.zipWithIndex.foreach { case (center, idx) =>
      println(s"Cluster Center ${idx}: ${center}")
    }
    // $example off$

    // Release the context once the example has finished.
    sc.stop()
  }
}

package org.apache.spark.mllib.feature

import org.apache.spark.annotation.Since
import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors}

  /**
   * Scales a vector by the inverse of its p-norm.
   *
   * @param vector vector to normalize
   * @return a new vector whose values are divided by the p-norm of the input, or the
   *         input instance itself when that norm is zero
   * @throws IllegalArgumentException if the vector is neither dense nor sparse
   */
  override def transform(vector: Vector): Vector = {
    val norm = Vectors.norm(vector, p)

    if (norm != 0.0) {
      // For dense vector, we've to allocate new memory for new output vector.
      // However, for sparse vector, the `index` array will not be changed,
      // so we can re-use it to save memory.
      vector match {
        case DenseVector(vs) =>
          val values = vs.clone()
          val size = values.length
          var i = 0
          while (i < size) {
            values(i) /= norm
            i += 1
          }
          Vectors.dense(values)
        case SparseVector(size, ids, vs) =>
          val values = vs.clone()
          val nnz = values.length
          var i = 0
          while (i < nnz) {
            values(i) /= norm
            i += 1
          }
          Vectors.sparse(size, ids, values)
        case v => throw new IllegalArgumentException("Do not support vector type " + v.getClass)
      }
    } else {
      // Since the norm is zero, return the input vector object itself.
      // Note that it's safe since we always assume that the data in RDD
      // should be immutable.
      vector
    }
  }

Example 87
import scala.collection.JavaConverters

import org.apache.spark.SparkContext
import org.apache.spark.mllib.clustering.GaussianMixtureModel
import org.apache.spark.mllib.linalg.{Vector, Vectors}

  val gaussians: Array[Byte] = {
    val modelGaussians = { gaussian =>
      Array[Any](, gaussian.sigma)

  def predictSoft(point: Vector): Vector = {

  def save(sc: SparkContext, path: String): Unit =, path)
Example 88
Source File: Word2VecModelWrapper.scala    From sparkoscope   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.api.python

import java.util.{List => JList, Map => JMap}

import scala.collection.JavaConverters._

import org.apache.spark.SparkContext
import org.apache.spark.mllib.feature.Word2VecModel
import org.apache.spark.mllib.linalg.{Vector, Vectors}

  def findSynonyms(vector: Vector, num: Int): JList[Object] = {
    prepareResult(model.findSynonyms(vector, num))

  private def prepareResult(result: Array[(String, Double)]) = {
    val similarity = Vectors.dense(
    val words =
    List(words, similarity).map(_.asInstanceOf[Object]).asJava

  def getVectors: JMap[String, JList[Float]] = { { case (k, v) =>
      (k, v.toList.asJava)

  def save(sc: SparkContext, path: String): Unit =, path)
Example 89
import breeze.linalg.{DenseMatrix => BDM}

import org.apache.spark.internal.Logging
import org.apache.spark.mllib.linalg.{Matrices, Matrix, Vector}
import org.apache.spark.mllib.linalg.distributed.RowMatrix
import org.apache.spark.rdd.RDD

  /**
   * Converts a covariance matrix into a Pearson correlation matrix.
   *
   * The Breeze view of `covarianceMatrix` is updated in place: the diagonal is first
   * replaced by standard deviations, every off-diagonal entry is then divided by the
   * two matching standard deviations, and finally the diagonal is set to 1.0.
   * An entry becomes NaN when either of its standard deviations is zero; a warning
   * is logged if that happened at least once.
   *
   * @param covarianceMatrix square covariance matrix backed by a Breeze dense matrix
   * @return the correlation matrix
   */
  def computeCorrelationMatrixFromCovariance(covarianceMatrix: Matrix): Matrix = {
    val cov = covarianceMatrix.asBreeze.asInstanceOf[BDM[Double]]
    val n = cov.cols

    // Compute the standard deviation on the diagonals first
    var i = 0
    while (i < n) {
      // TODO remove once covariance numerical issue resolved.
      cov(i, i) = if (closeToZero(cov(i, i))) 0.0 else math.sqrt(cov(i, i))
      i += 1
    }

    // Loop through columns since cov is column major
    var j = 0
    var sigma = 0.0
    var containNaN = false
    while (j < n) {
      sigma = cov(j, j)
      i = 0
      while (i < j) {
        val corr = if (sigma == 0.0 || cov(i, i) == 0.0) {
          // Correlation is undefined when either variable has zero variance.
          containNaN = true
          Double.NaN
        } else {
          cov(i, j) / (sigma * cov(i, i))
        }
        // Fill both triangles so the result stays symmetric.
        cov(i, j) = corr
        cov(j, i) = corr
        i += 1
      }
      j += 1
    }

    // put 1.0 on the diagonals
    i = 0
    while (i < n) {
      cov(i, i) = 1.0
      i += 1
    }

    if (containNaN) {
      logWarning("Pearson correlation matrix contains NaN values.")
    }

    Matrices.fromBreeze(cov)
  }


  /**
   * Tells whether a value lies within `threshold` of zero (inclusive).
   *
   * @param value number to test
   * @param threshold largest magnitude still treated as zero
   * @return true when the magnitude of `value` does not exceed `threshold`
   */
  private def closeToZero(value: Double, threshold: Double = 1e-12): Boolean = {
    val magnitude = value.abs
    threshold >= magnitude
  }
Example 90
import scala.collection.mutable.ArrayBuffer

import org.apache.spark.internal.Logging
import org.apache.spark.mllib.linalg.{Matrix, Vector, Vectors}
import org.apache.spark.rdd.RDD

  override def computeCorrelationMatrix(X: RDD[Vector]): Matrix = {
    // ((columnIndex, value), rowUid)
    val colBased = X.zipWithUniqueId().flatMap { case (vec, uid) => { case (v, j) =>
        ((j, v), uid)
    // global sort by (columnIndex, value)
    val sorted = colBased.sortByKey()
    // assign global ranks (using average ranks for tied values)
    val globalRanks = sorted.zipWithIndex().mapPartitions { iter =>
      var preCol = -1
      var preVal = Double.NaN
      var startRank = -1.0
      var cachedUids = ArrayBuffer.empty[Long]
      val flush: () => Iterable[(Long, (Int, Double))] = () => {
        val averageRank = startRank + (cachedUids.size - 1) / 2.0
        val output = { uid =>
          (uid, (preCol, averageRank))
      iter.flatMap { case (((j, v), uid), rank) =>
        // If we see a new value or cachedUids is too big, we flush ids with their average rank.
        if (j != preCol || v != preVal || cachedUids.size >= 10000000) {
          val output = flush()
          preCol = j
          preVal = v
          startRank = rank
          cachedUids += uid
        } else {
          cachedUids += uid
      } ++ flush()
    // Replace values in the input matrix by their ranks compared with values in the same column.
    // Note that shifting all ranks in a column by a constant value doesn't affect result.
    val groupedRanks = globalRanks.groupByKey().map { case (uid, iter) =>
      // sort by column index and then convert values to a vector
Example 91
import org.json4s.JsonDSL._
import org.json4s.jackson.JsonMethods._

import org.apache.spark.SparkContext
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.mllib.util.Loader
import org.apache.spark.sql.{Row, SparkSession}

    /**
     * Loads the weights, intercept and optional threshold of a saved model.
     *
     * Reads the Parquet data stored under the data path derived from `path` and
     * expects exactly one row with the columns "weights", "intercept" and "threshold".
     *
     * @param sc Spark context used to obtain the session
     * @param path base path the model was saved to
     * @param modelClass model class name, used only in error messages
     * @return the loaded data; the threshold is `None` when the stored value is null
     */
    def loadData(sc: SparkContext, path: String, modelClass: String): Data = {
      val dataPath = Loader.dataPath(path)
      val spark = SparkSession.builder().sparkContext(sc).getOrCreate()
      val dataRDD = spark.read.parquet(dataPath)
      val dataArray = dataRDD.select("weights", "intercept", "threshold").take(1)
      assert(dataArray.length == 1, s"Unable to load $modelClass data from: $dataPath")
      val data = dataArray(0)
      assert(data.size == 3, s"Unable to load $modelClass data from: $dataPath")
      val (weights, intercept) = data match {
        case Row(weights: Vector, intercept: Double, _) =>
          (weights, intercept)
      }
      // The threshold column is nullable, so it is read separately from the pattern match.
      val threshold = if (data.isNullAt(2)) {
        None
      } else {
        Some(data.getDouble(2))
      }
      Data(weights, intercept, threshold)
    }
Example 92
import scala.beans.BeanInfo

import org.apache.spark.annotation.Since
import{LabeledPoint => NewLabeledPoint}
import org.apache.spark.mllib.linalg.{Vector, Vectors}
import org.apache.spark.mllib.util.NumericParser
import org.apache.spark.SparkException

  /**
   * Parses the textual form of a labeled point.
   *
   * Input starting with "(" is handed to `NumericParser` and must yield a label followed
   * by a numeric structure for the features. Any other input is read as the older dense
   * layout: a label, a comma, then space-separated feature values.
   *
   * @param s string to parse
   * @return the parsed labeled point
   * @throws SparkException if the parenthesized form does not have the expected shape
   */
  def parse(s: String): LabeledPoint = {
    if (!s.startsWith("(")) {
      // dense format used before v1.0
      val fields = s.split(',')
      val label = java.lang.Double.parseDouble(fields(0))
      val featureValues = fields(1).trim().split(' ').map(java.lang.Double.parseDouble)
      LabeledPoint(label, Vectors.dense(featureValues))
    } else {
      NumericParser.parse(s) match {
        case Seq(label: Double, numeric: Any) =>
          LabeledPoint(label, Vectors.parseNumeric(numeric))
        case other =>
          throw new SparkException(s"Cannot parse $other.")
      }
    }
  }

  /** Builds a labeled point from its new-API counterpart, converting the feature vector. */
  private[spark] def fromML(point: NewLabeledPoint): LabeledPoint = {
    val convertedFeatures = Vectors.fromML(point.features)
    LabeledPoint(point.label, convertedFeatures)
  }
Example 93
Source File: GLMRegressionModel.scala    From sparkoscope   with Apache License 2.0 5 votes vote down vote up
import org.json4s.jackson.JsonMethods._

import org.apache.spark.SparkContext
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.mllib.util.Loader
import org.apache.spark.sql.{Row, SparkSession}

    /**
     * Loads the weights and intercept of a saved model and checks the feature count.
     *
     * Reads the Parquet data stored under the data path derived from `path` and
     * expects exactly one row with the columns "weights" and "intercept".
     *
     * @param sc Spark context used to obtain the session
     * @param path base path the model was saved to
     * @param modelClass model class name, used only in error messages
     * @param numFeatures number of features the loaded weights must have
     * @return the loaded weights and intercept
     */
    def loadData(sc: SparkContext, path: String, modelClass: String, numFeatures: Int): Data = {
      val dataPath = Loader.dataPath(path)
      val spark = SparkSession.builder().sparkContext(sc).getOrCreate()
      val dataRDD = spark.read.parquet(dataPath)
      val dataArray = dataRDD.select("weights", "intercept").take(1)
      assert(dataArray.length == 1, s"Unable to load $modelClass data from: $dataPath")
      val data = dataArray(0)
      assert(data.size == 2, s"Unable to load $modelClass data from: $dataPath")
      data match {
        case Row(weights: Vector, intercept: Double) =>
          assert(weights.size == numFeatures, s"Expected $numFeatures features, but" +
            s" found ${weights.size} features when loading $modelClass weights from $dataPath")
          Data(weights, intercept)
      }
    }

Example 94
Source File: IDFSuite.scala    From sparkoscope   with Apache License 2.0 5 votes vote down vote up
import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors}
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.mllib.util.TestingUtils._

class IDFSuite extends SparkFunSuite with MLlibTestSparkContext {

  test("idf") {
    val n = 4
    val localTermFrequencies = Seq(
      Vectors.sparse(n, Array(1, 3), Array(1.0, 2.0)),
      Vectors.dense(0.0, 1.0, 2.0, 3.0),
      Vectors.sparse(n, Array(1), Array(1.0))
    val m = localTermFrequencies.size
    val termFrequencies = sc.parallelize(localTermFrequencies, 2)
    val idf = new IDF
    val model =
    val expected = Vectors.dense(Array(0, 3, 1, 2).map { x =>
      math.log((m + 1.0) / (x + 1.0))
    assert(model.idf ~== expected absTol 1e-12)

    val assertHelper = (tfidf: Array[Vector]) => {
      assert(tfidf.size === 3)
      val tfidf0 = tfidf(0).asInstanceOf[SparseVector]
      assert(tfidf0.indices === Array(1, 3))
      assert(Vectors.dense(tfidf0.values) ~==
          Vectors.dense(1.0 * expected(1), 2.0 * expected(3)) absTol 1e-12)
      val tfidf1 = tfidf(1).asInstanceOf[DenseVector]
      assert(Vectors.dense(tfidf1.values) ~==
          Vectors.dense(0.0, 1.0 * expected(1), 2.0 * expected(2), 3.0 * expected(3)) absTol 1e-12)
      val tfidf2 = tfidf(2).asInstanceOf[SparseVector]
      assert(tfidf2.indices === Array(1))
      assert(tfidf2.values(0) ~== (1.0 * expected(1)) absTol 1e-12)
    // Transforms a RDD
    val tfidf = model.transform(termFrequencies).collect()
    // Transforms local vectors
    val localTfidf =

  test("idf minimum document frequency filtering") {
    val n = 4
    val localTermFrequencies = Seq(
      Vectors.sparse(n, Array(1, 3), Array(1.0, 2.0)),
      Vectors.dense(0.0, 1.0, 2.0, 3.0),
      Vectors.sparse(n, Array(1), Array(1.0))
    val m = localTermFrequencies.size
    val termFrequencies = sc.parallelize(localTermFrequencies, 2)
    val idf = new IDF(minDocFreq = 1)
    val model =
    val expected = Vectors.dense(Array(0, 3, 1, 2).map { x =>
      if (x > 0) {
        math.log((m + 1.0) / (x + 1.0))
      } else {
    assert(model.idf ~== expected absTol 1e-12)

    val assertHelper = (tfidf: Array[Vector]) => {
      assert(tfidf.size === 3)
      val tfidf0 = tfidf(0).asInstanceOf[SparseVector]
      assert(tfidf0.indices === Array(1, 3))
      assert(Vectors.dense(tfidf0.values) ~==
          Vectors.dense(1.0 * expected(1), 2.0 * expected(3)) absTol 1e-12)
      val tfidf1 = tfidf(1).asInstanceOf[DenseVector]
      assert(Vectors.dense(tfidf1.values) ~==
          Vectors.dense(0.0, 1.0 * expected(1), 2.0 * expected(2), 3.0 * expected(3)) absTol 1e-12)
      val tfidf2 = tfidf(2).asInstanceOf[SparseVector]
      assert(tfidf2.indices === Array(1))
      assert(tfidf2.values(0) ~== (1.0 * expected(1)) absTol 1e-12)
    // Transforms a RDD
    val tfidf = model.transform(termFrequencies).collect()
    // Transforms local vectors
    val localTfidf =

Example 95
import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.linalg.{Vector, Vectors}
import org.apache.spark.mllib.linalg.distributed.RowMatrix
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.mllib.util.TestingUtils._

class PCASuite extends SparkFunSuite with MLlibTestSparkContext {

  private val data = Array(
    Vectors.sparse(5, Seq((1, 1.0), (3, 7.0))),
    Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0),
    Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0)

  private lazy val dataRDD = sc.parallelize(data, 2)

  test("Correct computing use a PCA wrapper") {
    val k = dataRDD.count().toInt
    val pca = new PCA(k).fit(dataRDD)

    val mat = new RowMatrix(dataRDD)
    val (pc, explainedVariance) = mat.computePrincipalComponentsAndExplainedVariance(k)

    val pca_transform = pca.transform(dataRDD).collect()
    val mat_multiply = mat.multiply(pc).rows.collect() { case (calculated, expected) =>
      assert(calculated ~== expected relTol 1e-8)
    assert(pca.explainedVariance ~== explainedVariance relTol 1e-8)
Example 96
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.mllib.stat.{MultivariateOnlineSummarizer, MultivariateStatisticalSummary}

class TimeSeriesSmallModelRegressionMetrics(
  idPredictionsAndObservations: Array[(Double, Double)]
) {

  private lazy val summary: MultivariateStatisticalSummary = {
      case (observation, prediction) =>
        Vectors.dense(observation, observation - prediction)
    }.aggregate(new MultivariateOnlineSummarizer())(
      (summary, current) => summary.add(org.apache.spark.mllib.linalg.Vectors.fromML(current)),
      (sum1, sum2) => sum1.merge(sum2)

  private lazy val SSerr = math.pow(summary.normL2(1), 2)
  private lazy val SStot = summary.variance(0) * (summary.count - 1)
  private lazy val SSreg = {
    val yMean = summary.mean(0) {
      case (prediction, observation) => math.pow(prediction - yMean, 2)

  def explainedVariance = SSreg / summary.count

  def meanAbsoluteError = summary.normL1(1) / summary.count

  def meanSquaredError = SSerr / summary.count

  def rootMeanSquaredPercentageError = math.sqrt( {
    case (observation, prediction) => if (observation == 0) {
    } else {
      Math.pow((observation - prediction) / observation, 2)
  }.sum / summary.count)

  def rootMeanSquaredError = math.sqrt(meanSquaredError)

  def r2 = 1 - (SSerr / SStot)

Example 97
import org.apache.spark.mllib.linalg.Vector

class UberHoltWintersModel(override val period: Int,
                           override val alpha: Double,
                           override val beta: Double,
                           override val gamma: Double, override val modelType: String = "additive")
    extends HoltWintersModel(modelType,period, alpha, beta, gamma) {
  lazy val params = Map(
    "HoltWintersAlpha" -> alpha.toString,
    "HoltWintersBeta" -> beta.toString,
    "HoltWintersGamma" -> gamma.toString

object UberHoltWintersModel {
  def fitModelWithBOBYQA(ts: Vector,
                         m: Int, modelType: String = "additive"): UberHoltWintersModel = {
    val model = HoltWinters.fitModelWithBOBYQA(ts, m, modelType)
    new UberHoltWintersModel(
Example 98
Source File: UberArimaModel.scala    From uberdata   with Apache License 2.0 5 votes vote down vote up
/**
 * ARIMA model that additionally exposes its orders as a string map.
 *
 * @param p autoregressive order
 * @param d differencing order
 * @param q moving-average order
 * @param coefficients fitted model coefficients
 * @param hasIntercept whether the coefficients include an intercept term
 */
class UberArimaModel(override val p: scala.Int,
                     override val d: scala.Int,
                     override val q: scala.Int,
                     override val coefficients: scala.Array[scala.Double],
                     override val hasIntercept: scala.Boolean = true)
    // Forward the orders as (p, d, q) so the superclass receives them in its declared order.
    extends ARIMAModel(p, d, q, coefficients, hasIntercept) {
  /** The three model orders rendered as strings, keyed by "ArimaP", "ArimaD" and "ArimaQ". */
  lazy val params =
    Map("ArimaP" -> p.toString, "ArimaD" -> d.toString, "ArimaQ" -> q.toString)
}


object UberArimaModel {
  /**
   * Fits an ARIMA model through `ARIMA.fitModel` and wraps the result.
   *
   * All arguments are forwarded unchanged to `ARIMA.fitModel`; the returned wrapper
   * keeps the requested orders and takes coefficients and intercept flag from the fit.
   *
   * @return the fitted model as an [[UberArimaModel]]
   */
  def fitModel(p: Int,
               d: Int,
               q: Int,
               ts: Vector,
               includeIntercept: Boolean = true,
               method: String = "css-cgd",
               userInitParams: Array[Double] = null): UberArimaModel = {
    val fitted = ARIMA.fitModel(p, d, q, ts, includeIntercept, method, userInitParams)
    val fittedCoefficients = fitted.coefficients
    new UberArimaModel(p, d, q, fittedCoefficients, fitted.hasIntercept)
  }
}
Example 99
Source File: RandomProjection.scala    From spark-neighbors   with MIT License 5 votes vote down vote up
import breeze.stats.distributions.CauchyDistribution
import org.apache.spark.mllib.linalg.{ DenseMatrix, Matrices }
import org.apache.spark.mllib.linalg.{ DenseVector, Vector }

  /**
   * Creates a projection backed by a `projectedDim` x `originalDim` matrix of
   * standard-normal draws taken from `random`.
   */
  def generateGaussian(originalDim: Int, projectedDim: Int, random: Random): RandomProjection = {
    new RandomProjection(DenseMatrix.randn(projectedDim, originalDim, random))
  }

  /**
   * Creates a projection backed by a `projectedDim` x `originalDim` matrix of draws
   * from a Cauchy distribution with location 0 and scale 1.
   *
   * NOTE(review): `random` is not used by the body, so the draws do not depend on the
   * supplied generator — confirm whether seeding the distribution from it was intended.
   *
   * @throws IllegalArgumentException if the matrix would hold more than `Int.MaxValue` entries
   */
  def generateCauchy(originalDim: Int, projectedDim: Int, random: Random): RandomProjection = {
    // Dense matrix of Cauchy(0, 1) samples, guarded against Int overflow of the entry count.
    def randc(numRows: Int, numCols: Int): DenseMatrix = {
      require(
        numRows.toLong * numCols <= Int.MaxValue,
        s"$numRows x $numCols dense matrix is too large to allocate"
      )
      val cauchyDistribution = new CauchyDistribution(0, 1)
      new DenseMatrix(numRows, numCols, cauchyDistribution.drawMany(numRows * numCols))
    }

    val localMatrix = randc(projectedDim, originalDim)
    new RandomProjection(localMatrix)
  }
Example 100
Source File: ProbabilisticClassifierConfig.scala    From pu4spark   with Apache License 2.0 5 votes vote down vote up
sealed trait ProbabilisticClassifierConfig

case class LogisticRegressionConfig(maxIter: Int = 100,
                                    regParam: Double = 1.0e-8,
                                    elasticNetParam: Double = 0.0)
  extends ProbabilisticClassifierConfig {
  def build(): ProbabilisticClassifier[Vector, LogisticRegression, LogisticRegressionModel] = {
    new LogisticRegression()

case class RandomForestConfig(numTrees: Int = 512)
  extends ProbabilisticClassifierConfig {
  def build(): ProbabilisticClassifier[Vector, RandomForestClassifier, RandomForestClassificationModel] = {
    new RandomForestClassifier()

object ProbabilisticClassifierConfig {
  val labelName = "label"
  val featuresName = "indexedFeatures"
  val subclasses = List(classOf[LogisticRegressionConfig], classOf[RandomForestConfig])
Example 101
Source File: Test_example_CNN.scala    From SparkMLlibDeepLearn   with Apache License 2.0 5 votes vote down vote up
import org.apache.spark.{ SparkConf, SparkContext }
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.mllib.linalg.{ Vector, Vectors }
import org.apache.spark.mllib.linalg.distributed.RowMatrix
import org.apache.spark.mllib.regression.LabeledPoint
import breeze.linalg.{
  Matrix => BM,
  CSCMatrix => BSM,
  DenseMatrix => BDM,
  Vector => BV,
  DenseVector => BDV,
  SparseVector => BSV,
  axpy => brzAxpy,
  svd => brzSvd,
  max => Bmax,
  min => Bmin,
  sum => Bsum
import scala.collection.mutable.ArrayBuffer
import CNN.CNN

object Test_example_CNN {

  def main(args: Array[String]) {
    //1 Build the Spark context
    val conf = new SparkConf().setAppName("CNNtest")
    val sc = new SparkContext(conf)

    //2 Test data
    val data_path = "/deeplearn/train_d3.txt"
    val examples = sc.textFile(data_path).cache()
    val train_d1 = { line =>
      val f1 = line.split("\t")
      val f = => f.toDouble)
      val y = f.slice(0, 10)
      val x = f.slice(10, f.length)
      (new BDM(1, y.length, y), (new BDM(1, x.length, x)).reshape(28, 28) / 255.0)
    val train_d = => (f._1, f._2))
    //3 Set the training parameters and build the model
    // opts: iteration step size, number of iterations, cross-validation ratio
    val opts = Array(50.0, 1.0, 0.0)
    val numExamples = train_d.count()
    println(s"numExamples = $numExamples.")
    val CNNmodel = new CNN().
      setMapsize(new BDM(1, 2, Array(28.0, 28.0))).
      setTypes(Array("i", "c", "s", "c", "s")).
      setOutputmaps(Array(0.0, 6.0, 0.0, 12.0, 0.0)).
      setKernelsize(Array(0.0, 5.0, 0.0, 5.0, 0.0)).
      setScale(Array(0.0, 0.0, 2.0, 0.0, 2.0)).
      CNNtrain(train_d, opts)

    //4 Model test
    val CNNforecast = CNNmodel.predict(train_d)
    val CNNerror = CNNmodel.Loss(CNNforecast)
    println(s"NNerror = $CNNerror.")
    val printf1 = => (,
    for (i <- 0 until printf1.length) {
      val outi = printf1(i)._2.mkString("\t")

Example 102
Source File: KMeanTest.scala    From SparseML   with Apache License 2.0 5 votes vote down vote up
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.linalg.{SparseVector, Vectors, Vector}

import scala.util.Random

//spark/bin/spark-submit --master spark:// --class  ScalableKMeanTest --executor-memory 20g --executor-cores 1 --driver-memory 24g --conf spark.driver.maxResultSize=8g --conf spark.akka.frameSize=1024 unnamed.jar 50 1000000 100 0.1 1 my 9

//guale spark/bin/spark-submit --master spark:// --class  ScalableKMeanTest --executor-memory 5g --executor-cores 1 --driver-memory 24g --conf spark.driver.maxResultSize=8g --conf spark.akka.frameSize=1024 unnamed.jar 50 5000000 100 0.1 1 my 15

object ScalableKMeanTest {

  def main(args: Array[String]) {

    val conf = new SparkConf().setAppName(s"kmeans: ${args.mkString(",")}")
    val sc = new SparkContext(conf)

    val k = args(0).toInt
    val dimension = args(1).toInt
    val recordNum = args(2).toInt
    val sparsity = args(3).toDouble
    val iterations = args(4).toInt
    val means = args(5)
    val parNumber = args(6).toInt

    val data: RDD[Vector] = sc.parallelize(1 to recordNum, parNumber).map(i => {
      val ran = new Random()
      val indexArr = ran.shuffle((0 until dimension).toList).take((dimension * sparsity).toInt).sorted.toArray
      val valueArr = (1 to (dimension * sparsity).toInt).map(in => ran.nextDouble()).sorted.toArray
      val vec: Vector = new SparseVector(dimension, indexArr, valueArr)
    println(args.mkString(", "))
    println(data.count() + " records generated")

    val st = System.nanoTime()

    val model = if(means == "my") {
      println("running scalable kmeans")
      val model = new ScalableKMeans()
    } else {
      println("running mllib kmeans")
      val model = new KMeans()

    println((System.nanoTime() - st) / 1e9 + " seconds cost")
    println("final clusters: " + model.clusterCenters.length)
    println( => v.numNonzeros).mkString("\n"))


Example 103
Source File: lda-script.scala    From practical-data-science-with-hadoop-and-spark   with Apache License 2.0 5 votes vote down vote up
import scala.collection.mutable


import org.apache.spark.rdd._
import org.apache.spark.mllib.clustering.{OnlineLDAOptimizer, DistributedLDAModel, LDA}
import org.apache.spark.mllib.linalg.{Vector, SparseVector, Vectors}
import org.apache.spark.mllib.feature.IDF

// add openNLP jar to the Spark Context 

// Load documents from text files, 1 element (text string) per file
val corpus = sc.wholeTextFiles("ohsumed/C*", 20).map(x => x._2)

// read stop words from file
val stopwordFile = "stop-words.txt"
val st_words = sc.textFile(stopwordFile).collect()
val stopwords = sc.broadcast(st_words)

val minWordLength = 3
val tokenized: RDD[(Long, Array[String])] = corpus.zipWithIndex().map { case (text,id) => 
    val tokenizer = SimpleTokenizer.INSTANCE
    val stemmer = new PorterStemmer()    
    val tokens = tokenizer.tokenize(text)
    val words = tokens.filter(w => (w.length >= minWordLength) && (!stopwords.value.contains(w)))
                      .map(w => stemmer.stem(w))
    id -> words
}.filter(_._2.length > 0)

val numDocs = tokenized.count()

val wordCounts: RDD[(String, Long)] = tokenized.flatMap { case (_, tokens) => -> 1L) 
}.reduceByKey(_ + _)
val fullVocabSize = wordCounts.count()
val vSize = 10000
val (vocab: Map[String, Int], selectedTokenCount: Long) = {
    val sortedWC: Array[(String,Long)] = {wordCounts.sortBy(_._2, ascending=false) .take(vSize)}

val documents = { case (id, tokens) =>
    // Filter tokens by vocabulary, and create word count vector representation of document.
    val wc = new mutable.HashMap[Int, Int]()
    tokens.foreach { term =>
        if (vocab.contains(term)) {
          val termIndex = vocab(term)
          wc(termIndex) = wc.getOrElse(termIndex, 0) + 1
    val indices = wc.keys.toArray.sorted
    val values = => wc(i).toDouble)
    val sb = Vectors.sparse(vocab.size, indices, values)
    (id, sb)

val vocabArray = new Array[String](vocab.size)
vocab.foreach { case (term, i) => vocabArray(i) = term }

val tf = { case (id, vec) => vec }.cache()
val idfVals = new IDF().fit(tf).idf.toArray
val tfidfDocs: RDD[(Long, Vector)] = { case (id, vec) =>
    val indices = vec.asInstanceOf[SparseVector].indices
    val counts = new mutable.HashMap[Int, Double]()    
    for (idx <- indices) {
        counts(idx) = vec(idx) * idfVals(idx)
    (id, Vectors.sparse(vocab.size, counts.toSeq))

val numTopics = 5
val numIterations = 50
val lda = new LDA().setK(numTopics).setMaxIterations(numIterations).setOptimizer("online")
val ldaModel =

val topicIndices = ldaModel.describeTopics(maxTermsPerTopic = 5)
topicIndices.foreach { case (terms, termWeights) =>
    println("TOPIC:") { case (term, weight) =>
Example 104
Source File: Autoregression.scala    From spark-timeseries   with Apache License 2.0 5 votes vote down vote up
import com.cloudera.sparkts.MatrixUtil.{matToRowArrs, toBreeze}
import org.apache.commons.math3.random.RandomGenerator
import org.apache.commons.math3.stat.regression.OLSMultipleLinearRegression
import org.apache.spark.mllib.linalg.{DenseVector, Vector}

object Autoregression {
  /**
   * Fits an autoregressive model by ordinary least squares on lagged values.
   *
   * The series from index `maxLag` onwards is regressed on the lag matrix produced by
   * `Lag.lagMatTrimBoth(ts, maxLag)`.
   *
   * @param ts series to fit
   * @param maxLag number of lags passed to the lag-matrix builder
   * @param noIntercept when true the regression has no intercept and the model constant is 0.0
   * @return model holding the constant and the estimated coefficients
   */
  def fitModel(ts: Vector, maxLag: Int, noIntercept: Boolean = false): ARModel = {
    // This is loosely based off of the implementation in statsmodels

    // Left hand side: the observations that have a full set of lags available
    val response = toBreeze(ts)(maxLag until ts.size)
    // Right hand side: the lagged values
    val laggedPredictors = Lag.lagMatTrimBoth(ts, maxLag)

    val ols = new OLSMultipleLinearRegression()
    ols.setNoIntercept(noIntercept) // drop intercept in regression
    ols.newSampleData(response.toArray, matToRowArrs(laggedPredictors))
    val estimates = ols.estimateRegressionParameters()
    if (noIntercept) {
      new ARModel(0.0, estimates)
    } else {
      // With an intercept, the first estimate is the constant and the rest are lag weights.
      new ARModel(estimates.head, estimates.tail)
    }
  }
}

/**
 * Autoregressive model with constant term `c` and an array of coefficients, where
 * `coefficients(j)` is applied to the value `j + 1` positions earlier in the series.
 */
class ARModel(val c: Double, val coefficients: Array[Double]) extends TimeSeriesModel {

  /** Convenience constructor for a model with a single coefficient. */
  def this(c: Double, coef: Double) = this(c, Array(coef))

  /**
   * For each position, subtracts the constant and the coefficient-weighted earlier values of
   * `ts` from `ts(i)`.
   *
   * @param ts the input series
   * @param destTs optional destination; when null a fresh array of length `ts.size` is used,
   *               otherwise the array returned by `destTs.toArray` is written to
   * @return a new DenseVector wrapping the array that was written
   */
  def removeTimeDependentEffects(
      ts: Vector,
      destTs: Vector = null): Vector = {
    // NOTE(review): whether writes to destTs.toArray are visible through destTs depends on the
    // Vector implementation — confirm before relying on in-place behaviour.
    val dest = if (destTs == null) new Array[Double](ts.size) else destTs.toArray
    var i = 0
    while (i < ts.size) {
      dest(i) = ts(i) - c
      var j = 0
      // Only lags that fall inside the series (i - j - 1 >= 0) contribute.
      while (j < coefficients.length && i - j - 1 >= 0) {
        dest(i) -= ts(i - j - 1) * coefficients(j)
        j += 1
      i += 1
    new DenseVector(dest)

  /**
   * For each position, adds the constant and the coefficient-weighted earlier values of the
   * output to `ts(i)`. Unlike removeTimeDependentEffects, the lagged terms are read from
   * `dest` (values already produced by this call), not from `ts`.
   *
   * @param ts the input series
   * @param destTs destination; when null a fresh array of length `ts.size` is used, otherwise
   *               the array returned by `destTs.toArray` is written to
   * @return a new DenseVector wrapping the array that was written
   */
  def addTimeDependentEffects(ts: Vector, destTs: Vector): Vector = {
    val dest = if (destTs == null) new Array[Double](ts.size) else destTs.toArray
    var i = 0
    while (i < ts.size) {
      dest(i) = c + ts(i)
      var j = 0
      while (j < coefficients.length && i - j - 1 >= 0) {
        dest(i) += dest(i - j - 1) * coefficients(j)
        j += 1
      i += 1
    new DenseVector(dest)

  /**
   * Draws `n` Gaussian values from `rand` and passes the resulting vector to
   * addTimeDependentEffects as both the source and the destination.
   */
  def sample(n: Int, rand: RandomGenerator): Vector = {
    val vec = new DenseVector(Array.fill[Double](n)(rand.nextGaussian()))
    addTimeDependentEffects(vec, vec)
Example 105
Source File: PythonConnector.scala    From spark-timeseries   with Apache License 2.0 5 votes vote down vote up
import java.time._

import scala.collection.mutable.ArrayBuffer

import org.apache.spark.mllib.linalg.{DenseVector, Vector}

import{PairFunction, Function}

import PythonConnector._

/** Shared constants and helpers used by the byte-conversion function classes in this file. */
private object PythonConnector {
  // Sizes, in bytes, used when computing serialization buffer lengths.
  val INT_SIZE = 4
  val DOUBLE_SIZE = 8
  val LONG_SIZE = 8

  /**
   * Iterates over every index of `vec`.
   * NOTE(review): the statements that write into `buf` are not visible in this excerpt —
   * presumably each element is put into the buffer; confirm against the full source.
   */
  def putVector(buf: ByteBuffer, vec: Vector): Unit = {
    var i = 0
    while (i < vec.size) {
      i += 1
  /**
   * Copies a java.util.ArrayList into an ArrayBuffer; a null list yields an empty buffer.
   * NOTE(review): the final expression returning `result` is not visible in this excerpt.
   */
  def arrayListToSeq(list: java.util.ArrayList[Any]): Seq[Any] = {
    // implement with ArrayBuffer
    var result = ArrayBuffer[Any]()
    if (list != null) {
      result = ArrayBuffer[Any](list.toArray: _*)

/**
 * Decodes a byte array into a (key, series) pair. The reads are, in order: an Int key length,
 * an Int series length, then that many Doubles.
 */
private class BytesToKeyAndSeries extends PairFunction[Array[Byte], String, Vector] {
  override def call(arr: Array[Byte]): (String, Vector) = {
    val buf = ByteBuffer.wrap(arr)
    val keySize = buf.getInt()
    // NOTE(review): no visible statement copies bytes from `buf` into `keyBytes` in this
    // excerpt — presumably a bulk get was dropped; confirm against the full source.
    val keyBytes = new Array[Byte](keySize)

    val seriesSize = buf.getInt()
    val series = new Array[Double](seriesSize)
    var i = 0
    while (i < seriesSize) {
      series(i) = buf.getDouble()
      i += 1
    // The key bytes are decoded with the "UTF8" charset name.
    (new String(keyBytes, "UTF8"), new DenseVector(series))

/**
 * Encodes a (key, series) pair into a byte array sized for an Int, the UTF-8 key bytes, another
 * Int, and one Double per vector element.
 * NOTE(review): the writes of the key and the final result expression are not visible in this
 * excerpt; only the putVector call remains.
 */
private class KeyAndSeriesToBytes extends Function[(String, Vector), Array[Byte]] {
  override def call(keyVec: (String, Vector)): Array[Byte] = {
    val keyBytes = keyVec._1.getBytes("UTF-8")
    val vec = keyVec._2
    val arr = new Array[Byte](INT_SIZE + keyBytes.length + INT_SIZE + DOUBLE_SIZE * vec.size)
    val buf = ByteBuffer.wrap(arr)
    putVector(buf, vec)

/**
 * Encodes a (ZonedDateTime, series) pair into a byte array sized for a Long, an Int, and one
 * Double per vector element.
 * NOTE(review): the write of the timestamp and the final result expression are not visible in
 * this excerpt; only the putVector call remains.
 */
private class InstantToBytes extends Function[(ZonedDateTime, Vector), Array[Byte]] {
  override def call(instant: (ZonedDateTime, Vector)): Array[Byte] = {
    val arr = new Array[Byte](LONG_SIZE + INT_SIZE + DOUBLE_SIZE * instant._2.size)
    val buf = ByteBuffer.wrap(arr)
    putVector(buf, instant._2)
Example 106
Source File: Example.scala    From mCNN   with Apache License 2.0 5 votes vote down vote up
import org.apache.spark.mllib.linalg.{Vector, Vectors}
import org.apache.spark.{SparkConf, SparkContext}
import breeze.linalg.{DenseMatrix => BDM, _}

/** Driver that builds a CNN topology, then counts correct predictions over the loaded data. */
object Example {
  def main(args: Array[String]) {
    // Local Spark context with 8 threads; the input file is read with 8 partitions.
    val conf = new SparkConf().setMaster("local[8]").setAppName("ttt")
    val sc = new SparkContext(conf)
    val lines = sc.textFile("dataset/train.format", 8)
    // Each record becomes (arr(784), tensor built from arr(0..783)).
    // NOTE(review): the start of this expression and one mapping step are missing here.
    val data = => line.split(",")).map(arr =>
      .map(arr => (arr(784), Vector2Tensor(Vectors.dense(arr.slice(0, 784)))))

    // Layers: conv(1->6, 5x5), mean-pool 2x2, conv(6->12, 5x5), mean-pool 2x2, conv(12->12, 4x4).
    val topology = new CNNTopology
    topology.addLayer(CNNLayer.buildConvolutionLayer(1, 6, new Scale(5, 5)))
    topology.addLayer(CNNLayer.buildMeanPoolingLayer(new Scale(2, 2)))
    topology.addLayer(CNNLayer.buildConvolutionLayer(6, 12, new Scale(5, 5)))
    topology.addLayer(CNNLayer.buildMeanPoolingLayer(new Scale(2, 2)))
    topology.addLayer(CNNLayer.buildConvolutionLayer(12, 12, new Scale(4, 4)))
    val cnn: CNN = new CNN(topology).setMaxIterations(5).setMiniBatchSize(16)
    // NOTE(review): no training call is visible between the two timestamps in this excerpt.
    val start = System.nanoTime()
    println("Training time: " + (System.nanoTime() - start) / 1e9)

    // Maps each record to 1 when the prediction equals its label, else 0.
    // NOTE(review): the receiver and the aggregation of these values are missing here.
    val right = =>{
      val result = cnn.predict(record._2)
      if(result == record._1) 1 else 0
    println(s"Predicting precision: $right " + right.toDouble/(data.count()))

//    val testData = sc.textFile("dataset/mnist/mnist_test.csv", 8)
//      .map(line => line.split(",")).map(arr =>
//      .map(arr => (arr(0), Example.Vector2Tensor(Vectors.dense(arr.slice(1, 785).map(v => if(v > 200) 1.0 else 0)))))

    // Same correctness count as above; the printed ratio is again taken over data.count().
    val rightM = =>{
      val result = cnn.predict(record._2)
      if(result == record._1) 1 else 0
    println(s"Mnist Full Predicting precision: $rightM " + rightM.toDouble/(data.count()))

  /**
   * Copies a flat vector into a 28x28 matrix, reading element `28 * i + j` into cell (i, j).
   * NOTE(review): the expression that wraps `m` into the returned array is not visible here.
   */
  def Vector2Tensor(record: Vector): Array[BDM[Double]] = {
    val mapSize = new Scale(28, 28)
    val m = new BDM[Double](mapSize.x, mapSize.y)
    var i: Int = 0
    while (i < mapSize.x) {
      var j: Int = 0
      while (j < mapSize.y) {
        m(i, j) = record(mapSize.x * i + j)
        j += 1
      i += 1

Example 107
import breeze.linalg.{DenseMatrix => BDM}
import org.apache.spark.mllib.linalg.{Vectors, Vector}
import org.apache.spark.{SparkContext, SparkConf}

/** Driver that trains a feed-forward network of convolution/pooling layers and reports accuracy. */
object CNNDriver {

  def main(args: Array[String]) {

    // Eight layers: three convolutional layers, each followed by a sigmoid functional layer,
    // with mean-pooling layers after the first two sigmoid layers.
    val myLayers = new Array[Layer](8)
    myLayers(0) = new ConvolutionalLayer(1, 6, kernelSize = new MapSize(5, 5), inputMapSize = new MapSize(28, 28))
    myLayers(1) = new FunctionalLayer(new SigmoidFunction())
    myLayers(2) = new MeanPoolingLayer(new MapSize(2, 2), new MapSize(24, 24))
    myLayers(3) = new ConvolutionalLayer(6, 12, new MapSize(5, 5), new MapSize(12, 12))
    myLayers(4) = new FunctionalLayer(new SigmoidFunction())
    myLayers(5) = new MeanPoolingLayer(new MapSize(2, 2), new MapSize(8, 8))
    myLayers(6) = new ConvolutionalLayer(12, 12, new MapSize(4, 4), new MapSize(4, 4))
    myLayers(7) = new FunctionalLayer(new SigmoidFunction())
    val topology = FeedForwardTopology(myLayers)

    val conf = new SparkConf().setMaster("local[8]").setAppName("ttt")
    val sc = new SparkContext(conf)
    val lines = sc.textFile("dataset/train.format", 8)
    // Each record becomes (flattened 784-value input, one-hot target of length 12), where the
    // hot position is arr(784).toInt.
    // NOTE(review): the start of this expression and one mapping step are missing here.
    val data = => line.split(",")).map(arr =>
      .map(arr => {
      val target = new Array[Double](12)
      target(arr(784).toInt) = 1
      val in = Vector2BDM(Vectors.dense(arr.slice(0, 784)))
      (Vectors.fromBreeze(in.toDenseVector), Vectors.dense(target))

    val feedForwardTrainer = new FeedForwardTrainer(topology, 784, 12)

    feedForwardTrainer.setStackSize(4) // CNN does not benefit from the stacked data
//    .LBFGSOptimizer.setNumIterations(20)
      .setUpdater(new CNNUpdater(0.85))

    // Train and evaluate 1000 times, timing each training call.
    for(iter <- 1 to 1000){
      val start = System.nanoTime()
      val mlpModel = feedForwardTrainer.train(data)

      println(s"Training time $iter: " + (System.nanoTime() - start) / 1e9)

      // predict
      // A record counts as right when the argmax of the prediction equals the argmax of the target.
      val right = data.filter(v => mlpModel.predict(v._1).argmax == v._2.argmax).count()
      val precision = right.toDouble / data.count()
      println(s"right: $right, count: ${data.count()}, precision: $precision")

  /**
   * Copies a flat vector into a 28x28 matrix, reading element `28 * i + j` into cell (i, j).
   * NOTE(review): the final expression returning `m` is not visible in this excerpt.
   */
  def Vector2BDM(record: Vector): BDM[Double] = {
    val mapSize = new MapSize(28, 28)
    val m = new BDM[Double](mapSize.x, mapSize.y)
    var i: Int = 0
    while (i < mapSize.x) {
      var j: Int = 0
      while (j < mapSize.y) {
        m(i, j) = record(mapSize.x * i + j)
        j += 1
      i += 1

Example 108
Source File: CNNUpdater.scala    From mCNN   with Apache License 2.0 5 votes vote down vote up

import org.apache.spark.mllib.linalg.{Vectors, Vector}
import org.apache.spark.mllib.optimization.Updater

/**
 * Updater that applies a gradient step whose gradient is scaled by `alpha`.
 *
 * @param alpha factor the gradient is multiplied by before the step is applied
 */
private[ann] class CNNUpdater(alpha: Double) extends Updater {

  /**
   * Computes the updated weights from the old weights and the gradient.
   *
   * `iter` and `regParam` are not read by this implementation.
   *
   * @return the updated weights paired with 0
   */
  override def compute(
      weightsOld: Vector,
      gradient: Vector,
      stepSize: Double,
      iter: Int,
      regParam: Double): (Vector, Double) = {
    val thisIterStepSize = stepSize
    val brzWeights: BV[Double] = weightsOld.toBreeze.toDenseVector
    // NOTE(review): Baxpy is presumably an alias for breeze's axpy, i.e.
    // brzWeights += -stepSize * (gradient * alpha) — its import is not visible here; confirm.
    Baxpy(-thisIterStepSize, gradient.toBreeze * alpha, brzWeights)
    (Vectors.fromBreeze(brzWeights), 0)
Example 109
Source File: SVDExample.scala    From multi-tenancy-spark   with Apache License 2.0 5 votes vote down vote up
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
// $example on$
import org.apache.spark.mllib.linalg.Matrix
import org.apache.spark.mllib.linalg.SingularValueDecomposition
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.linalg.distributed.RowMatrix
// $example off$

/** Example that computes a singular value decomposition of a small row matrix and prints it. */
object SVDExample {

  def main(args: Array[String]): Unit = {

    val conf = new SparkConf().setAppName("SVDExample")
    val sc = new SparkContext(conf)

    // $example on$
    // Three rows of length 5: one sparse and two dense.
    val data = Array(
      Vectors.sparse(5, Seq((1, 1.0), (3, 7.0))),
      Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0),
      Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0))

    // Distribute the rows over 2 partitions.
    val dataRDD = sc.parallelize(data, 2)

    val mat: RowMatrix = new RowMatrix(dataRDD)

    // Compute the top 5 singular values and corresponding singular vectors.
    val svd: SingularValueDecomposition[RowMatrix, Matrix] = mat.computeSVD(5, computeU = true)
    val U: RowMatrix = svd.U  // The U factor is a RowMatrix.
    val s: Vector = svd.s  // The singular values are stored in a local dense vector.
    val V: Matrix = svd.V  // The V factor is a local dense matrix.
    // $example off$
    // Bring the rows of U to the driver so they can be printed.
    val collect = U.rows.collect()
    println("U factor is:")
    collect.foreach { vector => println(vector) }
    println(s"Singular values are: $s")
    println(s"V factor is:\n$V")
// scalastyle:on println 
Source File: BisectingKMeansExample.scala    From multi-tenancy-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.examples.mllib

// scalastyle:off println
import org.apache.spark.{SparkConf, SparkContext}
// $example on$
import org.apache.spark.mllib.clustering.BisectingKMeans
import org.apache.spark.mllib.linalg.{Vector, Vectors}
// $example off$

/** Example that clusters space-separated numeric rows with BisectingKMeans and prints the result. */
object BisectingKMeansExample {

  def main(args: Array[String]) {
    val sparkConf = new SparkConf().setAppName("mllib.BisectingKMeansExample")
    val sc = new SparkContext(sparkConf)

    // $example on$
    // Loads and parses data
    // Each line is split on single spaces and every token is converted to a Double.
    def parse(line: String): Vector = Vectors.dense(line.split(" ").map(_.toDouble))
    val data = sc.textFile("data/mllib/kmeans_data.txt").map(parse).cache()

    // Clustering the data into 6 clusters by BisectingKMeans.
    val bkm = new BisectingKMeans().setK(6)
    // NOTE(review): the right-hand side is missing from this excerpt; presumably the model is
    // produced by running `bkm` on `data` — confirm against the full source.
    val model =

    // Show the compute cost and the cluster centers
    println(s"Compute Cost: ${model.computeCost(data)}")
    model.clusterCenters.zipWithIndex.foreach { case (center, idx) =>
      println(s"Cluster Center ${idx}: ${center}")
    // $example off$

// scalastyle:on println 
Source File: Normalizer.scala    From multi-tenancy-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.feature

import org.apache.spark.annotation.Since
import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors}

  /**
   * Divides every stored value of `vector` by its p-norm (`Vectors.norm(vector, p)`).
   *
   * The values array is cloned before it is modified. For sparse input the original index
   * array is reused in the result.
   *
   * @throws IllegalArgumentException for vector types other than dense and sparse
   */
  override def transform(vector: Vector): Vector = {
    val norm = Vectors.norm(vector, p)

    if (norm != 0.0) {
      // For dense vector, we've to allocate new memory for new output vector.
      // However, for sparse vector, the `index` array will not be changed,
      // so we can re-use it to save memory.
      vector match {
        case DenseVector(vs) =>
          val values = vs.clone()
          val size = values.length
          var i = 0
          while (i < size) {
            values(i) /= norm
            i += 1
          // NOTE(review): the expression producing the dense result is not visible here.
        case SparseVector(size, ids, vs) =>
          val values = vs.clone()
          val nnz = values.length
          var i = 0
          while (i < nnz) {
            values(i) /= norm
            i += 1
          Vectors.sparse(size, ids, values)
        case v => throw new IllegalArgumentException("Do not support vector type " + v.getClass)
    } else {
      // Since the norm is zero, return the input vector object itself.
      // Note that it's safe since we always assume that the data in RDD
      // should be immutable.

Source File: GaussianMixtureModelWrapper.scala    From multi-tenancy-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.api.python

import scala.collection.JavaConverters

import org.apache.spark.SparkContext
import org.apache.spark.mllib.clustering.GaussianMixtureModel
import org.apache.spark.mllib.linalg.{Vector, Vectors}

  /**
   * The model's Gaussians as a byte array.
   * NOTE(review): the receiver of the closure, the first array element and the serialization
   * step are missing from this excerpt; only `gaussian.sigma` is visible.
   */
  val gaussians: Array[Byte] = {
    val modelGaussians = { gaussian =>
      Array[Any](, gaussian.sigma)

  // Takes one point and returns a Vector.
  // NOTE(review): the body is not visible in this excerpt.
  def predictSoft(point: Vector): Vector = {

  // NOTE(review): the right-hand side is truncated in this excerpt; presumably it forwards
  // `sc` and `path` to the wrapped model's save — confirm against the full source.
  def save(sc: SparkContext, path: String): Unit =, path)
Example 113
Source File: Word2VecModelWrapper.scala    From multi-tenancy-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.api.python

import java.util.{List => JList, Map => JMap}

import scala.collection.JavaConverters._

import org.apache.spark.SparkContext
import org.apache.spark.mllib.feature.Word2VecModel
import org.apache.spark.mllib.linalg.{Vector, Vectors}

  /** Looks up `num` synonyms of `vector` in the wrapped model and formats them via prepareResult. */
  def findSynonyms(vector: Vector, num: Int): JList[Object] = {
    prepareResult(model.findSynonyms(vector, num))

  /**
   * Converts (word, similarity) pairs into a two-element Java list: the words first, then the
   * similarities as a dense vector.
   * NOTE(review): the expressions that extract the similarities and the words from `result`
   * are missing from this excerpt.
   */
  private def prepareResult(result: Array[(String, Double)]) = {
    val similarity = Vectors.dense(
    val words =
    List(words, similarity).map(_.asInstanceOf[Object]).asJava

  /**
   * Returns a Java map from String keys to Java lists of Float.
   * NOTE(review): the receiver of the closure is missing from this excerpt; the visible part
   * converts each value `v` to a Java list.
   */
  def getVectors: JMap[String, JList[Float]] = { { case (k, v) =>
      (k, v.toList.asJava)

  // NOTE(review): the right-hand side is truncated in this excerpt; presumably it forwards
  // `sc` and `path` to the wrapped model's save — confirm against the full source.
  def save(sc: SparkContext, path: String): Unit =, path)
Source File: PearsonCorrelation.scala    From multi-tenancy-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.stat.correlation

import breeze.linalg.{DenseMatrix => BDM}

import org.apache.spark.internal.Logging
import org.apache.spark.mllib.linalg.{Matrices, Matrix, Vector}
import org.apache.spark.mllib.linalg.distributed.RowMatrix
import org.apache.spark.rdd.RDD

  /**
   * Rescales a covariance matrix into a correlation matrix, in three passes over its Breeze
   * view: diagonal -> standard deviations, off-diagonals -> cov(i, j) / (sigma_i * sigma_j),
   * diagonal -> 1.0.
   *
   * The input must be backed by a Breeze dense matrix (the cast fails otherwise). A warning is
   * logged when any pair involves a zero standard deviation.
   * NOTE(review): the final result expression is not visible in this excerpt.
   */
  def computeCorrelationMatrixFromCovariance(covarianceMatrix: Matrix): Matrix = {
    val cov = covarianceMatrix.asBreeze.asInstanceOf[BDM[Double]]
    val n = cov.cols

    // Compute the standard deviation on the diagonals first
    var i = 0
    while (i < n) {
      // TODO remove once covariance numerical issue resolved.
      // Variances within closeToZero's threshold are treated as exactly 0.0.
      cov(i, i) = if (closeToZero(cov(i, i))) 0.0 else math.sqrt(cov(i, i))
      i +=1

    // Loop through columns since cov is column major
    var j = 0
    var sigma = 0.0
    var containNaN = false
    while (j < n) {
      sigma = cov(j, j)
      i = 0
      // Only i < j is visited; each value is written to both (i, j) and (j, i).
      while (i < j) {
        // NOTE(review): the value produced by the zero-deviation branch is not visible in this
        // excerpt — presumably NaN, given the flag name; confirm.
        val corr = if (sigma == 0.0 || cov(i, i) == 0.0) {
          containNaN = true
        } else {
          cov(i, j) / (sigma * cov(i, i))
        cov(i, j) = corr
        cov(j, i) = corr
        i += 1
      j += 1

    // put 1.0 on the diagonals
    i = 0
    while (i < n) {
      cov(i, i) = 1.0
      i +=1

    if (containNaN) {
      logWarning("Pearson correlation matrix contains NaN values.")


  /** True when the absolute value of `value` is at most `threshold` (default 1e-12). */
  private def closeToZero(value: Double, threshold: Double = 1e-12): Boolean = {
    math.abs(value) <= threshold
Source File: SpearmanCorrelation.scala    From multi-tenancy-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.stat.correlation

import scala.collection.mutable.ArrayBuffer

import org.apache.spark.internal.Logging
import org.apache.spark.mllib.linalg.{Matrix, Vector, Vectors}
import org.apache.spark.rdd.RDD

  /**
   * Computes a correlation matrix from per-column ranks of the input rows.
   *
   * Visible steps: tag every value with its column index and a unique row id, sort globally by
   * (column, value), give tied values the average of their ranks, then regroup the ranks by row.
   * NOTE(review): several expressions and the remainder of the method are missing from this
   * excerpt, including how the grouped ranks are turned into the returned matrix.
   */
  override def computeCorrelationMatrix(X: RDD[Vector]): Matrix = {
    // ((columnIndex, value), rowUid)
    val colBased = X.zipWithUniqueId().flatMap { case (vec, uid) => { case (v, j) =>
        ((j, v), uid)
    // global sort by (columnIndex, value)
    val sorted = colBased.sortByKey()
    // assign global ranks (using average ranks for tied values)
    val globalRanks = sorted.zipWithIndex().mapPartitions { iter =>
      // State of the current run of equal (column, value) entries within this partition.
      var preCol = -1
      var preVal = Double.NaN
      var startRank = -1.0
      var cachedUids = ArrayBuffer.empty[Long]
      // Emits (uid, (column, averageRank)) for the cached run; the average rank of a run that
      // starts at startRank and has k entries is startRank + (k - 1) / 2.
      val flush: () => Iterable[(Long, (Int, Double))] = () => {
        val averageRank = startRank + (cachedUids.size - 1) / 2.0
        val output = { uid =>
          (uid, (preCol, averageRank))
      iter.flatMap { case (((j, v), uid), rank) =>
        // If we see a new value or cachedUids is too big, we flush ids with their average rank.
        if (j != preCol || v != preVal || cachedUids.size >= 10000000) {
          val output = flush()
          preCol = j
          preVal = v
          startRank = rank
          cachedUids += uid
        } else {
          cachedUids += uid
      } ++ flush()
    // Replace values in the input matrix by their ranks compared with values in the same column.
    // Note that shifting all ranks in a column by a constant value doesn't affect result.
    val groupedRanks = globalRanks.groupByKey().map { case (uid, iter) =>
      // sort by column index and then convert values to a vector
Example 116
package org.apache.spark.mllib.classification.impl

import org.json4s.JsonDSL._
import org.json4s.jackson.JsonMethods._

import org.apache.spark.SparkContext
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.mllib.util.Loader
import org.apache.spark.sql.{Row, SparkSession}

    /**
     * Loads the weights, intercept and threshold of a saved model from the data directory
     * under `path`.
     *
     * Asserts that exactly one row with three columns was read; `modelClass` is used only in
     * the assertion messages.
     * NOTE(review): the read of `dataRDD`, the select receiver and both branches of the
     * threshold expression are missing from this excerpt.
     */
    def loadData(sc: SparkContext, path: String, modelClass: String): Data = {
      val dataPath = Loader.dataPath(path)
      val spark = SparkSession.builder().sparkContext(sc).getOrCreate()
      val dataRDD =
      val dataArray ="weights", "intercept", "threshold").take(1)
      assert(dataArray.length == 1, s"Unable to load $modelClass data from: $dataPath")
      val data = dataArray(0)
      assert(data.size == 3, s"Unable to load $modelClass data from: $dataPath")
      // The third column (threshold) is ignored here and handled separately below.
      val (weights, intercept) = data match {
        case Row(weights: Vector, intercept: Double, _) =>
          (weights, intercept)
      // The threshold column may be null, so it is checked with isNullAt first.
      val threshold = if (data.isNullAt(2)) {
      } else {
      Data(weights, intercept, threshold)
Source File: LabeledPoint.scala    From multi-tenancy-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.regression

import scala.beans.BeanInfo

import org.apache.spark.annotation.Since
import{LabeledPoint => NewLabeledPoint}
import org.apache.spark.mllib.linalg.{Vector, Vectors}
import org.apache.spark.mllib.util.NumericParser
import org.apache.spark.SparkException

  /**
   * Parses a LabeledPoint from a string in one of two formats.
   *
   * Input starting with "(" is handed to NumericParser and must yield a label followed by a
   * numeric features value. Any other input is treated as "label,f1 f2 ...": the label before
   * the first comma, then space-separated feature values.
   *
   * @throws SparkException when the parenthesised form does not yield (label, features)
   */
  def parse(s: String): LabeledPoint = {
    if (s.startsWith("(")) {
      NumericParser.parse(s) match {
        case Seq(label: Double, numeric: Any) =>
          LabeledPoint(label, Vectors.parseNumeric(numeric))
        case other =>
          throw new SparkException(s"Cannot parse $other.")
    } else { // dense format used before v1.0
      val parts = s.split(',')
      val label = java.lang.Double.parseDouble(parts(0))
      // Only parts(1) is read; anything after a second comma is ignored.
      val features = Vectors.dense(parts(1).trim().split(' ').map(java.lang.Double.parseDouble))
      LabeledPoint(label, features)

  /** Converts a NewLabeledPoint by keeping its label and converting its features with Vectors.fromML. */
  private[spark] def fromML(point: NewLabeledPoint): LabeledPoint = {
    LabeledPoint(point.label, Vectors.fromML(point.features))
Source File: GLMRegressionModel.scala    From multi-tenancy-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.regression.impl

import org.json4s.JsonDSL._
import org.json4s.jackson.JsonMethods._

import org.apache.spark.SparkContext
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.mllib.util.Loader
import org.apache.spark.sql.{Row, SparkSession}

    /**
     * Loads the weights and intercept of a saved model from the data directory under `path`.
     *
     * Asserts that exactly one row with two columns was read and that the weight vector has
     * `numFeatures` entries; `modelClass` is used only in the assertion messages.
     * NOTE(review): the read of `dataRDD` and the select receiver are missing from this excerpt.
     */
    def loadData(sc: SparkContext, path: String, modelClass: String, numFeatures: Int): Data = {
      val dataPath = Loader.dataPath(path)
      val spark = SparkSession.builder().sparkContext(sc).getOrCreate()
      val dataRDD =
      val dataArray ="weights", "intercept").take(1)
      assert(dataArray.length == 1, s"Unable to load $modelClass data from: $dataPath")
      val data = dataArray(0)
      assert(data.size == 2, s"Unable to load $modelClass data from: $dataPath")
      data match {
        case Row(weights: Vector, intercept: Double) =>
          assert(weights.size == numFeatures, s"Expected $numFeatures features, but" +
            s" found ${weights.size} features when loading $modelClass weights from $dataPath")
          Data(weights, intercept)

Source File: IDFSuite.scala    From multi-tenancy-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.feature

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors}
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.mllib.util.TestingUtils._

/**
 * Tests for IDF on three term-frequency vectors of length 4 (two sparse, one dense).
 * NOTE(review): the model-fitting expressions, the local transform and the calls to
 * `assertHelper` are missing from this excerpt.
 */
class IDFSuite extends SparkFunSuite with MLlibTestSparkContext {

  test("idf") {
    val n = 4
    val localTermFrequencies = Seq(
      Vectors.sparse(n, Array(1, 3), Array(1.0, 2.0)),
      Vectors.dense(0.0, 1.0, 2.0, 3.0),
      Vectors.sparse(n, Array(1), Array(1.0))
    val m = localTermFrequencies.size
    val termFrequencies = sc.parallelize(localTermFrequencies, 2)
    val idf = new IDF
    val model =
    // Terms 0..3 occur in 0, 3, 1 and 2 of the documents above; the expected weight for a
    // count x is log((m + 1) / (x + 1)).
    val expected = Vectors.dense(Array(0, 3, 1, 2).map { x =>
      math.log((m + 1.0) / (x + 1.0))
    assert(model.idf ~== expected absTol 1e-12)

    // Checks that each transformed vector keeps its type and indices and that every value is
    // the term frequency multiplied by the expected weight.
    val assertHelper = (tfidf: Array[Vector]) => {
      assert(tfidf.size === 3)
      val tfidf0 = tfidf(0).asInstanceOf[SparseVector]
      assert(tfidf0.indices === Array(1, 3))
      assert(Vectors.dense(tfidf0.values) ~==
          Vectors.dense(1.0 * expected(1), 2.0 * expected(3)) absTol 1e-12)
      val tfidf1 = tfidf(1).asInstanceOf[DenseVector]
      assert(Vectors.dense(tfidf1.values) ~==
          Vectors.dense(0.0, 1.0 * expected(1), 2.0 * expected(2), 3.0 * expected(3)) absTol 1e-12)
      val tfidf2 = tfidf(2).asInstanceOf[SparseVector]
      assert(tfidf2.indices === Array(1))
      assert(tfidf2.values(0) ~== (1.0 * expected(1)) absTol 1e-12)
    // Transforms a RDD
    val tfidf = model.transform(termFrequencies).collect()
    // Transforms local vectors
    val localTfidf =

  test("idf minimum document frequency filtering") {
    val n = 4
    val localTermFrequencies = Seq(
      Vectors.sparse(n, Array(1, 3), Array(1.0, 2.0)),
      Vectors.dense(0.0, 1.0, 2.0, 3.0),
      Vectors.sparse(n, Array(1), Array(1.0))
    val m = localTermFrequencies.size
    val termFrequencies = sc.parallelize(localTermFrequencies, 2)
    // Same data as the previous test, but with minDocFreq = 1.
    val idf = new IDF(minDocFreq = 1)
    val model =
    // NOTE(review): the value expected for a count of 0 (the else branch) is not visible in
    // this excerpt.
    val expected = Vectors.dense(Array(0, 3, 1, 2).map { x =>
      if (x > 0) {
        math.log((m + 1.0) / (x + 1.0))
      } else {
    assert(model.idf ~== expected absTol 1e-12)

    val assertHelper = (tfidf: Array[Vector]) => {
      assert(tfidf.size === 3)
      val tfidf0 = tfidf(0).asInstanceOf[SparseVector]
      assert(tfidf0.indices === Array(1, 3))
      assert(Vectors.dense(tfidf0.values) ~==
          Vectors.dense(1.0 * expected(1), 2.0 * expected(3)) absTol 1e-12)
      val tfidf1 = tfidf(1).asInstanceOf[DenseVector]
      assert(Vectors.dense(tfidf1.values) ~==
          Vectors.dense(0.0, 1.0 * expected(1), 2.0 * expected(2), 3.0 * expected(3)) absTol 1e-12)
      val tfidf2 = tfidf(2).asInstanceOf[SparseVector]
      assert(tfidf2.indices === Array(1))
      assert(tfidf2.values(0) ~== (1.0 * expected(1)) absTol 1e-12)
    // Transforms a RDD
    val tfidf = model.transform(termFrequencies).collect()
    // Transforms local vectors
    val localTfidf =

Source File: PCASuite.scala    From multi-tenancy-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.feature

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.linalg.{Vector, Vectors}
import org.apache.spark.mllib.linalg.distributed.RowMatrix
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.mllib.util.TestingUtils._

/** Checks that the PCA wrapper agrees with RowMatrix's principal-component computation. */
class PCASuite extends SparkFunSuite with MLlibTestSparkContext {

  // Three rows of length 5: one sparse and two dense.
  private val data = Array(
    Vectors.sparse(5, Seq((1, 1.0), (3, 7.0))),
    Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0),
    Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0)

  // Lazy, so `sc` is only touched when a test first uses the RDD.
  private lazy val dataRDD = sc.parallelize(data, 2)

  test("Correct computing use a PCA wrapper") {
    // The number of components equals the number of rows.
    val k = dataRDD.count().toInt
    val pca = new PCA(k).fit(dataRDD)

    val mat = new RowMatrix(dataRDD)
    val (pc, explainedVariance) = mat.computePrincipalComponentsAndExplainedVariance(k)

    // Rows projected by the wrapper are compared with rows multiplied by the components.
    // NOTE(review): the expression pairing the two arrays is missing from this excerpt.
    val pca_transform = pca.transform(dataRDD).collect()
    val mat_multiply = mat.multiply(pc).rows.collect() { case (calculated, expected) =>
      assert(calculated ~== expected relTol 1e-8)
    assert(pca.explainedVariance ~== explainedVariance relTol 1e-8)
Example 121
Source File: MllibHelper.scala    From twitter-stream-ml   with GNU General Public License v3.0 5 votes vote down vote up
package com.giorgioinf.twtml.spark

import java.text.Normalizer
import org.apache.spark.Logging
import org.apache.spark.mllib.feature.HashingTF
import org.apache.spark.mllib.linalg.{SparseVector, Vector, Vectors}
import org.apache.spark.mllib.regression.LabeledPoint
import scala.math.BigDecimal
import twitter4j.Status

/**
 * Turns twitter4j statuses into MLlib LabeledPoints: hashed text features followed by
 * `numNumberFeatures` numeric features, labelled with the retweet count.
 */
object MllibHelper extends Logging {

  // Number of numeric (non-text) features placed after the text features.
  val numNumberFeatures = 4

  // Defaults; reset() replaces them from the configuration.
  var numRetweetBegin = 100
  var numRetweetEnd = 1000
  var numTextFeatures = 1000
  // Derived from numTextFeatures; must be rebuilt whenever it changes.
  var hashText = new HashingTF(numTextFeatures)
  var numFeatures = numTextFeatures + numNumberFeatures
  var numberFeatureIndices = (numTextFeatures to numFeatures-1).toArray

  /**
   * Re-initialises the helper from `conf`.
   *
   * The derived members (`hashText`, `numFeatures`, `numberFeatureIndices`) are assigned on the
   * object itself. Declaring them with `var` inside this method would only create locals that
   * shadow the fields, leaving the object with a hasher and an index range built for the
   * previous `numTextFeatures`.
   */
  def reset(conf:ConfArguments) {
    numRetweetBegin = conf.numRetweetBegin
    numRetweetEnd = conf.numRetweetEnd
    numTextFeatures = conf.numTextFeatures

    hashText = new HashingTF(numTextFeatures)
    numFeatures = numTextFeatures + numNumberFeatures
    numberFeatureIndices = (numTextFeatures to numFeatures-1).toArray

    log.debug(s"retweet range: ($numRetweetBegin - $numRetweetEnd), numTextFeatures: $numTextFeatures")

  /**
   * Builds the text features of the retweeted status.
   * NOTE(review): the extraction of the text, the bigram step and the hashing call are missing
   * from this excerpt; only the accent-stripping normalisation is visible.
   */
  def featurizeText(statuses: Status): SparseVector = {
    val text = statuses.getRetweetedStatus

    // Separate accents from characters and then remove non-unicode
    // characters
    val noAccentText = Normalizer
      .normalize(text, Normalizer.Form.NFD)
      .replaceAll("\\p{M}", "")

    // bigrams

  /**
   * Builds the numeric features of the retweeted status: follower, favourite and friend counts
   * of its user scaled by 1e-12, and the status age in milliseconds scaled by 1e-14.
   * NOTE(review): the call that wraps these four values into a Vector is missing from this
   * excerpt.
   */
  def featurizeNumbers(statuses: Status): Vector = {
    val user = statuses.getRetweetedStatus.getUser
    val created = statuses.getRetweetedStatus.getCreatedAt
    val timeLeft = (System.currentTimeMillis - created.getTime)

      user.getFollowersCount * Math.pow(10, -12),
      user.getFavouritesCount * Math.pow(10, -12),
      user.getFriendsCount * Math.pow(10, -12),
      timeLeft * Math.pow(10, -14)

  /**
   * Combines text and numeric features into one sparse vector (text indices first, then
   * `numberFeatureIndices`) labelled with the retweet count of the retweeted status.
   * NOTE(review): the size argument of Vectors.sparse is missing from this excerpt.
   */
  def featurize(statuses: Status): LabeledPoint = {
    val textFeatures = featurizeText(statuses)
    val numberFeatures = featurizeNumbers(statuses)
    val features = Vectors.sparse(
      textFeatures.indices ++ numberFeatureIndices,
      textFeatures.values ++ numberFeatures.toArray
    LabeledPoint( statuses.getRetweetedStatus.getRetweetCount.toDouble, features )

  /** True when the retweet count of the retweeted status lies in [start, end], inclusive. */
  def retweetInterval(statuses: Status, start:Long, end:Long):Boolean = {
    val n = statuses.getRetweetedStatus.getRetweetCount
    (n >= start && n <= end)

  /** Keeps only retweets whose retweet count lies in [numRetweetBegin, numRetweetEnd]. */
  def filtrate(statuses: Status): Boolean = {
      statuses.isRetweet &&
      //statuses.getLang == "en" &&
      retweetInterval(statuses, numRetweetBegin, numRetweetEnd)
Example 122
Source File: SimpleTextClassificationPipeline.scala    From iolap   with Apache License 2.0 5 votes vote down vote up

import scala.beans.BeanInfo

import org.apache.spark.{SparkConf, SparkContext}
import{HashingTF, Tokenizer}
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.sql.{Row, SQLContext}

/** A training document: an id, its text and a numeric label. */
case class LabeledDocument(id: Long, text: String, label: Double)

/** An unlabeled document: an id and its text. */
case class Document(id: Long, text: String)

/**
 * Example that fits a tokenizer -> hashingTF -> logistic regression pipeline on four labeled
 * documents and prints predictions for four unlabeled ones.
 * NOTE(review): the stage configuration calls, the fit call and the transform call are missing
 * from this excerpt.
 */
object SimpleTextClassificationPipeline {

  def main(args: Array[String]) {
    val conf = new SparkConf().setAppName("SimpleTextClassificationPipeline")
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)
    import sqlContext.implicits._

    // Prepare training documents, which are labeled.
    val training = sc.parallelize(Seq(
      LabeledDocument(0L, "a b c d e spark", 1.0),
      LabeledDocument(1L, "b d", 0.0),
      LabeledDocument(2L, "spark f g h", 1.0),
      LabeledDocument(3L, "hadoop mapreduce", 0.0)))

    // Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr.
    val tokenizer = new Tokenizer()
    val hashingTF = new HashingTF()
    val lr = new LogisticRegression()
    val pipeline = new Pipeline()
      .setStages(Array(tokenizer, hashingTF, lr))

    // Fit the pipeline to training documents.
    val model =

    // Prepare test documents, which are unlabeled.
    val test = sc.parallelize(Seq(
      Document(4L, "spark i j k"),
      Document(5L, "l m n"),
      Document(6L, "spark hadoop spark"),
      Document(7L, "apache hadoop")))

    // Make predictions on test documents.
    // Each selected row is destructured into id, text, probability vector and prediction.
      .select("id", "text", "probability", "prediction")
      .foreach { case Row(id: Long, text: String, prob: Vector, prediction: Double) =>
        println(s"($id, $text) --> prob=$prob, prediction=$prediction")

Example 123
Source File: DatasetExample.scala    From iolap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.examples.mllib


import scopt.OptionParser

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.stat.MultivariateOnlineSummarizer
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{Row, SQLContext, DataFrame}

/**
 * Example that loads labeled points, converts them to a DataFrame, summarises the label and
 * feature columns, and reads the data back from a Parquet file.
 * NOTE(review): the option declarations, several select/save/load calls and all closing braces
 * are missing from this excerpt.
 */
object DatasetExample {

  /**
   * Command-line parameters.
   *
   * @param input path of the dataset to load
   * @param dataFormat "libsvm" or "dense"; selects the loader used in run()
   */
  case class Params(
      input: String = "data/mllib/sample_libsvm_data.txt",
      dataFormat: String = "libsvm") extends AbstractParams[Params]

  def main(args: Array[String]) {
    val defaultParams = Params()

    val parser = new OptionParser[Params]("DatasetExample") {
      head("Dataset: an example app using DataFrame as a Dataset for ML.")
        .text(s"input path to dataset")
        .action((x, c) => c.copy(input = x))
        .text("data format: libsvm (default), dense (deprecated in Spark v1.1)")
        // The data-format option must set `dataFormat`; copying into `input` would overwrite
        // the dataset path and leave the format permanently at its default.
        .action((x, c) => c.copy(dataFormat = x))
      checkConfig { params =>

    parser.parse(args, defaultParams).map { params =>
    }.getOrElse {

  def run(params: Params) {

    val conf = new SparkConf().setAppName(s"DatasetExample with $params")
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)
    import sqlContext.implicits._  // for implicit conversions

    // Load input data
    // Only "dense" and "libsvm" are matched; any other format value is not handled here.
    val origData: RDD[LabeledPoint] = params.dataFormat match {
      case "dense" => MLUtils.loadLabeledPoints(sc, params.input)
      case "libsvm" => MLUtils.loadLibSVMFile(sc, params.input)
    println(s"Loaded ${origData.count()} instances from file: ${params.input}")

    // Convert input data to DataFrame explicitly.
    val df: DataFrame = origData.toDF()
    println(s"Inferred schema:\n${df.schema.prettyJson}")
    println(s"Converted to DataFrame with ${df.count()} records")

    // Select columns
    val labelsDf: DataFrame ="label")
    val labels: RDD[Double] = { case Row(v: Double) => v }
    val numLabels = labels.count()
    // Mean label = sum of labels / number of labels.
    val meanLabel = labels.fold(0.0)(_ + _) / numLabels
    println(s"Selected label column with average value $meanLabel")

    val featuresDf: DataFrame ="features")
    val features: RDD[Vector] = { case Row(v: Vector) => v }
    // Aggregate every feature vector into one summarizer; partial summaries are merged.
    val featureSummary = features.aggregate(new MultivariateOnlineSummarizer())(
      (summary, feat) => summary.add(feat),
      (sum1, sum2) => sum1.merge(sum2))
    println(s"Selected features column with average values:\n ${featureSummary.mean.toString}")

    // The Parquet output goes to a "dataset" directory inside a fresh temporary directory.
    val tmpDir = Files.createTempDir()
    val outputDir = new File(tmpDir, "dataset").toString
    println(s"Saving to $outputDir as Parquet file.")

    println(s"Loading Parquet file with UDT from $outputDir.")
    val newDataset =

    println(s"Schema from Parquet: ${newDataset.schema.prettyJson}")
    val newFeatures ="features").map { case Row(v: Vector) => v }
    val newFeaturesSummary = newFeatures.aggregate(new MultivariateOnlineSummarizer())(
      (summary, feat) => summary.add(feat),
      (sum1, sum2) => sum1.merge(sum2))
    println(s"Selected features column with average values:\n ${newFeaturesSummary.mean.toString}")


Source File: IDF.scala    From iolap   with Apache License 2.0 5 votes vote down vote up

import org.apache.spark.annotation.Experimental
import{Identifiable, SchemaUtils}
import org.apache.spark.mllib.feature
import org.apache.spark.mllib.linalg.{Vector, VectorUDT}
import org.apache.spark.sql._
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.StructType

  /** Stores `value` in the `outputCol` param via `set`; the declared result type is `this.type`. */
  def setOutputCol(value: String): this.type = set(outputCol, value)

  /**
   * Returns `dataset` with one extra column, named by the `outputCol` param, obtained by
   * applying `idfModel.transform` to every vector in the column named by the `inputCol` param.
   *
   * @param dataset input DataFrame; the UDF below expects the input column to hold `Vector`s
   */
  override def transform(dataset: DataFrame): DataFrame = {
    // The return value is discarded; presumably this call is made for schema validation/logging
    // only -- confirm against transformSchema.
    transformSchema(dataset.schema, logging = true)
    // Wrap the model's per-vector transform so it can be used as a column expression.
    val idf = udf { vec: Vector => idfModel.transform(vec) }
    dataset.withColumn($(outputCol), idf(col($(inputCol))))

  /**
   * Builds a new `IDFModel` from this instance's `uid` and `idfModel`, then hands it to
   * `copyValues` together with `extra`.
   */
  override def copy(extra: ParamMap): IDFModel = {
    val copied = new IDFModel(uid, idfModel)
    copyValues(copied, extra)
Example 125
Source File: BinaryClassificationEvaluator.scala    From iolap   with Apache License 2.0 5 votes vote down vote up

import org.apache.spark.annotation.Experimental
import{Identifiable, SchemaUtils}
import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
import org.apache.spark.mllib.linalg.{Vector, VectorUDT}
import org.apache.spark.sql.{DataFrame, Row}
import org.apache.spark.sql.types.DoubleType

  /** Stores `value` in the `labelCol` param via `set`; the declared result type is `this.type`. */
  def setLabelCol(value: String): this.type = set(labelCol, value)

  setDefault(metricName -> "areaUnderROC")

  /**
   * Computes a binary-classification metric, chosen by the `metricName` param, for `dataset`.
   *
   * The raw-prediction column must be of type `VectorUDT` and the label column of type
   * `DoubleType`; both are checked up front with `SchemaUtils.checkColumnType`.
   *
   * Throws `IllegalArgumentException` when `metricName` is neither "areaUnderROC" nor
   * "areaUnderPR".
   */
  override def evaluate(dataset: DataFrame): Double = {
    val schema = dataset.schema
    SchemaUtils.checkColumnType(schema, $(rawPredictionCol), new VectorUDT)
    SchemaUtils.checkColumnType(schema, $(labelCol), DoubleType)

    // TODO: When dataset metadata has been implemented, check rawPredictionCol vector length = 2.
    // NOTE(review): the receiver of the column selection is missing on the next line in this
    // excerpt; presumably it selects from `dataset` -- confirm.
    val scoreAndLabels =$(rawPredictionCol), $(labelCol))
      .map { case Row(rawPrediction: Vector, label: Double) =>
        // Element 1 of the raw prediction vector is used as the score.
        (rawPrediction(1), label)
    val metrics = new BinaryClassificationMetrics(scoreAndLabels)
    // NOTE(review): the bodies of the two supported cases are not visible in this excerpt.
    val metric = $(metricName) match {
      case "areaUnderROC" =>
      case "areaUnderPR" =>
      case other =>
        throw new IllegalArgumentException(s"Does not support metric $other.")

Example 126
Source File: Normalizer.scala    From iolap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.feature

import org.apache.spark.annotation.Experimental
import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors}

  /**
   * Scales `vector` by its p-norm, computed as `Vectors.norm(vector, p)`.
   *
   * When the norm is non-zero, a cloned copy of the stored values is divided element-wise by
   * the norm; the caller's value array is not modified. Vector types other than
   * `DenseVector` and `SparseVector` raise `IllegalArgumentException`.
   */
  override def transform(vector: Vector): Vector = {
    val norm = Vectors.norm(vector, p)

    if (norm != 0.0) {
      // For dense vector, we've to allocate new memory for new output vector.
      // However, for sparse vector, the `index` array will not be changed,
      // so we can re-use it to save memory.
      vector match {
        case DenseVector(vs) =>
          val values = vs.clone()
          val size = values.size
          var i = 0
          while (i < size) {
            values(i) /= norm
            i += 1
          // NOTE(review): the expression that builds the dense result is not visible here.
        case SparseVector(size, ids, vs) =>
          val values = vs.clone()
          val nnz = values.size
          var i = 0
          while (i < nnz) {
            values(i) /= norm
            i += 1
          // The original index array `ids` is reused; only the values are new.
          Vectors.sparse(size, ids, values)
        case v => throw new IllegalArgumentException("Do not support vector type " + v.getClass)
    } else {
      // Since the norm is zero, return the input vector object itself.
      // Note that it's safe since we always assume that the data in RDD
Example 127
Source File: PearsonCorrelation.scala    From iolap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.stat.correlation

import breeze.linalg.{DenseMatrix => BDM}

import org.apache.spark.Logging
import org.apache.spark.mllib.linalg.{Matrices, Matrix, Vector}
import org.apache.spark.mllib.linalg.distributed.RowMatrix
import org.apache.spark.rdd.RDD

  /**
   * Turns a covariance matrix into a correlation matrix by mutating the Breeze matrix `cov`
   * obtained from `covarianceMatrix.toBreeze`.
   *
   * Visible steps:
   *  1. each diagonal entry is replaced by its square root, or by 0.0 when `closeToZero`
   *     reports it as negligible;
   *  2. every entry above the diagonal is divided by the product of the two matching diagonal
   *     values and the result is mirrored below the diagonal;
   *  3. the diagonal is reset to 1.0.
   * A warning is logged if any pair involved a zero diagonal value.
   *
   * @param covarianceMatrix matrix whose Breeze form is cast to `DenseMatrix[Double]`
   */
  def computeCorrelationMatrixFromCovariance(covarianceMatrix: Matrix): Matrix = {
    val cov = covarianceMatrix.toBreeze.asInstanceOf[BDM[Double]]
    val n = cov.cols

    // Compute the standard deviation on the diagonals first
    var i = 0
    while (i < n) {
      // TODO remove once covariance numerical issue resolved.
      cov(i, i) = if (closeToZero(cov(i, i))) 0.0 else math.sqrt(cov(i, i))
      i +=1

    // Loop through columns since cov is column major
    var j = 0
    var sigma = 0.0
    var containNaN = false
    while (j < n) {
      sigma = cov(j, j)
      i = 0
      // Only rows above the diagonal (i < j) are visited; the symmetric entry is filled below.
      while (i < j) {
        // NOTE(review): as shown, the zero-deviation branch only sets the flag and yields no
        // numeric value; the NaN that the warning below refers to is not visible in this
        // excerpt -- confirm against the upstream file.
        val corr = if (sigma == 0.0 || cov(i, i) == 0.0) {
          containNaN = true
        } else {
          cov(i, j) / (sigma * cov(i, i))
        cov(i, j) = corr
        cov(j, i) = corr
        i += 1
      j += 1

    // put 1.0 on the diagonals
    i = 0
    while (i < n) {
      cov(i, i) = 1.0
      i +=1

    if (containNaN) {
      logWarning("Pearson correlation matrix contains NaN values.")
      // NOTE(review): no result expression converting `cov` back to a Matrix is visible here.


  /**
   * Returns true when the absolute value of `value` is less than or equal to `threshold`.
   *
   * @param value     number to test
   * @param threshold inclusive upper bound on `|value|`; defaults to 1e-12
   */
  private def closeToZero(value: Double, threshold: Double = 1e-12): Boolean = {
    math.abs(value) <= threshold
Source File: SpearmanCorrelation.scala    From iolap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.stat.correlation

import scala.collection.mutable.ArrayBuffer

import org.apache.spark.Logging
import org.apache.spark.SparkContext._
import org.apache.spark.mllib.linalg.{Matrix, Vector, Vectors}
import org.apache.spark.rdd.RDD

  /**
   * Rank-based correlation over the rows of `X`.
   *
   * Visible pipeline: tag each row with a unique id, key every cell by (columnIndex, value),
   * sort globally by that key, attach a global index as the rank, replace runs of equal
   * (column, value) keys by their average rank, and regroup the ranks by row id.
   *
   * NOTE(review): several expressions are missing from this excerpt (the per-row expansion, the
   * body of `flush`, the result of the `flatMap` branches and everything after the grouping),
   * so the final correlation step cannot be described from here.
   */
  override def computeCorrelationMatrix(X: RDD[Vector]): Matrix = {
    // ((columnIndex, value), rowUid)
    val colBased = X.zipWithUniqueId().flatMap { case (vec, uid) => { case (v, j) =>
        ((j, v), uid)
    // global sort by (columnIndex, value)
    val sorted = colBased.sortByKey()
    // assign global ranks (using average ranks for tied values)
    val globalRanks = sorted.zipWithIndex().mapPartitions { iter =>
      // Per-partition running state: the (column, value) key of the current run of ties, the
      // rank at which that run started, and the row ids collected for it so far.
      var preCol = -1
      var preVal = Double.NaN
      var startRank = -1.0
      var cachedUids = ArrayBuffer.empty[Long]
      val flush: () => Iterable[(Long, (Int, Double))] = () => {
        // Midpoint of the ranks startRank .. startRank + size - 1.
        val averageRank = startRank + (cachedUids.size - 1) / 2.0
        val output = { uid =>
          (uid, (preCol, averageRank))
      iter.flatMap { case (((j, v), uid), rank) =>
        // If we see a new value or cachedUids is too big, we flush ids with their average rank.
        if (j != preCol || v != preVal || cachedUids.size >= 10000000) {
          val output = flush()
          preCol = j
          preVal = v
          startRank = rank
          cachedUids += uid
        } else {
          cachedUids += uid
      } ++ flush()
    // Replace values in the input matrix by their ranks compared with values in the same column.
    // Note that shifting all ranks in a column by a constant value doesn't affect result.
    val groupedRanks = globalRanks.groupByKey().map { case (uid, iter) =>
      // sort by column index and then convert values to a vector
Source File: GLMClassificationModel.scala    From iolap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.classification.impl

import org.json4s.JsonDSL._
import org.json4s.jackson.JsonMethods._

import org.apache.spark.SparkContext
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.mllib.util.Loader
import org.apache.spark.sql.{Row, SQLContext}

    /**
     * Loads a classification model's `Data` (weights, intercept, threshold) stored under `path`.
     *
     * Exactly one row with three columns is expected; both expectations are enforced with
     * `assert`, using a message that names `modelClass` and the resolved data path.
     *
     * @param sc         context used to create the `SQLContext`
     * @param path       model location, resolved to the data location by `Loader.dataPath`
     * @param modelClass name used only in the assertion messages
     */
    def loadData(sc: SparkContext, path: String, modelClass: String): Data = {
      val datapath = Loader.dataPath(path)
      val sqlContext = new SQLContext(sc)
      // NOTE(review): the read expression and the receiver of the column selection are missing
      // from the next two lines in this excerpt.
      val dataRDD =
      val dataArray ="weights", "intercept", "threshold").take(1)
      assert(dataArray.size == 1, s"Unable to load $modelClass data from: $datapath")
      val data = dataArray(0)
      assert(data.size == 3, s"Unable to load $modelClass data from: $datapath")
      // The third column (threshold) is ignored here and handled separately below.
      val (weights, intercept) = data match {
        case Row(weights: Vector, intercept: Double, _) =>
          (weights, intercept)
      // The threshold column may be null; NOTE(review): the values chosen in the two branches
      // are not visible in this excerpt.
      val threshold = if (data.isNullAt(2)) {
      } else {
      Data(weights, intercept, threshold)

Source File: LabeledPoint.scala    From iolap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.regression

import scala.beans.BeanInfo

import org.apache.spark.mllib.linalg.{Vectors, Vector}
import org.apache.spark.mllib.util.NumericParser
import org.apache.spark.SparkException

  /**
   * Parses a `LabeledPoint` from its string form.
   *
   * Two input shapes are handled:
   *  - strings starting with "(" go through `NumericParser.parse`, which must yield a
   *    two-element sequence: a `Double` label followed by a value that
   *    `Vectors.parseNumeric` can turn into the feature vector;
   *  - any other string is treated as the older dense format: the label, a comma, then
   *    space-separated feature values.
   *
   * Throws `SparkException` when the parenthesised form does not have the expected shape.
   */
  def parse(s: String): LabeledPoint = {
    if (s.startsWith("(")) {
      NumericParser.parse(s) match {
        case Seq(label: Double, numeric: Any) =>
          LabeledPoint(label, Vectors.parseNumeric(numeric))
        case other =>
          throw new SparkException(s"Cannot parse $other.")
    } else { // dense format used before v1.0
      val parts = s.split(',')
      val label = java.lang.Double.parseDouble(parts(0))
      // NOTE(review): input without a comma makes `parts(1)` fail with an index error rather
      // than a SparkException -- confirm whether callers rely on that.
      val features = Vectors.dense(parts(1).trim().split(' ').map(java.lang.Double.parseDouble))
      LabeledPoint(label, features)
Example 131
Source File: GLMRegressionModel.scala    From iolap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.regression.impl

import org.json4s.JsonDSL._
import org.json4s.jackson.JsonMethods._

import org.apache.spark.SparkContext
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.mllib.util.Loader
import org.apache.spark.sql.{DataFrame, Row, SQLContext}

    /**
     * Loads a regression model's `Data` (weights, intercept) stored under `path`.
     *
     * Exactly one row with two columns is expected, and the weight vector must have
     * `numFeatures` entries; all three expectations are enforced with `assert`.
     *
     * @param sc          context used to create the `SQLContext`
     * @param path        model location, resolved to the data location by `Loader.dataPath`
     * @param modelClass  name used only in the assertion messages
     * @param numFeatures expected size of the loaded weight vector
     */
    def loadData(sc: SparkContext, path: String, modelClass: String, numFeatures: Int): Data = {
      val datapath = Loader.dataPath(path)
      val sqlContext = new SQLContext(sc)
      // NOTE(review): the read expression and the receiver of the column selection are missing
      // from the next two lines in this excerpt.
      val dataRDD =
      val dataArray ="weights", "intercept").take(1)
      assert(dataArray.size == 1, s"Unable to load $modelClass data from: $datapath")
      val data = dataArray(0)
      assert(data.size == 2, s"Unable to load $modelClass data from: $datapath")
      data match {
        case Row(weights: Vector, intercept: Double) =>
          assert(weights.size == numFeatures, s"Expected $numFeatures features, but" +
            s" found ${weights.size} features when loading $modelClass weights from $datapath")
          Data(weights, intercept)

Source File: OneHotEncoderSuite.scala    From iolap   with Apache License 2.0 5 votes vote down vote up

import org.apache.spark.SparkFunSuite
import{AttributeGroup, BinaryAttribute, NominalAttribute}
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions.col

class OneHotEncoderSuite extends SparkFunSuite with MLlibTestSparkContext {

  def stringIndexed(): DataFrame = {
    val data = sc.parallelize(Seq((0, "a"), (1, "b"), (2, "c"), (3, "a"), (4, "a"), (5, "c")), 2)
    val df = sqlContext.createDataFrame(data).toDF("id", "label")
    val indexer = new StringIndexer()

  test("params") {
    ParamsSuite.checkParams(new OneHotEncoder)

  test("OneHotEncoder dropLast = false") {
    val transformed = stringIndexed()
    val encoder = new OneHotEncoder()
    val encoded = encoder.transform(transformed)

    val output ="id", "labelVec").map { r =>
      val vec = r.getAs[Vector](1)
      (r.getInt(0), vec(0), vec(1), vec(2))
    // a -> 0, b -> 2, c -> 1
    val expected = Set((0, 1.0, 0.0, 0.0), (1, 0.0, 0.0, 1.0), (2, 0.0, 1.0, 0.0),
      (3, 1.0, 0.0, 0.0), (4, 1.0, 0.0, 0.0), (5, 0.0, 1.0, 0.0))
    assert(output === expected)

  test("OneHotEncoder dropLast = true") {
    val transformed = stringIndexed()
    val encoder = new OneHotEncoder()
    val encoded = encoder.transform(transformed)

    val output ="id", "labelVec").map { r =>
      val vec = r.getAs[Vector](1)
      (r.getInt(0), vec(0), vec(1))
    // a -> 0, b -> 2, c -> 1
    val expected = Set((0, 1.0, 0.0), (1, 0.0, 0.0), (2, 0.0, 1.0),
      (3, 1.0, 0.0), (4, 1.0, 0.0), (5, 0.0, 1.0))
    assert(output === expected)

  // Checks the attribute metadata of the "encoded" output column when the input column "size"
  // carries a nominal attribute with values small/medium/large: two binary attributes are
  // expected, named after the first two values.
  // NOTE(review): the encoder's configuration calls are not visible in this excerpt.
  test("input column with ML attribute") {
    val attr = NominalAttribute.defaultAttr.withValues("small", "medium", "large")
    val df = sqlContext.createDataFrame(Seq(0.0, 1.0, 2.0, 1.0).map(Tuple1.apply)).toDF("size")
      .select(col("size").as("size", attr.toMetadata()))
    val encoder = new OneHotEncoder()
    val output = encoder.transform(df)
    val group = AttributeGroup.fromStructField(output.schema("encoded"))
    assert(group.size === 2)
    assert(group.getAttr(0) === BinaryAttribute.defaultAttr.withName("size_is_small").withIndex(0))
    assert(group.getAttr(1) === BinaryAttribute.defaultAttr.withName("size_is_medium").withIndex(1))

  // Same check for an input column "index" that has no attribute metadata: the two expected
  // binary attributes are named after the numeric category values 0 and 1.
  // NOTE(review): the encoder's configuration calls are not visible in this excerpt.
  test("input column without ML attribute") {
    val df = sqlContext.createDataFrame(Seq(0.0, 1.0, 2.0, 1.0).map(Tuple1.apply)).toDF("index")
    val encoder = new OneHotEncoder()
    val output = encoder.transform(df)
    val group = AttributeGroup.fromStructField(output.schema("encoded"))
    assert(group.size === 2)
    assert(group.getAttr(0) === BinaryAttribute.defaultAttr.withName("index_is_0").withIndex(0))
    assert(group.getAttr(1) === BinaryAttribute.defaultAttr.withName("index_is_1").withIndex(1))
Source File: Word2VecSuite.scala    From iolap   with Apache License 2.0 5 votes vote down vote up

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.linalg.{Vector, Vectors}
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.mllib.util.TestingUtils._
import org.apache.spark.sql.{Row, SQLContext}
import org.apache.spark.mllib.feature.{Word2VecModel => OldWord2VecModel}

class Word2VecSuite extends SparkFunSuite with MLlibTestSparkContext {

  test("params") {
    ParamsSuite.checkParams(new Word2Vec)
    val model = new Word2VecModel("w2v", new OldWord2VecModel(Map("a" -> Array(0.0f))))

  test("Word2Vec") {
    val sqlContext = new SQLContext(sc)
    import sqlContext.implicits._

    val sentence = "a b " * 100 + "a c " * 10
    val numOfWords = sentence.split(" ").size
    val doc = sc.parallelize(Seq(sentence, sentence)).map(line => line.split(" "))

    val codes = Map(
      "a" -> Array(-0.2811822295188904, -0.6356269121170044, -0.3020961284637451),
      "b" -> Array(1.0309048891067505, -1.29472815990448, 0.22276712954044342),
      "c" -> Array(-0.08456747233867645, 0.5137411952018738, 0.11731560528278351)

    val expected = { sentence =>
      Vectors.dense(, word2) => { case (v1, v2) => v1 + v2 }
      ).map(_ / numOfWords))

    val docDF ="text", "expected")

    val model = new Word2Vec()

    model.transform(docDF).select("result", "expected").collect().foreach {
      case Row(vector1: Vector, vector2: Vector) =>
        assert(vector1 ~== vector2 absTol 1E-5, "Transformed vector is different with expected.")
Example 134
import org.apache.spark.{SparkException, SparkFunSuite}
import{AttributeGroup, NominalAttribute, NumericAttribute}
import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors}
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.sql.Row
import org.apache.spark.sql.functions.col

class VectorAssemblerSuite extends SparkFunSuite with MLlibTestSparkContext {

  test("params") {
    ParamsSuite.checkParams(new VectorAssembler)

  // Exercises `assemble` directly: doubles and vectors passed as arguments are expected to be
  // concatenated in argument order, and the results are compared against sparse vectors.
  test("assemble") {
    assert(assemble(0.0) === Vectors.sparse(1, Array.empty, Array.empty))
    assert(assemble(0.0, 1.0) === Vectors.sparse(2, Array(1), Array(1.0)))
    val dv = Vectors.dense(2.0, 0.0)
    assert(assemble(0.0, dv, 1.0) === Vectors.sparse(4, Array(1, 3), Array(2.0, 1.0)))
    val sv = Vectors.sparse(2, Array(0, 1), Array(3.0, 4.0))
    assert(assemble(0.0, dv, 1.0, sv) ===
      Vectors.sparse(6, Array(1, 3, 4, 5), Array(2.0, 1.0, 3.0, 4.0)))
    // An Int, a String and null are each expected to be rejected with a SparkException.
    for (v <- Seq(1, "a", null)) {
      intercept[SparkException](assemble(1.0, v))

  test("assemble should compress vectors") {
    val v1 = assemble(0.0, 0.0, 0.0, Vectors.dense(4.0))
    val v2 = assemble(1.0, 2.0, 3.0, Vectors.sparse(1, Array(0), Array(4.0)))

  // End-to-end check on a one-row DataFrame: columns x (double), y (dense vector),
  // z (sparse vector) and n (long) are assembled, and the "features" column is expected to be
  // the 6-element sparse vector below. The "id" and "name" columns are not inputs.
  // NOTE(review): the call that names the output column is not visible in this excerpt.
  test("VectorAssembler") {
    val df = sqlContext.createDataFrame(Seq(
      (0, 0.0, Vectors.dense(1.0, 2.0), "a", Vectors.sparse(2, Array(1), Array(3.0)), 10L)
    )).toDF("id", "x", "y", "name", "z", "n")
    val assembler = new VectorAssembler()
      .setInputCols(Array("x", "y", "z", "n"))
    assembler.transform(df).select("features").collect().foreach {
      case Row(v: Vector) =>
        assert(v === Vectors.sparse(6, Array(1, 2, 4, 5), Array(1.0, 2.0, 3.0, 10.0)))

  test("ML attributes") {
    val browser = NominalAttribute.defaultAttr.withValues("chrome", "firefox", "safari")
    val hour = NumericAttribute.defaultAttr.withMin(0.0).withMax(24.0)
    val user = new AttributeGroup("user", Array(
      NominalAttribute.defaultAttr.withName("gender").withValues("male", "female"),
    val row = (1.0, 0.5, 1, Vectors.dense(1.0, 1000.0), Vectors.sparse(2, Array(1), Array(2.0)))
    val df = sqlContext.createDataFrame(Seq(row)).toDF("browser", "hour", "count", "user", "ad")
        col("browser").as("browser", browser.toMetadata()),
        col("hour").as("hour", hour.toMetadata()),
        col("count"), // "count" is an integer column without ML attribute
        col("user").as("user", user.toMetadata()),
        col("ad")) // "ad" is a vector column without ML attribute
    val assembler = new VectorAssembler()
      .setInputCols(Array("browser", "hour", "count", "user", "ad"))
    val output = assembler.transform(df)
    val schema = output.schema
    val features = AttributeGroup.fromStructField(schema("features"))
    assert(features.size === 7)
    val browserOut = features.getAttr(0)
    assert(browserOut === browser.withIndex(0).withName("browser"))
    val hourOut = features.getAttr(1)
    assert(hourOut === hour.withIndex(1).withName("hour"))
    val countOut = features.getAttr(2)
    assert(countOut === NumericAttribute.defaultAttr.withName("count").withIndex(2))
    val userGenderOut = features.getAttr(3)
    assert(userGenderOut === user.getAttr("gender").withName("user_gender").withIndex(3))
    val userSalaryOut = features.getAttr(4)
    assert(userSalaryOut === user.getAttr("salary").withName("user_salary").withIndex(4))
    assert(features.getAttr(5) === NumericAttribute.defaultAttr.withIndex(5))
    assert(features.getAttr(6) === NumericAttribute.defaultAttr.withIndex(6))
Source File: PolynomialExpansionSuite.scala    From iolap   with Apache License 2.0 5 votes vote down vote up

import org.scalatest.exceptions.TestFailedException

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors}
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.mllib.util.TestingUtils._
import org.apache.spark.sql.Row

class PolynomialExpansionSuite extends SparkFunSuite with MLlibTestSparkContext {

  test("params") {
    ParamsSuite.checkParams(new PolynomialExpansion)

  test("Polynomial expansion with default parameter") {
    val data = Array(
      Vectors.sparse(3, Seq((0, -2.0), (1, 2.3))),
      Vectors.dense(-2.0, 2.3),
      Vectors.dense(0.0, 0.0, 0.0),
      Vectors.dense(0.6, -1.1, -3.0),
      Vectors.sparse(3, Seq())

    val twoDegreeExpansion: Array[Vector] = Array(
      Vectors.sparse(9, Array(0, 1, 2, 3, 4), Array(-2.0, 4.0, 2.3, -4.6, 5.29)),
      Vectors.dense(-2.0, 4.0, 2.3, -4.6, 5.29),
      Vectors.dense(new Array[Double](9)),
      Vectors.dense(0.6, 0.36, -1.1, -0.66, 1.21, -3.0, -1.8, 3.3, 9.0),
      Vectors.sparse(9, Array.empty, Array.empty))

    val df = sqlContext.createDataFrame("features", "expected")

    val polynomialExpansion = new PolynomialExpansion()

    polynomialExpansion.transform(df).select("polyFeatures", "expected").collect().foreach {
      case Row(expanded: DenseVector, expected: DenseVector) =>
        assert(expanded ~== expected absTol 1e-1)
      case Row(expanded: SparseVector, expected: SparseVector) =>
        assert(expanded ~== expected absTol 1e-1)
      case _ =>
        throw new TestFailedException("Unmatched data types after polynomial expansion", 0)

  test("Polynomial expansion with setter") {
    val data = Array(
      Vectors.sparse(3, Seq((0, -2.0), (1, 2.3))),
      Vectors.dense(-2.0, 2.3),
      Vectors.dense(0.0, 0.0, 0.0),
      Vectors.dense(0.6, -1.1, -3.0),
      Vectors.sparse(3, Seq())

    val threeDegreeExpansion: Array[Vector] = Array(
      Vectors.sparse(19, Array(0, 1, 2, 3, 4, 5, 6, 7, 8),
        Array(-2.0, 4.0, -8.0, 2.3, -4.6, 9.2, 5.29, -10.58, 12.17)),
      Vectors.dense(-2.0, 4.0, -8.0, 2.3, -4.6, 9.2, 5.29, -10.58, 12.17),
      Vectors.dense(new Array[Double](19)),
      Vectors.dense(0.6, 0.36, 0.216, -1.1, -0.66, -0.396, 1.21, 0.726, -1.331, -3.0, -1.8,
        -1.08, 3.3, 1.98, -3.63, 9.0, 5.4, -9.9, -27.0),
      Vectors.sparse(19, Array.empty, Array.empty))

    val df = sqlContext.createDataFrame("features", "expected")

    val polynomialExpansion = new PolynomialExpansion()

    polynomialExpansion.transform(df).select("polyFeatures", "expected").collect().foreach {
      case Row(expanded: DenseVector, expected: DenseVector) =>
        assert(expanded ~== expected absTol 1e-1)
      case Row(expanded: SparseVector, expected: SparseVector) =>
        assert(expanded ~== expected absTol 1e-1)
      case _ =>
        throw new TestFailedException("Unmatched data types after polynomial expansion", 0)
Source File: IDFSuite.scala    From iolap   with Apache License 2.0 5 votes vote down vote up

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.feature.{IDFModel => OldIDFModel}
import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors}
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.mllib.util.TestingUtils._
import org.apache.spark.sql.Row

class IDFSuite extends SparkFunSuite with MLlibTestSparkContext {

  /**
   * Test helper that builds the expected output for the IDF tests from `dataSet` and the
   * per-feature weights in `model`.
   *
   * The sparse branch multiplies each stored value by `model(id)` for its index and rebuilds
   * a sparse vector of the same size; the dense branch multiplies pairs `(x, y)`.
   *
   * NOTE(review): the mapping over `dataSet`, the source of the `(x, y)` / `(id, value)` pairs
   * and the dense result expression are not visible in this excerpt.
   */
  def scaleDataWithIDF(dataSet: Array[Vector], model: Vector): Array[Vector] = { {
      case data: DenseVector =>
        val res = { case (x, y) => x * y }
      case data: SparseVector =>
        val res = { case (id, value) =>
          (id, value * model(id))
        Vectors.sparse(data.size, res)

  test("params") {
    ParamsSuite.checkParams(new IDF)
    val model = new IDFModel("idf", new OldIDFModel(Vectors.dense(1.0)))

  test("compute IDF with default parameter") {
    val numOfFeatures = 4
    val data = Array(
      Vectors.sparse(numOfFeatures, Array(1, 3), Array(1.0, 2.0)),
      Vectors.dense(0.0, 1.0, 2.0, 3.0),
      Vectors.sparse(numOfFeatures, Array(1), Array(1.0))
    val numOfData = data.size
    val idf = Vectors.dense(Array(0, 3, 1, 2).map { x =>
      math.log((numOfData + 1.0) / (x + 1.0))
    val expected = scaleDataWithIDF(data, idf)

    val df = sqlContext.createDataFrame("features", "expected")

    val idfModel = new IDF()

    idfModel.transform(df).select("idfValue", "expected").collect().foreach {
      case Row(x: Vector, y: Vector) =>
        assert(x ~== y absTol 1e-5, "Transformed vector is different with expected vector.")

  test("compute IDF with setter") {
    val numOfFeatures = 4
    val data = Array(
      Vectors.sparse(numOfFeatures, Array(1, 3), Array(1.0, 2.0)),
      Vectors.dense(0.0, 1.0, 2.0, 3.0),
      Vectors.sparse(numOfFeatures, Array(1), Array(1.0))
    val numOfData = data.size
    val idf = Vectors.dense(Array(0, 3, 1, 2).map { x =>
      if (x > 0) math.log((numOfData + 1.0) / (x + 1.0)) else 0
    val expected = scaleDataWithIDF(data, idf)

    val df = sqlContext.createDataFrame("features", "expected")

    val idfModel = new IDF()

    idfModel.transform(df).select("idfValue", "expected").collect().foreach {
      case Row(x: Vector, y: Vector) =>
        assert(x ~== y absTol 1e-5, "Transformed vector is different with expected vector.")
Source File: NormalizerSuite.scala    From iolap   with Apache License 2.0 5 votes vote down vote up

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors}
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.mllib.util.TestingUtils._
import org.apache.spark.sql.{DataFrame, Row, SQLContext}

class NormalizerSuite extends SparkFunSuite with MLlibTestSparkContext {

  @transient var data: Array[Vector] = _
  @transient var dataFrame: DataFrame = _
  @transient var normalizer: Normalizer = _
  @transient var l1Normalized: Array[Vector] = _
  @transient var l2Normalized: Array[Vector] = _

  override def beforeAll(): Unit = {

    data = Array(
      Vectors.sparse(3, Seq((0, -2.0), (1, 2.3))),
      Vectors.dense(0.0, 0.0, 0.0),
      Vectors.dense(0.6, -1.1, -3.0),
      Vectors.sparse(3, Seq((1, 0.91), (2, 3.2))),
      Vectors.sparse(3, Seq((0, 5.7), (1, 0.72), (2, 2.7))),
      Vectors.sparse(3, Seq())
    l1Normalized = Array(
      Vectors.sparse(3, Seq((0, -0.465116279), (1, 0.53488372))),
      Vectors.dense(0.0, 0.0, 0.0),
      Vectors.dense(0.12765957, -0.23404255, -0.63829787),
      Vectors.sparse(3, Seq((1, 0.22141119), (2, 0.7785888))),
      Vectors.dense(0.625, 0.07894737, 0.29605263),
      Vectors.sparse(3, Seq())
    l2Normalized = Array(
      Vectors.sparse(3, Seq((0, -0.65617871), (1, 0.75460552))),
      Vectors.dense(0.0, 0.0, 0.0),
      Vectors.dense(0.184549876, -0.3383414, -0.922749378),
      Vectors.sparse(3, Seq((1, 0.27352993), (2, 0.96186349))),
      Vectors.dense(0.897906166, 0.113419726, 0.42532397),
      Vectors.sparse(3, Seq())

    val sqlContext = new SQLContext(sc)
    dataFrame = sqlContext.createDataFrame(sc.parallelize(data, 2).map(NormalizerSuite.FeatureData))
    normalizer = new Normalizer()

  /**
   * Collects the "normalized_features" column into a local array of vectors.
   *
   * NOTE(review): the receiver of the column selection is missing in this excerpt; presumably
   * it selects from `result` -- confirm.
   */
  def collectResult(result: DataFrame): Array[Vector] = {"normalized_features").collect().map {
      case Row(features: Vector) => features

  /**
   * Asserts that paired vectors have the same representation: both dense or both sparse.
   * Any other combination makes the predicate false and fails with the message below.
   *
   * NOTE(review): the opening of the assertion (how `lhs` and `rhs` are paired) is not visible
   * in this excerpt.
   */
  def assertTypeOfVector(lhs: Array[Vector], rhs: Array[Vector]): Unit = {
      case (v1: DenseVector, v2: DenseVector) => true
      case (v1: SparseVector, v2: SparseVector) => true
      case _ => false
    }, "The vector type should be preserved after normalization.")

  /**
   * Asserts that vectors at the same position in `lhs` and `rhs` agree under `~==` with an
   * absolute tolerance of 1E-5. The array lengths themselves are not compared here.
   */
  def assertValues(lhs: Array[Vector], rhs: Array[Vector]): Unit = {
    assert((lhs, rhs).zipped.forall { (vector1, vector2) =>
      vector1 ~== vector2 absTol 1E-5
    }, "The vector value is not correct after normalization.")

  // With the normalizer as configured in beforeAll, the output must keep each vector's
  // dense/sparse representation and match the precomputed `l2Normalized` values.
  test("Normalization with default parameter") {
    val result = collectResult(normalizer.transform(dataFrame))

    assertTypeOfVector(data, result)

    assertValues(result, l2Normalized)

  // Same checks against the precomputed `l1Normalized` values.
  // NOTE(review): the setter call the test name refers to is not visible in this excerpt.
  test("Normalization with setter") {

    val result = collectResult(normalizer.transform(dataFrame))

    assertTypeOfVector(data, result)

    assertValues(result, l1Normalized)

// Companion holding the row type used by the suite to build its input DataFrame.
private object NormalizerSuite {
  /** Single-column record wrapping one feature vector. */
  case class FeatureData(features: Vector)
Example 138
Source File: HashingTFSuite.scala    From iolap   with Apache License 2.0 5 votes vote down vote up

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.linalg.{Vector, Vectors}
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.mllib.util.TestingUtils._
import org.apache.spark.util.Utils

  test("params") {
    ParamsSuite.checkParams(new HashingTF)

  // Transforms one document ("a a b b c d") and checks two things: the "features" column
  // metadata reports `n` attributes, and the resulting vector holds the term counts at the
  // indices produced by `idx`.
  // NOTE(review): the HashingTF configuration calls and the receiver of the "features"
  // selection are not visible in this excerpt.
  test("hashingTF") {
    val df = sqlContext.createDataFrame(Seq(
      (0, "a a b b c d".split(" ").toSeq)
    )).toDF("id", "words")
    val n = 100
    val hashingTF = new HashingTF()
    val output = hashingTF.transform(df)
    val attrGroup = AttributeGroup.fromStructField(output.schema("features"))
    require(attrGroup.numAttributes === Some(n))
    val features ="features").first().getAs[Vector](0)
    // Assume perfect hash on "a", "b", "c", and "d".
    // Expected bucket for a term: its `##` hash reduced to a non-negative value modulo n.
    def idx(any: Any): Int = Utils.nonNegativeMod(any.##, n)
    val expected = Vectors.sparse(n,
      Seq((idx("a"), 2.0), (idx("b"), 2.0), (idx("c"), 1.0), (idx("d"), 1.0)))
    assert(features ~== expected absTol 1e-14)
Source File: IDFSuite.scala    From iolap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.feature

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vectors, Vector}
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.mllib.util.TestingUtils._

class IDFSuite extends SparkFunSuite with MLlibTestSparkContext {

  test("idf") {
    val n = 4
    val localTermFrequencies = Seq(
      Vectors.sparse(n, Array(1, 3), Array(1.0, 2.0)),
      Vectors.dense(0.0, 1.0, 2.0, 3.0),
      Vectors.sparse(n, Array(1), Array(1.0))
    val m = localTermFrequencies.size
    val termFrequencies = sc.parallelize(localTermFrequencies, 2)
    val idf = new IDF
    val model =
    val expected = Vectors.dense(Array(0, 3, 1, 2).map { x =>
      math.log((m + 1.0) / (x + 1.0))
    assert(model.idf ~== expected absTol 1e-12)

    val assertHelper = (tfidf: Array[Vector]) => {
      assert(tfidf.size === 3)
      val tfidf0 = tfidf(0).asInstanceOf[SparseVector]
      assert(tfidf0.indices === Array(1, 3))
      assert(Vectors.dense(tfidf0.values) ~==
          Vectors.dense(1.0 * expected(1), 2.0 * expected(3)) absTol 1e-12)
      val tfidf1 = tfidf(1).asInstanceOf[DenseVector]
      assert(Vectors.dense(tfidf1.values) ~==
          Vectors.dense(0.0, 1.0 * expected(1), 2.0 * expected(2), 3.0 * expected(3)) absTol 1e-12)
      val tfidf2 = tfidf(2).asInstanceOf[SparseVector]
      assert(tfidf2.indices === Array(1))
      assert(tfidf2.values(0) ~== (1.0 * expected(1)) absTol 1e-12)
    // Transforms a RDD
    val tfidf = model.transform(termFrequencies).collect()
    // Transforms local vectors
    val localTfidf =

  test("idf minimum document frequency filtering") {
    val n = 4
    val localTermFrequencies = Seq(
      Vectors.sparse(n, Array(1, 3), Array(1.0, 2.0)),
      Vectors.dense(0.0, 1.0, 2.0, 3.0),
      Vectors.sparse(n, Array(1), Array(1.0))
    val m = localTermFrequencies.size
    val termFrequencies = sc.parallelize(localTermFrequencies, 2)
    val idf = new IDF(minDocFreq = 1)
    val model =
    val expected = Vectors.dense(Array(0, 3, 1, 2).map { x =>
      if (x > 0) {
        math.log((m + 1.0) / (x + 1.0))
      } else {
    assert(model.idf ~== expected absTol 1e-12)

    val assertHelper = (tfidf: Array[Vector]) => {
      assert(tfidf.size === 3)
      val tfidf0 = tfidf(0).asInstanceOf[SparseVector]
      assert(tfidf0.indices === Array(1, 3))
      assert(Vectors.dense(tfidf0.values) ~==
          Vectors.dense(1.0 * expected(1), 2.0 * expected(3)) absTol 1e-12)
      val tfidf1 = tfidf(1).asInstanceOf[DenseVector]
      assert(Vectors.dense(tfidf1.values) ~==
          Vectors.dense(0.0, 1.0 * expected(1), 2.0 * expected(2), 3.0 * expected(3)) absTol 1e-12)
      val tfidf2 = tfidf(2).asInstanceOf[SparseVector]
      assert(tfidf2.indices === Array(1))
      assert(tfidf2.values(0) ~== (1.0 * expected(1)) absTol 1e-12)
    // Transforms a RDD
    val tfidf = model.transform(termFrequencies).collect()
    // Transforms local vectors
    val localTfidf =

Source File: SparkXGBoostClassifierSuite.scala    From sparkxgboost   with Apache License 2.0 5 votes vote down vote up
package rotationsymmetry.sxgboost

import org.apache.spark.mllib.linalg.{Vectors, Vector}
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.sql.functions.udf
import org.scalatest.FunSuite
import rotationsymmetry.sxgboost.loss.LogisticLoss
import rotationsymmetry.sxgboost.utils.MLlibTestSparkContext

class SparkXGBoostClassifierSuite extends FunSuite with TestData with MLlibTestSparkContext {

  test("test with simple data") {
    val rawdata = Seq(
      LabeledPoint(0, Vectors.dense(0.0, 0.0)),
      LabeledPoint(0, Vectors.dense(0.0, 0.0)),
      LabeledPoint(1, Vectors.dense(0.0, 0.0)),

      LabeledPoint(1, Vectors.dense(1.0, 0.0)),
      LabeledPoint(1, Vectors.dense(1.0, 0.0)),
      LabeledPoint(0, Vectors.dense(1.0, 0.0)),

      LabeledPoint(1, Vectors.dense(0.0, 1.0)),
      LabeledPoint(1, Vectors.dense(0.0, 1.0)),
      LabeledPoint(0, Vectors.dense(0.0, 1.0)),

      LabeledPoint(0, Vectors.dense(1.0, 1.0)),
      LabeledPoint(0, Vectors.dense(1.0, 1.0)),
      LabeledPoint(1, Vectors.dense(1.0, 1.0))

    val data = sqlContext.createDataFrame(sc.parallelize(rawdata, 2))

    val truthUDF = udf { feature: Vector =>
      if (feature(0) == feature(1))

    val dataWithTruth = data.withColumn("truth", truthUDF(data("features")))

    val featureIndexer = new VectorIndexer()

    val sparkXGBoostClassifier = new SparkXGBoostClassifier(new LogisticLoss)
    val sparkXGBoostPipeline = new Pipeline()
      .setStages(Array(featureIndexer, sparkXGBoostClassifier))
    val sXGBoostModel =

    val evaluator = new MulticlassClassificationEvaluator()

    val precision = evaluator.evaluate(sXGBoostModel.transform(dataWithTruth))

    assert(precision === 1.0)
Source File: BasicStatistics.scala    From spark1.52   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.examples.mllib

import org.apache.spark.mllib.stat.MultivariateStatisticalSummary
import org.apache.spark.mllib.stat.Statistics
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.rdd.RDD
import org.apache.spark.SparkContext
import org.apache.spark.mllib.linalg.Matrix

    // Correlation examples. The inputs below are null placeholders in this snippet.
    val sc: SparkContext = null
    val seriesX: RDD[Double] = null // a series
    val seriesY: RDD[Double] = null // must have the same number of partitions and cardinality as seriesX

    // compute the correlation using Pearson's method. Enter "spearman" for Spearman's method. If a
    // method is not specified, Pearson's method will be used by default.
    val correlation: Double = Statistics.corr(seriesX, seriesY, "pearson")
    val data: RDD[Vector] = null // note that each Vector is a row and not a column
    // "spearman" selects Spearman's rank correlation
    // calculate the correlation matrix using Pearson's method. Use "spearman" for Spearman's method.
    // If a method is not specified, Pearson's method will be used by default.
    val correlMatrix: Matrix = Statistics.corr(data, "pearson")

Example 142
Source File: LogisticRegressionWithLBFGSExample.scala    From spark1.52   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.examples.mllib
import org.apache.spark.mllib.linalg.{ Vector, Vectors }
import org.apache.spark.{ SparkConf, SparkContext }
import org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.linalg.Vectors

    val points = Array(
    val spiderRDD = sc.parallelize(points)
    val lr = new LogisticRegressionWithLBFGS().setIntercept(true)
    val model =
    val predict = model.predict(Vectors.dense(0.938))
Example 143
import org.apache.spark.annotation.Experimental
import{Identifiable, SchemaUtils}
import org.apache.spark.mllib.feature
import org.apache.spark.mllib.linalg.{Vector, VectorUDT}
import org.apache.spark.sql._
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.StructType

  /** Stores `value` under the `outputCol` param via `set` and returns what `set` returns. */
  def setOutputCol(value: String): this.type = {
    set(outputCol, value)
  }

  /**
   * Returns `dataset` with an extra column, named by the `outputCol` param, holding the result of
   * `idfModel.transform` applied to each vector of the `inputCol` column.
   */
  override def transform(dataset: DataFrame): DataFrame = {
    // Invoke transformSchema on the input schema with logging enabled before touching the data.
    transformSchema(dataset.schema, logging = true)
    // Wrap the model's per-vector transform as a column-level UDF.
    val idf = udf { vec: Vector => idfModel.transform(vec) }
    dataset.withColumn($(outputCol), idf(col($(inputCol))))

  override def transformSchema(schema: StructType): StructType = {

  /**
   * Builds a new IDFModel from the same `uid` and `idfModel`, copies param values onto it
   * together with `extra`, and sets its parent to this model's `parent`.
   */
  override def copy(extra: ParamMap): IDFModel = {
    val copied = new IDFModel(uid, idfModel)
    copyValues(copied, extra).setParent(parent)
Example 144
Source File: DCT.scala    From spark1.52   with Apache License 2.0 5 votes vote down vote up

import edu.emory.mathcs.jtransforms.dct._

import org.apache.spark.annotation.Experimental
import org.apache.spark.mllib.linalg.{Vector, VectorUDT, Vectors}
import org.apache.spark.sql.types.DataType

  def getInverse: Boolean = $(inverse)

  setDefault(inverse -> false)

  /**
   * Builds the per-vector function: runs a 1-D DCT over the vector's values, inverse when the
   * `inverse` param is set and forward otherwise.
   * NOTE(review): the statement producing the returned Vector is not visible in this excerpt.
   */
  override protected def createTransformFunc: Vector => Vector = { vec =>
    val result = vec.toArray
    // The transformer is sized to the input length and works on `result` directly.
    val jTransformer = new DoubleDCT_1D(result.length)
    // Second argument is presumably the JTransforms "scale" flag — TODO confirm.
    if ($(inverse)) jTransformer.inverse(result, true) else jTransformer.forward(result, true)

  /** Rejects (via `require`) any input type that is not a VectorUDT, reporting the offending type. */
  override protected def validateInputType(inputType: DataType): Unit = {
    require(inputType.isInstanceOf[VectorUDT], s"Input type must be VectorUDT but got $inputType.")

  override protected def outputDataType: DataType = new VectorUDT
Example 145
Source File: PCA.scala    From spark1.52   with Apache License 2.0 5 votes vote down vote up

import org.apache.spark.annotation.Experimental
import org.apache.spark.mllib.feature
import org.apache.spark.mllib.linalg.{Vector, VectorUDT}
import org.apache.spark.sql._
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.{StructField, StructType}

  /**
   * Returns `dataset` with an extra column, named by the `outputCol` param, holding the result of
   * `pcaModel.transform` applied to each value of the `inputCol` column.
   */
  override def transform(dataset: DataFrame): DataFrame = {
    // Invoke transformSchema on the input schema with logging enabled before touching the data.
    transformSchema(dataset.schema, logging = true)
    // Lift the model's transform method into a UDF.
    val pcaOp = udf { pcaModel.transform _ }
    dataset.withColumn($(outputCol), pcaOp(col($(inputCol))))

  override def transformSchema(schema: StructType): StructType = {
    val inputType = schema($(inputCol)).dataType
      s"Input column ${$(inputCol)} must be a vector column")
      s"Output column ${$(outputCol)} already exists.")
    val outputFields = schema.fields :+ StructField($(outputCol), new VectorUDT, false)

  override def copy(extra: ParamMap): PCAModel = {
    val copied = new PCAModel(uid, pcaModel)
    copyValues(copied, extra).setParent(parent)
Source File: BinaryClassificationEvaluator.scala    From spark1.52   with Apache License 2.0 5 votes vote down vote up

import org.apache.spark.annotation.Experimental
import{Identifiable, SchemaUtils}
import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
import org.apache.spark.mllib.linalg.{Vector, VectorUDT}
import org.apache.spark.sql.{DataFrame, Row}
import org.apache.spark.sql.types.DoubleType

  def setLabelCol(value: String): this.type = set(labelCol, value)
  setDefault(metricName -> "areaUnderROC")

  /**
   * Computes the metric selected by the `metricName` param ("areaUnderROC" or "areaUnderPR")
   * from BinaryClassificationMetrics built over (score, label) pairs.
   * NOTE(review): parts of this method are missing or garbled in this excerpt (see the
   * `scoreAndLabels` line); comments describe only what is visible.
   */
  override def evaluate(dataset: DataFrame): Double = {
    val schema = dataset.schema
    // The raw prediction column must be a vector column and the label column a double column.
    SchemaUtils.checkColumnType(schema, $(rawPredictionCol), new VectorUDT)
    SchemaUtils.checkColumnType(schema, $(labelCol), DoubleType)

    // TODO: When dataset metadata has been implemented, check rawPredictionCol vector length = 2.
    val scoreAndLabels =$(rawPredictionCol), $(labelCol))
      .map { case Row(rawPrediction: Vector, label: Double) =>
        // The element at index 1 of the raw prediction is used as the score.
        (rawPrediction(1), label)
    val metrics = new BinaryClassificationMetrics(scoreAndLabels)
    val metric = $(metricName) match {
      case "areaUnderROC" => metrics.areaUnderROC()
      case "areaUnderPR" => metrics.areaUnderPR()

  /** Both supported metrics are reported as "larger is better". */
  override def isLargerBetter: Boolean = $(metricName) match {
    case "areaUnderROC" => true// area under ROC: 1.0 is a perfect classifier, 0.5 is random
    case "areaUnderPR" => true // precision and recall

  override def copy(extra: ParamMap): BinaryClassificationEvaluator = defaultCopy(extra)
Source File: Normalizer.scala    From spark1.52   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.feature

import org.apache.spark.annotation.{Experimental, Since}
import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors}

  /**
   * Divides the stored values of `vector` by its p-norm (`Vectors.norm(vector, p)`).
   *
   * Dense and sparse vectors are handled separately; any other vector type raises
   * IllegalArgumentException. A zero norm takes the `else` branch instead of dividing.
   * NOTE(review): some lines (closing braces and the result expressions of the dense and
   * zero-norm branches) are not visible in this excerpt.
   */
  override def transform(vector: Vector): Vector = {
    val norm = Vectors.norm(vector, p)

    if (norm != 0.0) {
      // For dense vector, we've to allocate new memory for new output vector.
      // However, for sparse vector, the `index` array will not be changed,
      // so we can re-use it to save memory.
      vector match {
        case DenseVector(vs) =>
          // Work on a clone so the input's backing array is left untouched.
          val values = vs.clone()
          val size = values.size
          var i = 0
          while (i < size) {
            values(i) /= norm
            i += 1
        case SparseVector(size, ids, vs) =>
          // Only the stored values are scaled; the `ids` array is reused as-is.
          val values = vs.clone()
          val nnz = values.size
          var i = 0
          while (i < nnz) {
            values(i) /= norm
            i += 1
          Vectors.sparse(size, ids, values)
        case v => throw new IllegalArgumentException("Do not support vector type " + v.getClass)
    } else {
      // Since the norm is zero, return the input vector object itself.
      // Note that it's safe since we always assume that the data in RDD
      // should be immutable.

Source File: ChiSqSelector.scala    From spark1.52   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.feature

import scala.collection.mutable.ArrayBuilder

import org.apache.spark.annotation.{Experimental, Since}
import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors}
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.stat.Statistics
import org.apache.spark.rdd.RDD

  /**
   * Builds a ChiSqSelectorModel from labeled data.
   *
   * Visible steps: run `Statistics.chiSqTest` on `data`, pair each test result with its position,
   * order the pairs by descending `statistic`, and keep only the positions.
   * NOTE(review): the chain may be missing steps in this excerpt — confirm against the full source.
   */
  def fit(data: RDD[LabeledPoint]): ChiSqSelectorModel = {
    val indices = Statistics.chiSqTest(data)
      // Negating the statistic makes the ascending sortBy yield largest-first order.
      .zipWithIndex.sortBy { case (res, _) => -res.statistic }
      // The pattern variable below is a single index; it shadows the outer `indices` name.
      .map { case (_, indices) => indices }
    new ChiSqSelectorModel(indices)
Example 149
Source File: GaussianMixtureModelWrapper.scala    From spark1.52   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.api.python

import java.util.{List => JList}

import scala.collection.JavaConverters._
import scala.collection.mutable.ArrayBuffer

import org.apache.spark.SparkContext
import org.apache.spark.mllib.linalg.{Vector, Vectors, Matrix}
import org.apache.spark.mllib.clustering.GaussianMixtureModel

  /**
   * Means and covariance matrices of the first `k` Gaussians of `model`, packed for Java callers.
   *
   * The returned list has two elements: an array of the `mu` values followed by an array of the
   * `sigma` values, both in component order.
   */
  val gaussians: JList[Object] = {
    val modelGaussians = model.gaussians
    // Array.tabulate replaces the hand-written index loop and the mutable buffers; indexing with
    // modelGaussians(i) keeps the original failure mode if `k` exceeds the number of components.
    val mu = Array.tabulate[Vector](k)(i => modelGaussians(i).mu)
    val sigma = Array.tabulate[Matrix](k)(i => modelGaussians(i).sigma)
    List(mu, sigma).map(_.asInstanceOf[Object]).asJava
  }

  def save(sc: SparkContext, path: String): Unit =, path)
Source File: Word2VecModelWrapper.scala    From spark1.52   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.api.python

import java.util.{ArrayList => JArrayList, List => JList, Map => JMap}
import scala.collection.JavaConverters._

import org.apache.spark.SparkContext
import org.apache.spark.mllib.feature.Word2VecModel
import org.apache.spark.mllib.linalg.{Vector, Vectors}

  def transform(rdd: JavaRDD[String]): JavaRDD[Vector] = {

  def findSynonyms(word: String, num: Int): JList[Object] = {
    val vec = transform(word)
    findSynonyms(vec, num)

  def findSynonyms(vector: Vector, num: Int): JList[Object] = {
    val result = model.findSynonyms(vector, num)
    val similarity = Vectors.dense(
    val words =
    List(words, similarity).map(_.asInstanceOf[Object]).asJava

  def getVectors: JMap[String, JList[Float]] = {{case (k, v) => (k, v.toList.asJava)}).asJava

  def save(sc: SparkContext, path: String): Unit =, path)
Source File: PearsonCorrelation.scala    From spark1.52   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.stat.correlation

import breeze.linalg.{DenseMatrix => BDM}

import org.apache.spark.Logging
import org.apache.spark.mllib.linalg.{Matrices, Matrix, Vector}
import org.apache.spark.mllib.linalg.distributed.RowMatrix
import org.apache.spark.rdd.RDD

  /**
   * Converts a covariance matrix into a correlation matrix, working on the Breeze dense matrix
   * obtained from `covarianceMatrix`.
   *
   * Visible steps: (1) replace each diagonal entry by its square root, or 0.0 when it is close
   * to zero; (2) for every pair i < j write the same correlation value to (i, j) and (j, i);
   * (3) set the diagonal to 1.0; (4) log a warning when a zero standard deviation was seen.
   * NOTE(review): several lines (closing braces, the value of the zero-deviation branch and the
   * final result expression) are not visible in this excerpt.
   */
  def computeCorrelationMatrixFromCovariance(covarianceMatrix: Matrix): Matrix = {
    val cov = covarianceMatrix.toBreeze.asInstanceOf[BDM[Double]]
    val n = cov.cols

    // Compute the standard deviation on the diagonals first
    var i = 0
    while (i < n) {
      // TODO remove once covariance numerical issue resolved.
      cov(i, i) = if (closeToZero(cov(i, i))) 0.0 else math.sqrt(cov(i, i))
      i +=1

    // Loop through columns since cov is column major
    var j = 0
    var sigma = 0.0
    var containNaN = false
    while (j < n) {
      // Standard deviation of column j, as stored on the diagonal by the loop above.
      sigma = cov(j, j)
      i = 0
      while (i < j) {
        val corr = if (sigma == 0.0 || cov(i, i) == 0.0) {
          // Either variable has zero deviation, so the ratio below cannot be formed.
          containNaN = true
        } else {
          cov(i, j) / (sigma * cov(i, i))
        // Write the value to both triangles to keep the matrix symmetric.
        cov(i, j) = corr
        cov(j, i) = corr
        i += 1
      j += 1

    // put 1.0 on the diagonals
    i = 0
    while (i < n) {
      cov(i, i) = 1.0
      i +=1

    if (containNaN) {
      logWarning("Pearson correlation matrix contains NaN values.")


  /**
   * Tells whether `value` lies within `threshold` of zero.
   *
   * @param value number to test
   * @param threshold inclusive bound on the absolute value; defaults to 1e-12
   * @return true when `|value| <= threshold`; false otherwise, including when `value` is NaN
   */
  private def closeToZero(value: Double, threshold: Double = 1e-12): Boolean =
    math.abs(value) <= threshold
Source File: SpearmanCorrelation.scala    From spark1.52   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.stat.correlation

import scala.collection.mutable.ArrayBuffer

import org.apache.spark.Logging
import org.apache.spark.SparkContext._
import org.apache.spark.mllib.linalg.{Matrix, Vector, Vectors}
import org.apache.spark.rdd.RDD

  override def computeCorrelationMatrix(X: RDD[Vector]): Matrix = {
    // ((columnIndex, value), rowUid)
    val colBased = X.zipWithUniqueId().flatMap { case (vec, uid) => { case (v, j) =>
        ((j, v), uid)
    // global sort by (columnIndex, value)
    val sorted = colBased.sortByKey()
    // assign global ranks (using average ranks for tied values)
    val globalRanks = sorted.zipWithIndex().mapPartitions { iter =>
      var preCol = -1
      var preVal = Double.NaN
      var startRank = -1.0
      var cachedUids = ArrayBuffer.empty[Long]
      val flush: () => Iterable[(Long, (Int, Double))] = () => {
        val averageRank = startRank + (cachedUids.size - 1) / 2.0
        val output = { uid =>
          (uid, (preCol, averageRank))
      iter.flatMap { case (((j, v), uid), rank) =>
        // If we see a new value or cachedUids is too big, we flush ids with their average rank.
        if (j != preCol || v != preVal || cachedUids.size >= 10000000) {
          val output = flush()
          preCol = j
          preVal = v
          startRank = rank
          cachedUids += uid
        } else {
          cachedUids += uid
      } ++ flush()
    // Replace values in the input matrix by their ranks compared with values in the same column.
    // Note that shifting all ranks in a column by a constant value doesn't affect result.
    val groupedRanks = globalRanks.groupByKey().map { case (uid, iter) =>
      // sort by column index and then convert values to a vector
Source File: GLMClassificationModel.scala    From spark1.52   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.classification.impl

import org.json4s.JsonDSL._
import org.json4s.jackson.JsonMethods._

import org.apache.spark.SparkContext
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.mllib.util.Loader
import org.apache.spark.sql.{Row, SQLContext}

    def loadData(sc: SparkContext, path: String, modelClass: String): Data = {
      val datapath = Loader.dataPath(path)
      val sqlContext = SQLContext.getOrCreate(sc)
      val dataRDD =
      val dataArray ="weights", "intercept", "threshold").take(1)
      assert(dataArray.size == 1, s"Unable to load $modelClass data from: $datapath")
      val data = dataArray(0)
      assert(data.size == 3, s"Unable to load $modelClass data from: $datapath")
      val (weights, intercept) = data match {
        case Row(weights: Vector, intercept: Double, _) =>
          (weights, intercept)
      val threshold = if (data.isNullAt(2)) {
      } else {
      Data(weights, intercept, threshold)

Source File: LabeledPoint.scala    From spark1.52   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.regression

import scala.beans.BeanInfo

import org.apache.spark.annotation.Since
import org.apache.spark.mllib.linalg.{Vectors, Vector}
import org.apache.spark.mllib.util.NumericParser
import org.apache.spark.SparkException

  /**
   * Parses a string into a LabeledPoint.
   *
   * Two formats are accepted:
   *  - input starting with `(` is handed to `NumericParser.parse` and must yield a two-element
   *    sequence: a `Double` label followed by the features;
   *  - anything else is read as the dense format used before v1.0, i.e. a label, a comma, then
   *    space-separated feature values.
   *
   * @param s text to parse
   * @return the parsed labeled point
   * @throws SparkException if the parenthesised form does not have the expected shape
   */
  def parse(s: String): LabeledPoint = {
    if (s.startsWith("(")) {
      NumericParser.parse(s) match {
        case Seq(label: Double, numeric: Any) =>
          LabeledPoint(label, Vectors.parseNumeric(numeric))
        case other =>
          throw new SparkException(s"Cannot parse $other.")
      }
    } else { // dense format used before v1.0
      val parts = s.split(',')
      val label = java.lang.Double.parseDouble(parts(0))
      val features = Vectors.dense(parts(1).trim().split(' ').map(java.lang.Double.parseDouble))
      LabeledPoint(label, features)
    }
  }
Example 155
Source File: GLMRegressionModel.scala    From spark1.52   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.regression.impl

import org.json4s.JsonDSL._
import org.json4s.jackson.JsonMethods._

import org.apache.spark.SparkContext
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.mllib.util.Loader
import org.apache.spark.sql.{DataFrame, Row, SQLContext}

    def loadData(sc: SparkContext, path: String, modelClass: String, numFeatures: Int): Data = {
      val datapath = Loader.dataPath(path)
      val sqlContext = SQLContext.getOrCreate(sc)
      val dataRDD =
      val dataArray ="weights", "intercept").take(1)
      assert(dataArray.size == 1, s"Unable to load $modelClass data from: $datapath")
      val data = dataArray(0)
      assert(data.size == 2, s"Unable to load $modelClass data from: $datapath")
      data match {
        case Row(weights: Vector, intercept: Double) =>
          assert(weights.size == numFeatures, s"Expected $numFeatures features, but" +
            s" found ${weights.size} features when loading $modelClass weights from $datapath")
          Data(weights, intercept)

Source File: VectorSlicerSuite.scala    From spark1.52   with Apache License 2.0 5 votes vote down vote up

import org.apache.spark.SparkFunSuite
import{Attribute, AttributeGroup, NumericAttribute}
import org.apache.spark.mllib.linalg.{Vector, Vectors}
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.{DataFrame, Row, SQLContext}

class VectorSlicerSuite extends SparkFunSuite with MLlibTestSparkContext {

  // A freshly constructed VectorSlicer has no indices and no names selected.
  // NOTE(review): the body of the intercept block is not visible in this excerpt.
  test("params") {// parameters
    val slicer = new VectorSlicer
    assert(slicer.getIndices.length === 0)
    assert(slicer.getNames.length === 0)
    withClue("VectorSlicer should not have any features selected by default") {
      intercept[IllegalArgumentException] {

  // Exercises VectorSlicer.validIndices / validNames: the rejected inputs below contain a
  // repeated index, an empty name, and a repeated name respectively.
  test("feature validity checks") {// feature validity checks
    import VectorSlicer._
    assert(validIndices(Array(0, 1, 8, 2)))
    assert(!validIndices(Array(1, 2, 1)))
    assert(validNames(Array("a", "b")))
    assert(!validNames(Array("", "b")))
    assert(!validNames(Array("a", "b", "a")))

  // End-to-end check: slices 5-element vectors down to features 1 and 4, selected first by index
  // and then by name, and compares values and attribute metadata with the expected column.
  // NOTE(review): several statements in this test are truncated in this excerpt.
  test("Test vector slicer") {// test the vector slicer
    val sqlContext = new SQLContext(sc)

    // Input vectors, a mix of sparse and dense, all of size 5.
    val data = Array(
      Vectors.sparse(5, Seq((0, -2.0), (1, 2.3))),
      Vectors.dense(-2.0, 2.3, 0.0, 0.0, 1.0),
      Vectors.dense(0.0, 0.0, 0.0, 0.0, 0.0),
      Vectors.dense(0.6, -1.1, -3.0, 4.5, 3.3),
      Vectors.sparse(5, Seq())

    // Expected after selecting indices 1, 4
    val expected = Array(
      Vectors.sparse(2, Seq((0, 2.3))),
      Vectors.dense(2.3, 1.0),
      Vectors.dense(0.0, 0.0),
      Vectors.dense(-1.1, 3.3),
      Vectors.sparse(2, Seq())

    // Named attributes f0..f4 for the input column.
    val defaultAttr = NumericAttribute.defaultAttr
    val attrs = Array("f0", "f1", "f2", "f3", "f4").map(defaultAttr.withName)
    val attrGroup = new AttributeGroup("features", attrs.asInstanceOf[Array[Attribute]])

    // Attributes expected on the sliced output: f1 and f4.
    val resultAttrs = Array("f1", "f4").map(defaultAttr.withName)
    val resultAttrGroup = new AttributeGroup("expected", resultAttrs.asInstanceOf[Array[Attribute]])

    val rdd = sc.parallelize( { case (a, b) => Row(a, b) }
    val df = sqlContext.createDataFrame(rdd,
        StructType(Array(attrGroup.toStructField(), resultAttrGroup.toStructField())))
    val vectorSlicer = new VectorSlicer().setInputCol("features").setOutputCol("result")

    // Compares the "result" column with the "expected" column, values first, then metadata.
    def validateResults(df: DataFrame): Unit = {"result", "expected").collect().foreach { case Row(vec1: Vector, vec2: Vector) =>
        assert(vec1 === vec2)
      val resultMetadata = AttributeGroup.fromStructField(df.schema("result"))
      val expectedMetadata = AttributeGroup.fromStructField(df.schema("expected"))
      assert(resultMetadata.numAttributes === expectedMetadata.numAttributes) { case (a, b) =>
        assert(a === b)

    // Select by index only.
    vectorSlicer.setIndices(Array(1, 4)).setNames(Array.empty)
    validateResults(vectorSlicer.transform(df))// transform converts one DataFrame into another

    // transform converts one DataFrame into another

    // Select by name only.
    vectorSlicer.setIndices(Array.empty).setNames(Array("f1", "f4"))
    // transform converts one DataFrame into another

Source File: DCTSuite.scala    From spark1.52   with Apache License 2.0 5 votes vote down vote up

import scala.beans.BeanInfo

import edu.emory.mathcs.jtransforms.dct.DoubleDCT_1D

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.linalg.{Vector, Vectors}
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.sql.{DataFrame, Row}

case class DCTTestData(vec: Vector, wantedVec: Vector)

class DCTSuite extends SparkFunSuite with MLlibTestSparkContext {
  test("forward transform of discrete cosine matches jTransforms result") {
    val data = Vectors.dense((0 until 128).map(_ => 2D * math.random - 1D).toArray)
    val inverse = false

    testDCT(data, inverse)
  test("inverse transform of discrete cosine matches jTransforms result") {
    val data = Vectors.dense((0 until 128).map(_ => 2D * math.random - 1D).toArray)
    val inverse = true

    testDCT(data, inverse)

  // Computes the expected DCT of `data` directly with DoubleDCT_1D (inverse or forward, per the
  // `inverse` flag) and asserts that each resultVec/wantedVec pair differs by a squared
  // distance below 1e-6.
  // NOTE(review): the transformer configuration and the transform call are truncated here.
  private def testDCT(data: Vector, inverse: Boolean): Unit = {
    // Clone so the reference transform does not write into the array returned by `data.toArray`.
    val expectedResultBuffer = data.toArray.clone()
    if (inverse) {
      (new DoubleDCT_1D(data.size)).inverse(expectedResultBuffer, true)
    } else {
      (new DoubleDCT_1D(data.size)).forward(expectedResultBuffer, true)
    val expectedResult = Vectors.dense(expectedResultBuffer)

    val dataset = sqlContext.createDataFrame(Seq(
      DCTTestData(data, expectedResult)

    val transformer = new DCT()
      .select("resultVec", "wantedVec")
      .foreach { case Row(resultVec: Vector, wantedVec: Vector) =>
      assert(Vectors.sqdist(resultVec, wantedVec) < 1e-6)
Example 158
Source File: MinMaxScalerSuite.scala    From spark1.52   with Apache License 2.0 5 votes vote down vote up

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.linalg.{Vector, Vectors}
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.sql.{Row, SQLContext}

        assert(vector1.equals(vector2), "Transformed vector is different with expected.")

    // copied model must have the same parent.
  test("MinMaxScaler arguments max must be larger than min") {
    withClue("arguments max must be larger than min") {
      intercept[IllegalArgumentException] {
        val scaler = new MinMaxScaler().setMin(10).setMax(0)
      intercept[IllegalArgumentException] {
        val scaler = new MinMaxScaler().setMin(0).setMax(0)
Source File: PolynomialExpansionSuite.scala    From spark1.52   with Apache License 2.0 5 votes vote down vote up

import org.scalatest.exceptions.TestFailedException

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors}
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.mllib.util.TestingUtils._
import org.apache.spark.sql.Row

class PolynomialExpansionSuite extends SparkFunSuite with MLlibTestSparkContext {

  // Runs the shared ParamsSuite checks against a default PolynomialExpansion.
  test("params") {// parameters
    ParamsSuite.checkParams(new PolynomialExpansion)

  // Expands the input vectors and compares each output with `twoDegreeExpansion`, requiring
  // matching vector kinds (dense with dense, sparse with sparse) and values within 1e-1.
  // NOTE(review): the DataFrame construction and transformer setup are truncated in this excerpt.
  test("Polynomial expansion with default parameter") {// polynomial expansion with default parameters
    val data = Array(
      Vectors.sparse(3, Seq((0, -2.0), (1, 2.3))),
      Vectors.dense(-2.0, 2.3),
      Vectors.dense(0.0, 0.0, 0.0),
      Vectors.dense(0.6, -1.1, -3.0),
      Vectors.sparse(3, Seq())

    // Expected outputs, in the same order as `data`.
    val twoDegreeExpansion: Array[Vector] = Array(
      Vectors.sparse(9, Array(0, 1, 2, 3, 4), Array(-2.0, 4.0, 2.3, -4.6, 5.29)),
      Vectors.dense(-2.0, 4.0, 2.3, -4.6, 5.29),
      Vectors.dense(new Array[Double](9)),
      Vectors.dense(0.6, 0.36, -1.1, -0.66, 1.21, -3.0, -1.8, 3.3, 9.0),
      Vectors.sparse(9, Array.empty, Array.empty))

    val df = sqlContext.createDataFrame("features", "expected")

    val polynomialExpansion = new PolynomialExpansion()
    polynomialExpansion.transform(df).select("polyFeatures", "expected").collect().foreach {
      case Row(expanded: DenseVector, expected: DenseVector) =>
        assert(expanded ~== expected absTol 1e-1)
      case Row(expanded: SparseVector, expected: SparseVector) =>
        assert(expanded ~== expected absTol 1e-1)
      // Any other pairing means the vector kind changed during expansion.
      case _ =>
        throw new TestFailedException("Unmatched data types after polynomial expansion", 0)
  test("Polynomial expansion with setter") {
    val data = Array(
      Vectors.sparse(3, Seq((0, -2.0), (1, 2.3))),
      Vectors.dense(-2.0, 2.3),
      Vectors.dense(0.0, 0.0, 0.0),
      Vectors.dense(0.6, -1.1, -3.0),
      Vectors.sparse(3, Seq())

    val threeDegreeExpansion: Array[Vector] = Array(
      Vectors.sparse(19, Array(0, 1, 2, 3, 4, 5, 6, 7, 8),
        Array(-2.0, 4.0, -8.0, 2.3, -4.6, 9.2, 5.29, -10.58, 12.17)),
      Vectors.dense(-2.0, 4.0, -8.0, 2.3, -4.6, 9.2, 5.29, -10.58, 12.17),
      Vectors.dense(new Array[Double](19)),
      Vectors.dense(0.6, 0.36, 0.216, -1.1, -0.66, -0.396, 1.21, 0.726, -1.331, -3.0, -1.8,
        -1.08, 3.3, 1.98, -3.63, 9.0, 5.4, -9.9, -27.0),
      Vectors.sparse(19, Array.empty, Array.empty))

    val df = sqlContext.createDataFrame("features", "expected")

    val polynomialExpansion = new PolynomialExpansion()
    polynomialExpansion.transform(df).select("polyFeatures", "expected").collect().foreach {
      case Row(expanded: DenseVector, expected: DenseVector) =>
        assert(expanded ~== expected absTol 1e-1)
      case Row(expanded: SparseVector, expected: SparseVector) =>
        assert(expanded ~== expected absTol 1e-1)
      case _ =>
        throw new TestFailedException("Unmatched data types after polynomial expansion", 0)
Source File: IDFSuite.scala    From spark1.52   with Apache License 2.0 5 votes vote down vote up

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.feature.{IDFModel => OldIDFModel}
import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors}
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.mllib.util.TestingUtils._
import org.apache.spark.sql.Row

       assert(x ~== y absTol 1e-5, "Transformed vector is different with expected vector.")

  // Checks the fitted model's "idfValue" column against vectors scaled by a hand-computed IDF.
  // NOTE(review): the DataFrame construction and IDF configuration are truncated in this excerpt.
  test("compute IDF with setter") {// compute IDF with a setter
    val numOfFeatures = 4
    val data = Array(
      Vectors.sparse(numOfFeatures, Array(1, 3), Array(1.0, 2.0)),
      Vectors.dense(0.0, 1.0, 2.0, 3.0),
      Vectors.sparse(numOfFeatures, Array(1), Array(1.0))
    val numOfData = data.size
    // Reference IDF: log((numOfData + 1) / (x + 1)) for each positive x, 0 otherwise.
    // The literal counts are presumably per-feature document frequencies — TODO confirm.
    val idf = Vectors.dense(Array(0, 3, 1, 2).map { x =>
      if (x > 0) math.log((numOfData + 1.0) / (x + 1.0)) else 0
    val expected = scaleDataWithIDF(data, idf)

    val df = sqlContext.createDataFrame("features", "expected")

    val idfModel = new IDF()
    idfModel.transform(df).select("idfValue", "expected").collect().foreach {
      case Row(x: Vector, y: Vector) =>
        assert(x ~== y absTol 1e-5, "Transformed vector is different with expected vector.")
Source File: NormalizerSuite.scala    From spark1.52   with Apache License 2.0 5 votes vote down vote up

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors}
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.mllib.util.TestingUtils._
import org.apache.spark.sql.{DataFrame, Row, SQLContext}

class NormalizerSuite extends SparkFunSuite with MLlibTestSparkContext {

  @transient var data: Array[Vector] = _
  @transient var dataFrame: DataFrame = _
  @transient var normalizer: Normalizer = _
  @transient var l1Normalized: Array[Vector] = _
  @transient var l2Normalized: Array[Vector] = _

  override def beforeAll(): Unit = {

    data = Array(
      Vectors.sparse(3, Seq((0, -2.0), (1, 2.3))),
      Vectors.dense(0.0, 0.0, 0.0),
      Vectors.dense(0.6, -1.1, -3.0),
      Vectors.sparse(3, Seq((1, 0.91), (2, 3.2))),
      Vectors.sparse(3, Seq((0, 5.7), (1, 0.72), (2, 2.7))),
      Vectors.sparse(3, Seq())
     l1Normalized = Array(
      Vectors.sparse(3, Seq((0, -0.465116279), (1, 0.53488372))),
      Vectors.dense(0.0, 0.0, 0.0),
      Vectors.dense(0.12765957, -0.23404255, -0.63829787),
      Vectors.sparse(3, Seq((1, 0.22141119), (2, 0.7785888))),
      Vectors.dense(0.625, 0.07894737, 0.29605263),
      Vectors.sparse(3, Seq())
    l2Normalized = Array(
      Vectors.sparse(3, Seq((0, -0.65617871), (1, 0.75460552))),
      Vectors.dense(0.0, 0.0, 0.0),
      Vectors.dense(0.184549876, -0.3383414, -0.922749378),
      Vectors.sparse(3, Seq((1, 0.27352993), (2, 0.96186349))),
      Vectors.dense(0.897906166, 0.113419726, 0.42532397),
      Vectors.sparse(3, Seq())

    val sqlContext = new SQLContext(sc)
    dataFrame = sqlContext.createDataFrame(sc.parallelize(data, 2).map(NormalizerSuite.FeatureData))
    normalizer = new Normalizer().setInputCol("features").setOutputCol("normalized_features")
  def collectResult(result: DataFrame): Array[Vector] = {"normalized_features").collect().map {
      case Row(features: Vector) => features
  def assertTypeOfVector(lhs: Array[Vector], rhs: Array[Vector]): Unit = {
    assert((lhs, rhs).zipped.forall {
      case (v1: DenseVector, v2: DenseVector) => true
      case (v1: SparseVector, v2: SparseVector) => true
      case _ => false
    }, "The vector type should be preserved after normalization.")
  def assertValues(lhs: Array[Vector], rhs: Array[Vector]): Unit = {
    assert((lhs, rhs).zipped.forall { (vector1, vector2) =>
      vector1 ~== vector2 absTol 1E-5
    }, "The vector value is not correct after normalization.")

  // Output must keep each input's vector kind and match the `l2Normalized` reference values.
  test("Normalization with default parameter") {// normalization with default parameters
    val result = collectResult(normalizer.transform(dataFrame))

    assertTypeOfVector(data, result)

    assertValues(result, l2Normalized)

  // Output must keep each input's vector kind and match the `l1Normalized` reference values.
  // NOTE(review): the setter call that selects the norm is not visible in this excerpt.
  test("Normalization with setter") {// normalization configured through a setter
    val result = collectResult(normalizer.transform(dataFrame))

    assertTypeOfVector(data, result)

    assertValues(result, l1Normalized)

Example 162
Source File: ProbabilisticClassifierSuite.scala    From spark1.52   with Apache License 2.0 5 votes vote down vote up

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.linalg.{Vector, Vectors}

final class TestProbabilisticClassificationModel(
    override val uid: String,
    override val numClasses: Int)
  extends ProbabilisticClassificationModel[Vector, TestProbabilisticClassificationModel] {

  override def copy(extra: this.type = defaultCopy(extra)

  override protected def predictRaw(input: Vector): Vector = {

  override protected def raw2probabilityInPlace(rawPrediction: Vector): Vector = {

  def friendlyPredict(input: Vector): Double = {

class ProbabilisticClassifierSuite extends SparkFunSuite {

  // Checks friendlyPredict on a model configured with thresholds (0.5, 0.2), then on a model
  // without thresholds.
  // NOTE(review): `testModel` is declared twice below; lines separating two tests appear to be
  // missing from this excerpt.
  test("test thresholding") {// test thresholds
    val thresholds = Array(0.5, 0.2)
    val testModel = new TestProbabilisticClassificationModel("myuid", 2).setThresholds(thresholds)
    assert(testModel.friendlyPredict(Vectors.dense(Array(1.0, 1.0))) === 1.0)
    assert(testModel.friendlyPredict(Vectors.dense(Array(1.0, 0.2))) === 0.0)

    val testModel = new TestProbabilisticClassificationModel("myuid", 2)
    assert(testModel.friendlyPredict(Vectors.dense(Array(1.0, 2.0))) === 1.0)
Source File: FeatureExtraction.scala    From meetup-stream   with Apache License 2.0 5 votes vote down vote up
package transformations

import core._
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.rdd.RDD
import org.apache.spark.broadcast.Broadcast

object FeatureExtraction  {
  val localDictionary=Source
  def breakToWords(description: String)={
    val wordSelector="""[^\<\>\/]\b([a-zA-Z\d]{4,})\b""".r
    (wordSelector findAllIn description).map{_.trim.toLowerCase()}
  def eventToVector(dictionary: Map[String, Int], description: String): Option[Vector]={
  def popularWords(words: Iterator[String])={
    val initialWordCounts=collection.mutable.Map[String, Int]()
    val wordCounts=words.
        case(wordCounts, word)=> wordCounts+Tuple2(word,wordCounts.getOrElse(word,0)+1)
    val wordsIndexes=wordCounts     
        case(word, count)=>dictionary.get(word).map{index=>(index,count.toDouble)}
    val topWords=wordsIndexes.toSeq.sortBy(-1*_._2).take(10)
   val wordsIterator = breakToWords(description)
   val topWords=popularWords(wordsIterator)   
   if (topWords.size==10) Some(Vectors.sparse(dictionary.size,topWords)) else None
Source File: SVDExample.scala    From Spark-2.3.1   with Apache License 2.0 5 votes vote down vote up
// scalastyle:off println
package org.apache.spark.examples.mllib

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
// $example on$
import org.apache.spark.mllib.linalg.Matrix
import org.apache.spark.mllib.linalg.SingularValueDecomposition
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.linalg.distributed.RowMatrix
// $example off$

/**
 * Example application that computes a singular value decomposition of a small
 * three-row, five-column `RowMatrix` and prints the resulting factors.
 */
object SVDExample {

  /**
   * Builds the matrix, computes its SVD and prints U, the singular values and V.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {

    val conf = new SparkConf().setAppName("SVDExample")
    val sc = new SparkContext(conf)

    // $example on$
    val data = Array(
      Vectors.sparse(5, Seq((1, 1.0), (3, 7.0))),
      Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0),
      Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0))

    val rows = sc.parallelize(data)

    val mat: RowMatrix = new RowMatrix(rows)

    // Compute the top 5 singular values and corresponding singular vectors.
    val svd: SingularValueDecomposition[RowMatrix, Matrix] = mat.computeSVD(5, computeU = true)
    val U: RowMatrix = svd.U  // The U factor is a RowMatrix.
    val s: Vector = svd.s     // The singular values are stored in a local dense vector.
    val V: Matrix = svd.V     // The V factor is a local dense matrix.
    // $example off$
    val collect = U.rows.collect()
    println("U factor is:")
    collect.foreach { vector => println(vector) }
    println(s"Singular values are: $s")
    println(s"V factor is:\n$V")

    // Release the Spark resources held by this application.
    sc.stop()
  }
}

Example 165
Source File: BisectingKMeansExample.scala    From Spark-2.3.1   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.examples.mllib

// scalastyle:off println
import org.apache.spark.{SparkConf, SparkContext}
// $example on$
import org.apache.spark.mllib.clustering.BisectingKMeans
import org.apache.spark.mllib.linalg.{Vector, Vectors}
// $example off$

/**
 * Example application that clusters the vectors in `data/mllib/kmeans_data.txt`
 * into six clusters with bisecting k-means and prints the cost and the centers.
 */
object BisectingKMeansExample {

  /**
   * Loads the data, trains the model and prints the results.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf().setAppName("mllib.BisectingKMeansExample")
    val sc = new SparkContext(sparkConf)

    // $example on$
    // Loads and parses data
    def parse(line: String): Vector = Vectors.dense(line.split(" ").map(_.toDouble))
    val data = sc.textFile("data/mllib/kmeans_data.txt").map(parse).cache()

    // Clustering the data into 6 clusters by BisectingKMeans.
    val bkm = new BisectingKMeans().setK(6)
    val model = bkm.run(data)

    // Show the compute cost and the cluster centers
    println(s"Compute Cost: ${model.computeCost(data)}")
    model.clusterCenters.zipWithIndex.foreach { case (center, idx) =>
      println(s"Cluster Center ${idx}: ${center}")
    }
    // $example off$

    // Release the Spark resources held by this application.
    sc.stop()
  }
}

Example 166
Source File: Normalizer.scala    From Spark-2.3.1   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.feature

import org.apache.spark.annotation.Since
import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors}

  /**
   * Scales `vector` so that its norm, as computed by `Vectors.norm(vector, p)`, becomes one.
   *
   * `p` comes from the enclosing class, which is not part of this excerpt.
   *
   * @param vector vector to be normalized
   * @return a new vector holding the scaled values, or `vector` itself when its norm is zero
   * @throws IllegalArgumentException if `vector` is neither a dense nor a sparse vector
   */
  override def transform(vector: Vector): Vector = {
    val norm = Vectors.norm(vector, p)

    if (norm != 0.0) {
      // For dense vector, we've to allocate new memory for new output vector.
      // However, for sparse vector, the `index` array will not be changed,
      // so we can re-use it to save memory.
      vector match {
        case DenseVector(vs) =>
          val values = vs.clone()
          val size = values.length
          var i = 0
          while (i < size) {
            values(i) /= norm
            i += 1
          }
          Vectors.dense(values)
        case SparseVector(size, ids, vs) =>
          val values = vs.clone()
          val nnz = values.length
          var i = 0
          while (i < nnz) {
            values(i) /= norm
            i += 1
          }
          Vectors.sparse(size, ids, values)
        case v => throw new IllegalArgumentException("Do not support vector type " + v.getClass)
      }
    } else {
      // Since the norm is zero, return the input vector object itself.
      // Note that it's safe since we always assume that the data in RDD
      // should be immutable.
      vector
    }
  }

Source File: GaussianMixtureModelWrapper.scala    From Spark-2.3.1   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.api.python

import scala.collection.JavaConverters

import org.apache.spark.SparkContext
import org.apache.spark.mllib.clustering.GaussianMixtureModel
import org.apache.spark.mllib.linalg.{Vector, Vectors}

  // Byte-array representation built from the model's Gaussian components.
  // NOTE(review): this excerpt is truncated. The collection being traversed, the first
  // element of each `Array[Any]`, the conversion to `Array[Byte]` and the closing
  // braces are not visible; restore them from the upstream file.
  val gaussians: Array[Byte] = {
    val modelGaussians = { gaussian =>
      Array[Any](, gaussian.sigma)

  // Takes a single point and returns a Vector.
  // NOTE(review): the method body is missing from this excerpt; restore it from the
  // upstream file.
  def predictSoft(point: Vector): Vector = {

  // Persists the wrapped model under `path`.
  // NOTE(review): the call target before `, path)` is missing from this excerpt
  // (presumably a save call taking `sc` and `path` -- confirm against upstream).
  def save(sc: SparkContext, path: String): Unit =, path)
Source File: Word2VecModelWrapper.scala    From Spark-2.3.1   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.api.python

import java.util.{List => JList, Map => JMap}

import scala.collection.JavaConverters._

import org.apache.spark.SparkContext
import org.apache.spark.mllib.feature.Word2VecModel
import org.apache.spark.mllib.linalg.{Vector, Vectors}

  /**
   * Looks up the synonyms of `vector` through the wrapped model and repackages
   * them with `prepareResult`.
   *
   * @param vector query vector passed to `model.findSynonyms`
   * @param num number of synonyms requested
   * @return the Java list produced by `prepareResult`
   */
  def findSynonyms(vector: Vector, num: Int): JList[Object] =
    prepareResult(model.findSynonyms(vector, num))

  /**
   * Splits `(word, similarity)` pairs into two parallel collections and wraps them
   * in a two-element Java list: the words first, then the similarities as a dense vector.
   *
   * @param result pairs of word and similarity score
   * @return Java list holding the word array followed by the similarity vector
   */
  private def prepareResult(result: Array[(String, Double)]) = {
    val similarity = Vectors.dense(result.map(_._2))
    val words = result.map(_._1)
    // Both elements are widened to Object so they fit in one homogeneous Java list.
    List(words, similarity).map(_.asInstanceOf[Object]).asJava
  }

  // Returns a Java map from String keys to Java lists of Float; each value `v` is
  // converted with `v.toList.asJava`.
  // NOTE(review): the source collection being traversed, the conversion of the result
  // to a Java map and the closing braces are missing from this excerpt.
  def getVectors: JMap[String, JList[Float]] = { { case (k, v) =>
      (k, v.toList.asJava)

  // Persists the wrapped model under `path`.
  // NOTE(review): the call target before `, path)` is missing from this excerpt
  // (presumably a save call taking `sc` and `path` -- confirm against upstream).
  def save(sc: SparkContext, path: String): Unit =, path)
Source File: PearsonCorrelation.scala    From Spark-2.3.1   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.stat.correlation

import breeze.linalg.{DenseMatrix => BDM}

import org.apache.spark.internal.Logging
import org.apache.spark.mllib.linalg.{Matrices, Matrix, Vector}
import org.apache.spark.mllib.linalg.distributed.RowMatrix
import org.apache.spark.rdd.RDD

  /**
   * Converts a covariance matrix into a Pearson correlation matrix.
   *
   * The computation works in place on the Breeze matrix obtained from
   * `covarianceMatrix.asBreeze`: the diagonal is first replaced by standard deviations,
   * each off-diagonal entry is then divided by the two matching standard deviations,
   * and finally the diagonal is set to 1.0.
   *
   * @param covarianceMatrix square covariance matrix backed by a Breeze dense matrix
   * @return the correlation matrix; entries involving a zero-variance column are NaN
   */
  def computeCorrelationMatrixFromCovariance(covarianceMatrix: Matrix): Matrix = {
    val cov = covarianceMatrix.asBreeze.asInstanceOf[BDM[Double]]
    val n = cov.cols

    // Compute the standard deviation on the diagonals first
    var i = 0
    while (i < n) {
      // TODO remove once covariance numerical issue resolved.
      cov(i, i) = if (closeToZero(cov(i, i))) 0.0 else math.sqrt(cov(i, i))
      i += 1
    }

    // Loop through columns since cov is column major
    var j = 0
    var sigma = 0.0
    var containNaN = false
    while (j < n) {
      sigma = cov(j, j)
      i = 0
      while (i < j) {
        val corr = if (sigma == 0.0 || cov(i, i) == 0.0) {
          // A zero standard deviation makes the correlation undefined.
          containNaN = true
          Double.NaN
        } else {
          cov(i, j) / (sigma * cov(i, i))
        }
        // The matrix is symmetric, so fill both triangles.
        cov(i, j) = corr
        cov(j, i) = corr
        i += 1
      }
      j += 1
    }

    // put 1.0 on the diagonals
    i = 0
    while (i < n) {
      cov(i, i) = 1.0
      i += 1
    }

    if (containNaN) {
      logWarning("Pearson correlation matrix contains NaN values.")
    }

    Matrices.fromBreeze(cov)
  }


  /**
   * Tells whether `value` lies within `threshold` of zero (inclusive).
   *
   * @param value number to test
   * @param threshold largest absolute value still treated as zero; defaults to 1e-12
   * @return true when `|value| <= threshold`; false otherwise, including for NaN
   */
  private def closeToZero(value: Double, threshold: Double = 1e-12): Boolean =
    math.abs(value) <= threshold
Source File: SpearmanCorrelation.scala    From Spark-2.3.1   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.stat.correlation

import scala.collection.mutable.ArrayBuffer

import org.apache.spark.internal.Logging
import org.apache.spark.mllib.linalg.{Matrix, Vector, Vectors}
import org.apache.spark.rdd.RDD

  // Computes a correlation matrix for the rows of X by first replacing every value with
  // its rank within its column (tied values share an average rank).
  //
  // NOTE(review): this excerpt is truncated. The per-vector traversal inside the first
  // flatMap, the body and tail of `flush`, the result expressions of the if/else inside
  // `iter.flatMap`, the conversion of grouped ranks to vectors, the final correlation
  // call and all closing braces are not visible; restore them from the upstream file.
  override def computeCorrelationMatrix(X: RDD[Vector]): Matrix = {
    // ((columnIndex, value), rowUid)
    val colBased = X.zipWithUniqueId().flatMap { case (vec, uid) => { case (v, j) =>
        ((j, v), uid)
    // global sort by (columnIndex, value)
    val sorted = colBased.sortByKey()
    // assign global ranks (using average ranks for tied values)
    val globalRanks = sorted.zipWithIndex().mapPartitions { iter =>
      // State of the run of equal (column, value) entries currently being collected.
      var preCol = -1
      var preVal = Double.NaN
      var startRank = -1.0
      val cachedUids = ArrayBuffer.empty[Long]
      // Emits (uid, (column, averageRank)) for the cached run.
      val flush: () => Iterable[(Long, (Int, Double))] = () => {
        // Ranks in a run are consecutive, so the mean is the first rank plus half the span.
        val averageRank = startRank + (cachedUids.size - 1) / 2.0
        val output = { uid =>
          (uid, (preCol, averageRank))
      iter.flatMap { case (((j, v), uid), rank) =>
        // If we see a new value or cachedUids is too big, we flush ids with their average rank.
        if (j != preCol || v != preVal || cachedUids.size >= 10000000) {
          val output = flush()
          preCol = j
          preVal = v
          startRank = rank
          cachedUids += uid
        } else {
          cachedUids += uid
      } ++ flush()
    // Replace values in the input matrix by their ranks compared with values in the same column.
    // Note that shifting all ranks in a column by a constant value doesn't affect result.
    val groupedRanks = globalRanks.groupByKey().map { case (uid, iter) =>
      // sort by column index and then convert values to a vector
Source File: GLMClassificationModel.scala    From Spark-2.3.1   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.classification.impl

import org.json4s.JsonDSL._
import org.json4s.jackson.JsonMethods._

import org.apache.spark.SparkContext
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.mllib.util.Loader
import org.apache.spark.sql.{Row, SparkSession}

    // Loads the single stored row of model data (weights, intercept, optional threshold)
    // from the data path derived from `path`.
    //
    // Both assertions fail with "Unable to load <modelClass> data from: <dataPath>" when
    // the number of rows is not exactly one or the row does not have three columns.
    //
    // NOTE(review): this excerpt is truncated. The expression that reads `dataRDD`, the
    // receiver of the column selection, both branches of the `threshold` conditional,
    // the returned `Data` value and the closing braces are not visible.
    def loadData(sc: SparkContext, path: String, modelClass: String): Data = {
      val dataPath = Loader.dataPath(path)
      val spark = SparkSession.builder().sparkContext(sc).getOrCreate()
      val dataRDD =
      val dataArray ="weights", "intercept", "threshold").take(1)
      assert(dataArray.length == 1, s"Unable to load $modelClass data from: $dataPath")
      val data = dataArray(0)
      assert(data.size == 3, s"Unable to load $modelClass data from: $dataPath")
      // The third column (threshold) is ignored here and inspected separately below.
      val (weights, intercept) = data match {
        case Row(weights: Vector, intercept: Double, _) =>
          (weights, intercept)
      // Column 2 may be null, i.e. the threshold is optional.
      val threshold = if (data.isNullAt(2)) {
      } else {
Example 172
Source File: LabeledPoint.scala    From Spark-2.3.1   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.regression

import scala.beans.BeanInfo

import org.apache.spark.SparkException
import org.apache.spark.annotation.Since
import{LabeledPoint => NewLabeledPoint}
import org.apache.spark.mllib.linalg.{Vector, Vectors}
import org.apache.spark.mllib.util.NumericParser

  /**
   * Parses a string into a [[LabeledPoint]].
   *
   * Two formats are accepted: strings starting with `(` are handed to `NumericParser`
   * and must yield a label followed by a numeric feature description; any other string
   * is treated as the dense format `label,f1 f2 ...`.
   *
   * @param s textual representation of a labeled point
   * @return the parsed labeled point
   * @throws SparkException if the parenthesized form does not have the expected shape
   */
  def parse(s: String): LabeledPoint = {
    if (s.startsWith("(")) {
      NumericParser.parse(s) match {
        case Seq(label: Double, numeric: Any) =>
          LabeledPoint(label, Vectors.parseNumeric(numeric))
        case other =>
          throw new SparkException(s"Cannot parse $other.")
      }
    } else { // dense format used before v1.0
      val parts = s.split(',')
      val label = java.lang.Double.parseDouble(parts(0))
      val features = Vectors.dense(parts(1).trim().split(' ').map(java.lang.Double.parseDouble))
      LabeledPoint(label, features)
    }
  }

  /**
   * Converts a `NewLabeledPoint` into a [[LabeledPoint]], keeping the label and
   * converting the features with `Vectors.fromML`.
   *
   * @param point labeled point to convert
   * @return the converted labeled point
   */
  private[spark] def fromML(point: NewLabeledPoint): LabeledPoint =
    LabeledPoint(point.label, Vectors.fromML(point.features))
Source File: GLMRegressionModel.scala    From Spark-2.3.1   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.regression.impl

import org.json4s.JsonDSL._
import org.json4s.jackson.JsonMethods._

import org.apache.spark.SparkContext
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.mllib.util.Loader
import org.apache.spark.sql.{Row, SparkSession}

    // Loads the single stored row of model data (weights, intercept) from the data path
    // derived from `path` and checks that the weight vector has `numFeatures` entries.
    //
    // The assertions fail when the number of rows is not exactly one, when the row does
    // not have two columns, or when the weight vector has an unexpected size.
    //
    // NOTE(review): this excerpt is truncated. The expression that reads `dataRDD`, the
    // receiver of the column selection and the closing braces are not visible.
    def loadData(sc: SparkContext, path: String, modelClass: String, numFeatures: Int): Data = {
      val dataPath = Loader.dataPath(path)
      val spark = SparkSession.builder().sparkContext(sc).getOrCreate()
      val dataRDD =
      val dataArray ="weights", "intercept").take(1)
      assert(dataArray.length == 1, s"Unable to load $modelClass data from: $dataPath")
      val data = dataArray(0)
      assert(data.size == 2, s"Unable to load $modelClass data from: $dataPath")
      data match {
        case Row(weights: Vector, intercept: Double) =>
          assert(weights.size == numFeatures, s"Expected $numFeatures features, but" +
            s" found ${weights.size} features when loading $modelClass weights from $dataPath")
          Data(weights, intercept)

Source File: IDFSuite.scala    From Spark-2.3.1   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.feature

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors}
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.mllib.util.TestingUtils._

// Tests for IDF: fitting on three term-frequency vectors and checking both the learned
// idf vector and the resulting tf-idf values.
//
// NOTE(review): this excerpt is truncated. Closing parentheses/braces, the right-hand
// sides of `val model =` and `val localTfidf =`, the `else` value in the second test and
// the calls to `assertHelper` are not visible; restore them from the upstream file.
class IDFSuite extends SparkFunSuite with MLlibTestSparkContext {

  // Default IDF (no minimum document frequency argument).
  test("idf") {
    val n = 4
    val localTermFrequencies = Seq(
      Vectors.sparse(n, Array(1, 3), Array(1.0, 2.0)),
      Vectors.dense(0.0, 1.0, 2.0, 3.0),
      Vectors.sparse(n, Array(1), Array(1.0))
    val m = localTermFrequencies.size
    val termFrequencies = sc.parallelize(localTermFrequencies, 2)
    val idf = new IDF
    val model =
    // 0, 3, 1, 2 are the numbers of input vectors with a non-zero entry at indices 0..3.
    val expected = Vectors.dense(Array(0, 3, 1, 2).map { x =>
      math.log((m + 1.0) / (x + 1.0))
    assert(model.idf ~== expected absTol 1e-12)

    // Checks that each tf-idf vector keeps the kind (sparse/dense) and indices of its
    // input and that every value equals term frequency times the expected idf.
    val assertHelper = (tfidf: Array[Vector]) => {
      assert(tfidf.size === 3)
      val tfidf0 = tfidf(0).asInstanceOf[SparseVector]
      assert(tfidf0.indices === Array(1, 3))
      assert(Vectors.dense(tfidf0.values) ~==
          Vectors.dense(1.0 * expected(1), 2.0 * expected(3)) absTol 1e-12)
      val tfidf1 = tfidf(1).asInstanceOf[DenseVector]
      assert(Vectors.dense(tfidf1.values) ~==
          Vectors.dense(0.0, 1.0 * expected(1), 2.0 * expected(2), 3.0 * expected(3)) absTol 1e-12)
      val tfidf2 = tfidf(2).asInstanceOf[SparseVector]
      assert(tfidf2.indices === Array(1))
      assert(tfidf2.values(0) ~== (1.0 * expected(1)) absTol 1e-12)
    // Transforms a RDD
    val tfidf = model.transform(termFrequencies).collect()
    // Transforms local vectors
    val localTfidf =

  // Same data as above, but the IDF is built with minDocFreq = 1.
  test("idf minimum document frequency filtering") {
    val n = 4
    val localTermFrequencies = Seq(
      Vectors.sparse(n, Array(1, 3), Array(1.0, 2.0)),
      Vectors.dense(0.0, 1.0, 2.0, 3.0),
      Vectors.sparse(n, Array(1), Array(1.0))
    val m = localTermFrequencies.size
    val termFrequencies = sc.parallelize(localTermFrequencies, 2)
    val idf = new IDF(minDocFreq = 1)
    val model =
    // Terms with a document frequency of zero take the `else` branch instead of the
    // log formula. NOTE(review): the `else` value itself is missing from this excerpt.
    val expected = Vectors.dense(Array(0, 3, 1, 2).map { x =>
      if (x > 0) {
        math.log((m + 1.0) / (x + 1.0))
      } else {
    assert(model.idf ~== expected absTol 1e-12)

    // Same checks as in the first test, against this test's `expected` vector.
    val assertHelper = (tfidf: Array[Vector]) => {
      assert(tfidf.size === 3)
      val tfidf0 = tfidf(0).asInstanceOf[SparseVector]
      assert(tfidf0.indices === Array(1, 3))
      assert(Vectors.dense(tfidf0.values) ~==
          Vectors.dense(1.0 * expected(1), 2.0 * expected(3)) absTol 1e-12)
      val tfidf1 = tfidf(1).asInstanceOf[DenseVector]
      assert(Vectors.dense(tfidf1.values) ~==
          Vectors.dense(0.0, 1.0 * expected(1), 2.0 * expected(2), 3.0 * expected(3)) absTol 1e-12)
      val tfidf2 = tfidf(2).asInstanceOf[SparseVector]
      assert(tfidf2.indices === Array(1))
      assert(tfidf2.values(0) ~== (1.0 * expected(1)) absTol 1e-12)
    // Transforms a RDD
    val tfidf = model.transform(termFrequencies).collect()
    // Transforms local vectors
    val localTfidf =

Source File: PCASuite.scala    From Spark-2.3.1   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.feature

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.linalg.{Vector, Vectors}
import org.apache.spark.mllib.linalg.distributed.RowMatrix
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.mllib.util.TestingUtils._

// Tests for the PCA wrapper and for PCAUtil.memoryCost.
//
// NOTE(review): this excerpt is truncated. Closing parentheses/braces and the expression
// that pairs `pca_transform` with `mat_multiply` before the `{ case (calculated, expected)`
// block are not visible; restore them from the upstream file.
class PCASuite extends SparkFunSuite with MLlibTestSparkContext {

  // Three 5-dimensional vectors (one sparse, two dense) shared by the tests.
  private val data = Array(
    Vectors.sparse(5, Seq((1, 1.0), (3, 7.0))),
    Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0),
    Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0)

  // Lazy so that `sc` is only touched once a test actually uses the RDD.
  private lazy val dataRDD = sc.parallelize(data, 2)

  // Compares the PCA wrapper's output with the principal components and explained
  // variance computed directly on a RowMatrix, within a relative tolerance of 1e-8.
  test("Correct computing use a PCA wrapper") {
    // k is set to the number of rows in the data set.
    val k = dataRDD.count().toInt
    val pca = new PCA(k).fit(dataRDD)

    val mat = new RowMatrix(dataRDD)
    val (pc, explainedVariance) = mat.computePrincipalComponentsAndExplainedVariance(k)

    val pca_transform = pca.transform(dataRDD).collect()
    val mat_multiply = mat.multiply(pc).rows.collect() { case (calculated, expected) =>
      assert(calculated ~== expected relTol 1e-8)
    assert(pca.explainedVariance ~== explainedVariance relTol 1e-8)

  // The second assertion expects a result larger than Int.MaxValue, i.e. the cost must
  // not be computed in 32-bit arithmetic.
  test("memory cost computation") {
    assert(PCAUtil.memoryCost(10, 100) < Int.MaxValue)
    // check overflowing
    assert(PCAUtil.memoryCost(40000, 60000) > Int.MaxValue)
Source File: FeaturesParser.scala    From spark-anomaly-detection   with MIT License 5 votes vote down vote up

import org.apache.spark.mllib.linalg.{Vector, Vectors}
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD

// Parsers that turn comma-separated text lines into MLlib vectors or labeled points.
//
// NOTE(review): this excerpt is truncated. The receivers and leading calls of the
// expressions ending in `",").map(_.toDouble))` and starting with `=> ...`, the
// returned values and the closing braces are not visible; restore them from upstream.
object FeaturesParser{
  // Converts each line into a dense vector of its comma-separated numeric fields.
  def parseFeatures(rawdata: RDD[String]): RDD[Vector] = {
    val rdd: RDD[Array[Double]] =",").map(_.toDouble))
    val vectors: RDD[Vector] = => Vectors.dense(arrDouble))

  // Converts each line into a LabeledPoint: field 0 is the label, the remaining fields
  // form the dense feature vector.
  def parseFeaturesWithLabel(cvData: RDD[String]): RDD[LabeledPoint] = {
    val rdd: RDD[Array[Double]] =",").map(_.toDouble))
    val labeledPoints = => new LabeledPoint(arrDouble(0), Vectors.dense(arrDouble.slice(1, arrDouble.length))))
Example 177
Source File: AnomalyDetection$Test.scala    From spark-anomaly-detection   with MIT License 5 votes vote down vote up

import com.holdenkarau.spark.testing.SharedSparkContext
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD
import org.scalactic.Equality
import org.scalatest.{FlatSpec, FunSuite, Matchers}

// FlatSpec tests for AnomalyDetection: the probability function, prediction against an
// epsilon threshold, model training and epsilon optimisation.
//
// NOTE(review): this excerpt is truncated. Closing braces and the start of the
// expression in `case v: Vector =>` (whatever pairs up the elements of the two vectors)
// are not visible; restore them from the upstream file.
class AnomalyDetection$Test extends FlatSpec with Matchers with SharedSparkContext {
    // Fixture: a two-dimensional point with per-dimension means and variances.
    val point = Vectors.dense(Array(14.8593411857427, 14.9006647394062))
    val means = Vectors.dense(Array(14.1122257839456, 14.9977105081362))
    val variances = Vectors.dense(Array(1.83263141349452, 1.70974533082878))

    "probFunction" should "return correct product value" in {
      val p = AnomalyDetection.probFunction(point, means, variances)
      assert(p === 0.0769984879544 +- 0.0001)

    // With the fixture above, predict is expected to be false for epsilon 0.05 ...
    "predict" should "predict the anomaly" in {
      assert(!AnomalyDetection.predict(point, means, variances, 0.05))

    // ... and true for epsilon 0.08.
    "predict" should "predict non anomaly" in {
      assert(AnomalyDetection.predict(point, means, variances, 0.08))

  // Equality for Vectors that compares element pairs with a tolerance of 0.001 and
  // treats any non-Vector right-hand side as unequal.
  private def vectorequality() = {
    new Equality[Vector] {
      def areEqual(a: Vector, b: Any): Boolean =
        b match {
          case v: Vector => => pair._1 === pair._2 +- 0.001).reduce((a, b) => a && b)
          case _ => false

  // Trains a model from the CSV of training examples under src/test/resources.
  def trainModel(): AnomalyDetectionModel = {
    val trainingExamplesFilePath = "./src/test/resources/training.csv"
    val trainingData = sc.textFile(trainingExamplesFilePath, 2).cache()
    val trainingRdd = FeaturesParser.parseFeatures(trainingData)
    new AnomalyDetection().run(trainingRdd)

  "run" should "return model with correct mean and variance" in {
    val model: AnomalyDetectionModel = trainModel()

    //use scalactic's more relaxing equality
    implicit val vectorEq = vectorequality()

    assert(model.means === Vectors.dense(Array(79.9843751617201, 5.13662727300755)))
    assert(model.variances === Vectors.dense(Array(356.44539323536225, 3.79818173645375)))

  "optimize" should "calculate epsilon and F1 score" in {
    val cvFilePath = "./src/test/resources/cross_val.csv"
    val cvData = sc.textFile(cvFilePath, 2).cache()
    val cvPointsRdd: RDD[LabeledPoint] = FeaturesParser.parseFeaturesWithLabel(cvData)

    val model = trainModel()
    val optimalModel = new AnomalyDetection().optimize(cvPointsRdd, model)
    assert(optimalModel.epsilon === 3.382218E-4 +- 0.0000000001)

Source File: ROC.scala    From s4ds   with Apache License 2.0 5 votes vote down vote up
import org.apache.spark.{SparkConf, SparkContext}
import{HashingTF, Tokenizer, StringIndexer}
import{ParamGridBuilder, CrossValidator}
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.sql.{Row, SQLContext}
import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics

import breeze.linalg._
import breeze.plot._
import org.jfree.chart.axis.NumberTickUnit

// Script that computes the ROC curve for stored classifier output
// ("transformedTest.parquet") and plots it with breeze-viz.
//
// NOTE(review): this excerpt is truncated. The expressions that load the parquet data,
// select the "probability"/"label" columns and extract the two coordinate series from
// `roc`, as well as closing braces, are not visible; restore them from upstream.
object ROC extends App {

  val conf = new SparkConf().setAppName("ROC")
  val sc = new SparkContext(conf)
  val sqlContext = new SQLContext(sc)
  import sqlContext._
  import sqlContext.implicits._

  val transformedTest ="transformedTest.parquet")

  // Pairs element 1 of the probability vector with the label.
  // NOTE(review): presumably element 1 is the positive-class probability -- confirm.
  val labelScores ="probability", "label").map {
    case Row(probability:Vector, label:Double) => (probability(1), label)

  // 300 is the second constructor argument (numBins in Spark MLlib -- TODO confirm).
  val bm = new BinaryClassificationMetrics(labelScores, 300)
  val roc = bm.roc.collect
  roc.foreach { println }

  // First and second components of each ROC point.
  val falsePositives = { _._1 }
  val truePositives = { _._2 }

  val f = Figure()
  val p = f.subplot(0)
  p += plot(falsePositives, truePositives)
  p.xlabel = "false positives"
  p.ylabel = "true positives"
  // Restrict the x axis to [0.0, 0.1] and set the tick spacing on both axes.
  p.xlim = (0.0, 0.1)
  p.xaxis.setTickUnit(new NumberTickUnit(0.01))
  p.yaxis.setTickUnit(new NumberTickUnit(0.1))

Example 179
Source File: LogisticRegressionDemo.scala    From s4ds   with Apache License 2.0 5 votes vote down vote up
import org.apache.spark.{SparkConf, SparkContext}
import{HashingTF, Tokenizer, StringIndexer}
import{ParamGridBuilder, CrossValidator}
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.sql.{Row, SQLContext}
import org.apache.spark.sql.SaveMode

/**
 * A text document together with the file it came from and its category label.
 *
 * @param fileName name of the file the text was read from
 * @param text content of the document
 * @param category category assigned to the document
 */
case class LabelledDocument(
    fileName: String,
    text: String,
    category: String)

// Script that trains a spam/ham text classifier with a four-stage pipeline
// (StringIndexer, Tokenizer, HashingTF, LogisticRegression), reports the number of
// misclassified documents and writes predictions to parquet.
//
// NOTE(review): this excerpt is truncated. The expressions that build `spamDocuments`
// and `hamDocuments`, the right-hand side of `val model =`, the receivers of the two
// column selections near the end and closing braces are not visible; the last two
// lines are fused fragments. Restore them from the upstream file.
object LogisticRegressionDemo extends App {

  val conf = new SparkConf().setAppName("LrTest")
  val sc = new SparkContext(conf)
  val sqlContext = new SQLContext(sc)
  import sqlContext._
  import sqlContext.implicits._

  // wholeTextFiles yields (file name, file content) pairs for every matching file.
  val spamText = sc.wholeTextFiles("spam/*")
  val hamText = sc.wholeTextFiles("ham/*")

  // Tag every document with the category of the directory it came from.
  val spamDocuments = { 
    case (fileName, text) => LabelledDocument(fileName, text, "spam")
  val hamDocuments = {
    case (fileName, text) => LabelledDocument(fileName, text, "ham")

  val documentsDF = spamDocuments.union(hamDocuments).toDF

  // Random 70% / 30% split into training and test sets.
  val Array(trainDF, testDF) = documentsDF.randomSplit(Array(0.7, 0.3))

  // category -> label, text -> words, words -> features; the logistic regression is
  // configured with at most 50 iterations and a regularization parameter of 0.0.
  val indexer = new StringIndexer().setInputCol("category").setOutputCol("label")
  val tokenizer = new Tokenizer().setInputCol("text").setOutputCol("words")
  val hasher = new HashingTF().setInputCol("words").setOutputCol("features")
  val lr = new LogisticRegression().setMaxIter(50).setRegParam(0.0)

  val pipeline = new Pipeline().setStages(Array(indexer, tokenizer, hasher, lr))
  val model =

  val transformedTrain = model.transform(trainDF)
  val transformedTest = model.transform(testDF)

  // NOTE(review): println is called with several arguments here, so the values are
  // printed as one tuple rather than as a formatted sentence.
  println("in sample misclassified:", transformedTrain.filter($"prediction" !== $"label").count,
    " / ",transformedTrain.count)
  println("out sample misclassified:", transformedTest.filter($"prediction" !== $"label").count,
    " / ",transformedTest.count)"fileName", "label", "prediction", "probability")
    .write.mode(SaveMode.Overwrite).parquet("transformedTrain.parquet")"fileName", "label", "prediction", "probability")
Source File: DataFrameExample.scala    From BigDatalog   with Apache License 2.0 5 votes vote down vote up
// scalastyle:off println


import scopt.OptionParser

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.examples.mllib.AbstractParams
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.mllib.stat.MultivariateOnlineSummarizer
import org.apache.spark.sql.{DataFrame, Row, SQLContext}

// Example application that loads a LIBSVM data set as a DataFrame, prints summary
// statistics for labels and features, and round-trips the data through a Parquet file.
//
// NOTE(review): this excerpt is truncated. The option definition in the parser, the
// bodies of `checkConfig`, `map` and `getOrElse`, the receivers of the load/select
// calls, the schema/summary printing, the Parquet write and read, and closing braces
// are not visible. `Files` and `File` are used without a visible import. Restore all
// of these from the upstream file.
object DataFrameExample {

  // Command-line parameters; `input` defaults to the bundled sample LIBSVM file.
  case class Params(input: String = "data/mllib/sample_libsvm_data.txt")
    extends AbstractParams[Params]

  // Parses the command line into Params and hands the result to `run`.
  def main(args: Array[String]) {
    val defaultParams = Params()

    val parser = new OptionParser[Params]("DataFrameExample") {
      head("DataFrameExample: an example app using DataFrame for ML.")
        .text(s"input path to dataframe")
        .action((x, c) => c.copy(input = x))
      checkConfig { params =>

    parser.parse(args, defaultParams).map { params =>
    }.getOrElse {

  // Runs the example with the given parameters.
  def run(params: Params) {

    val conf = new SparkConf().setAppName(s"DataFrameExample with $params")
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)

    // Load input data
    println(s"Loading LIBSVM file with UDT from ${params.input}.")
    val df: DataFrame ="libsvm").load(params.input).cache()
    println("Schema from LIBSVM:")
    println(s"Loaded training data as a DataFrame with ${df.count()} records.")

    // Show statistical summary of labels.
    val labelSummary = df.describe("label")

    // Convert features column to an RDD of vectors.
    val features ="features").map { case Row(v: Vector) => v }
    // Fold every feature vector into one summarizer, merging the partial summaries.
    val featureSummary = features.aggregate(new MultivariateOnlineSummarizer())(
      (summary, feat) => summary.add(feat),
      (sum1, sum2) => sum1.merge(sum2))
    println(s"Selected features column with average values:\n ${featureSummary.mean.toString}")

    // Save the records in a parquet file.
    val tmpDir = Files.createTempDir()
    val outputDir = new File(tmpDir, "dataframe").toString
    println(s"Saving to $outputDir as Parquet file.")

    // Load the records back.
    println(s"Loading Parquet file with UDT from $outputDir.")
    val newDF =
    println(s"Schema from Parquet:")

