org.apache.spark.mllib.clustering.GaussianMixture Scala Examples

The following examples show how to use org.apache.spark.mllib.clustering.GaussianMixture. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example.
Example 1
Source File: GMM.scala    From spark-tda   with Apache License 2.0 6 votes vote down vote up
import java.io.{File, PrintWriter}
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.clustering.GaussianMixture
import org.apache.spark.sql.functions._

def computeGaussianMixtureModel(
  pathToTextFile: String,
  quantity: Int) {
  case class Point(x: Double, y: Double)

  def save(f: File)(func: PrintWriter => Unit) {
    val p = new PrintWriter(f)
    try {
      func(p)
    } finally {
      p.close()
    }
  }

  val filename = pathToTextFile.split("\\.")(0)

  val outputFilename = s"$filename-GMM-k${quantity}.tsv"

  val points = sc
    .textFile(pathToTextFile)
    .map {
      line => line.trim.split("\\s+")
    }
    .map {
      row => Point(row(0).toDouble, row(1).toDouble)
    }

  val features = points
    .map {
      p => Vectors.dense(p.x, p.y)
    }

  features.cache()

  val gmm = new GaussianMixture()
    .setK(quantity)
    .run(features)

  val predictions = features
    .map {
      f => (f(0), f(1), gmm.predict(f) + 1)
    }
    .collect

  save(new File(outputFilename)) {
    println(s"OUTPUT TO: ${outputFilename}")
    f => predictions.foreach{
      case (x, y, ccid) => f.println(s"${x}\t${y}\t${ccid}")
    }
  }
} 
Example 2
Source File: GaussianMixtureExample.scala    From drizzle-spark   with Apache License 2.0 5 votes vote down vote up
// scalastyle:off println
package org.apache.spark.examples.mllib

import org.apache.spark.{SparkConf, SparkContext}
// $example on$
import org.apache.spark.mllib.clustering.{GaussianMixture, GaussianMixtureModel}
import org.apache.spark.mllib.linalg.Vectors
// $example off$

object GaussianMixtureExample {

  def main(args: Array[String]) {

    val conf = new SparkConf().setAppName("GaussianMixtureExample")
    val sc = new SparkContext(conf)

    // $example on$
    // Load and parse the data
    val data = sc.textFile("data/mllib/gmm_data.txt")
    val parsedData = data.map(s => Vectors.dense(s.trim.split(' ').map(_.toDouble))).cache()

    // Cluster the data into two classes using GaussianMixture
    val gmm = new GaussianMixture().setK(2).run(parsedData)

    // Save and load model
    gmm.save(sc, "target/org/apache/spark/GaussianMixtureExample/GaussianMixtureModel")
    val sameModel = GaussianMixtureModel.load(sc,
      "target/org/apache/spark/GaussianMixtureExample/GaussianMixtureModel")

    // output parameters of max-likelihood model
    for (i <- 0 until gmm.k) {
      println("weight=%f\nmu=%s\nsigma=\n%s\n" format
        (gmm.weights(i), gmm.gaussians(i).mu, gmm.gaussians(i).sigma))
    }
    // $example off$

    sc.stop()
  }
}
// scalastyle:on println 
Example 3
Source File: GaussianMixtureExample.scala    From sparkoscope   with Apache License 2.0 5 votes vote down vote up
// scalastyle:off println
package org.apache.spark.examples.mllib

import org.apache.spark.{SparkConf, SparkContext}
// $example on$
import org.apache.spark.mllib.clustering.{GaussianMixture, GaussianMixtureModel}
import org.apache.spark.mllib.linalg.Vectors
// $example off$

object GaussianMixtureExample {

  def main(args: Array[String]) {

    val conf = new SparkConf().setAppName("GaussianMixtureExample")
    val sc = new SparkContext(conf)

    // $example on$
    // Load and parse the data
    val data = sc.textFile("data/mllib/gmm_data.txt")
    val parsedData = data.map(s => Vectors.dense(s.trim.split(' ').map(_.toDouble))).cache()

    // Cluster the data into two classes using GaussianMixture
    val gmm = new GaussianMixture().setK(2).run(parsedData)

    // Save and load model
    gmm.save(sc, "target/org/apache/spark/GaussianMixtureExample/GaussianMixtureModel")
    val sameModel = GaussianMixtureModel.load(sc,
      "target/org/apache/spark/GaussianMixtureExample/GaussianMixtureModel")

    // output parameters of max-likelihood model
    for (i <- 0 until gmm.k) {
      println("weight=%f\nmu=%s\nsigma=\n%s\n" format
        (gmm.weights(i), gmm.gaussians(i).mu, gmm.gaussians(i).sigma))
    }
    // $example off$

    sc.stop()
  }
}
// scalastyle:on println 
Example 4
Source File: GaussianMixtureExample.scala    From multi-tenancy-spark   with Apache License 2.0 5 votes vote down vote up
// scalastyle:off println
package org.apache.spark.examples.mllib

import org.apache.spark.{SparkConf, SparkContext}
// $example on$
import org.apache.spark.mllib.clustering.{GaussianMixture, GaussianMixtureModel}
import org.apache.spark.mllib.linalg.Vectors
// $example off$

object GaussianMixtureExample {

  def main(args: Array[String]) {

    val conf = new SparkConf().setAppName("GaussianMixtureExample")
    val sc = new SparkContext(conf)

    // $example on$
    // Load and parse the data
    val data = sc.textFile("data/mllib/gmm_data.txt")
    val parsedData = data.map(s => Vectors.dense(s.trim.split(' ').map(_.toDouble))).cache()

    // Cluster the data into two classes using GaussianMixture
    val gmm = new GaussianMixture().setK(2).run(parsedData)

    // Save and load model
    gmm.save(sc, "target/org/apache/spark/GaussianMixtureExample/GaussianMixtureModel")
    val sameModel = GaussianMixtureModel.load(sc,
      "target/org/apache/spark/GaussianMixtureExample/GaussianMixtureModel")

    // output parameters of max-likelihood model
    for (i <- 0 until gmm.k) {
      println("weight=%f\nmu=%s\nsigma=\n%s\n" format
        (gmm.weights(i), gmm.gaussians(i).mu, gmm.gaussians(i).sigma))
    }
    // $example off$

    sc.stop()
  }
}
// scalastyle:on println 
Example 5
Source File: DenseGaussianMixture.scala    From iolap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.examples.mllib

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.clustering.GaussianMixture
import org.apache.spark.mllib.linalg.Vectors


object DenseGaussianMixture {
  def main(args: Array[String]): Unit = {
    if (args.length < 3) {
      println("usage: DenseGmmEM <input file> <k> <convergenceTol> [maxIterations]")
    } else {
      val maxIterations = if (args.length > 3) args(3).toInt else 100
      run(args(0), args(1).toInt, args(2).toDouble, maxIterations)
    }
  }

  private def run(inputFile: String, k: Int, convergenceTol: Double, maxIterations: Int) {
    val conf = new SparkConf().setAppName("Gaussian Mixture Model EM example")
    val ctx = new SparkContext(conf)

    val data = ctx.textFile(inputFile).map { line =>
      Vectors.dense(line.trim.split(' ').map(_.toDouble))
    }.cache()

    val clusters = new GaussianMixture()
      .setK(k)
      .setConvergenceTol(convergenceTol)
      .setMaxIterations(maxIterations)
      .run(data)

    for (i <- 0 until clusters.k) {
      println("weight=%f\nmu=%s\nsigma=\n%s\n" format
        (clusters.weights(i), clusters.gaussians(i).mu, clusters.gaussians(i).sigma))
    }

    println("Cluster labels (first <= 100):")
    val clusterLabels = clusters.predict(data)
    clusterLabels.take(100).foreach { x =>
      print(" " + x)
    }
    println()
  }
} 
Example 6
Source File: DenseGaussianMixture.scala    From spark1.52   with Apache License 2.0 5 votes vote down vote up
// scalastyle:off println
package org.apache.spark.examples.mllib

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.clustering.GaussianMixture
import org.apache.spark.mllib.linalg.Vectors


object DenseGaussianMixture {
  def main(args: Array[String]): Unit = {
    if (args.length < 3) {
      println("usage: DenseGmmEM <input file> <k> <convergenceTol> [maxIterations]")
    } else {
      val maxIterations = if (args.length > 3) args(3).toInt else 100
      run(args(0), args(1).toInt, args(2).toDouble, maxIterations)
    }
  }

  private def run(inputFile: String, k: Int, convergenceTol: Double, maxIterations: Int) {
    val conf = new SparkConf().setAppName("Gaussian Mixture Model EM example")
    val ctx = new SparkContext(conf)

    val data = ctx.textFile(inputFile).map { line =>
      Vectors.dense(line.trim.split(' ').map(_.toDouble))
    }.cache()

    val clusters = new GaussianMixture()
      .setK(k)//聚类的个数
      .setConvergenceTol(convergenceTol)
      .setMaxIterations(maxIterations)
      .run(data)

    for (i <- 0 until clusters.k) {
      println("weight=%f\nmu=%s\nsigma=\n%s\n" format
        (clusters.weights(i), clusters.gaussians(i).mu, clusters.gaussians(i).sigma))
    }

    println("Cluster labels (first <= 100):")
    val clusterLabels = clusters.predict(data)
    clusterLabels.take(100).foreach { x =>
      print(" " + x)
    }
    println()
  }
}
// scalastyle:on println 
Example 7
Source File: GaussianMixtureExample.scala    From Spark-2.3.1   with Apache License 2.0 5 votes vote down vote up
// scalastyle:off println
package org.apache.spark.examples.mllib

import org.apache.spark.{SparkConf, SparkContext}
// $example on$
import org.apache.spark.mllib.clustering.{GaussianMixture, GaussianMixtureModel}
import org.apache.spark.mllib.linalg.Vectors
// $example off$

object GaussianMixtureExample {

  def main(args: Array[String]) {

    val conf = new SparkConf().setAppName("GaussianMixtureExample")
    val sc = new SparkContext(conf)

    // $example on$
    // Load and parse the data
    val data = sc.textFile("data/mllib/gmm_data.txt")
    val parsedData = data.map(s => Vectors.dense(s.trim.split(' ').map(_.toDouble))).cache()

    // Cluster the data into two classes using GaussianMixture
    val gmm = new GaussianMixture().setK(2).run(parsedData)

    // Save and load model
    gmm.save(sc, "target/org/apache/spark/GaussianMixtureExample/GaussianMixtureModel")
    val sameModel = GaussianMixtureModel.load(sc,
      "target/org/apache/spark/GaussianMixtureExample/GaussianMixtureModel")

    // output parameters of max-likelihood model
    for (i <- 0 until gmm.k) {
      println("weight=%f\nmu=%s\nsigma=\n%s\n" format
        (gmm.weights(i), gmm.gaussians(i).mu, gmm.gaussians(i).sigma))
    }
    // $example off$

    sc.stop()
  }
}
// scalastyle:on println 
Example 8
Source File: DenseGaussianMixture.scala    From BigDatalog   with Apache License 2.0 5 votes vote down vote up
// scalastyle:off println
package org.apache.spark.examples.mllib

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.clustering.GaussianMixture
import org.apache.spark.mllib.linalg.Vectors


object DenseGaussianMixture {
  def main(args: Array[String]): Unit = {
    if (args.length < 3) {
      println("usage: DenseGmmEM <input file> <k> <convergenceTol> [maxIterations]")
    } else {
      val maxIterations = if (args.length > 3) args(3).toInt else 100
      run(args(0), args(1).toInt, args(2).toDouble, maxIterations)
    }
  }

  private def run(inputFile: String, k: Int, convergenceTol: Double, maxIterations: Int) {
    val conf = new SparkConf().setAppName("Gaussian Mixture Model EM example")
    val ctx = new SparkContext(conf)

    val data = ctx.textFile(inputFile).map { line =>
      Vectors.dense(line.trim.split(' ').map(_.toDouble))
    }.cache()

    val clusters = new GaussianMixture()
      .setK(k)
      .setConvergenceTol(convergenceTol)
      .setMaxIterations(maxIterations)
      .run(data)

    for (i <- 0 until clusters.k) {
      println("weight=%f\nmu=%s\nsigma=\n%s\n" format
        (clusters.weights(i), clusters.gaussians(i).mu, clusters.gaussians(i).sigma))
    }

    println("Cluster labels (first <= 100):")
    val clusterLabels = clusters.predict(data)
    clusterLabels.take(100).foreach { x =>
      print(" " + x)
    }
    println()
  }
}
// scalastyle:on println