Example 1
Source File: Kinship.scala    From seqspark   with Apache License 2.0 5 votes vote down vote up
package org.dizhang.seqspark.stat

import org.apache.spark.rdd.RDD
import org.dizhang.seqspark.ds._
import breeze.linalg.{DenseVector, SparseVector, Vector}
import org.apache.spark.SparkContext

import scala.collection.mutable.ArrayBuffer

  def removeNums(size: Int, nums: IndexedSeq[Int]): IndexedSeq[Int] = {
    var j: Int = 0
    var i: Int = 0
    val res = ArrayBuffer[Int]()
    while (i < size) {
      if (j >= nums.length) {
      } else if (i == nums(j)) {
        j += 1
      } else {
      i += 1

Example 2
Source File: LibSvmTest.scala    From scio   with Apache License 2.0 5 votes vote down vote up
package com.spotify.scio.extra.libsvm

import breeze.linalg.SparseVector
import com.spotify.scio.testing.PipelineSpec

class LibSvmTest extends PipelineSpec {
  val expected = List(
    (0.0, SparseVector[Double](34)((0, 1), (8, 1), (18, 1), (20, 1), (23, 1), (33, 1))),
    (1.0, SparseVector[Double](34)((2, 1), (8, 1), (18, 1), (20, 1), (29, 1), (33, 1))),
    (0.0, SparseVector[Double](34)((0, 1), (8, 1), (19, 1), (20, 1), (23, 1), (33, 1)))

  val data = List(
    "0 1:1 9:1 19:1 21:1 24:1 34:1",
    "1 3:1 9:1 19:1 21:1 30:1 34:1",
    "0 1:1 9:1 20:1 21:1 24:1 34:1"

  "libSVMCollection" should "parse libsvm files" in {
    runWithContext { sc =>
      val res = libSVMCollection(sc.parallelize(data))
      res should containInAnyOrder(expected)

  it should "parse libsvm files with length" in {
    runWithContext { sc =>
      val res = libSVMCollection(sc.parallelize(data), 34)
      res should containInAnyOrder(expected)
Example 3
Source File: BreezeSpec.scala    From scio   with Apache License 2.0 5 votes vote down vote up
package com.spotify.scio.extra

import breeze.linalg.{DenseMatrix, DenseVector, SparseVector}
import breeze.stats.distributions.Rand
import com.spotify.scio.extra.Breeze._
import com.twitter.algebird.Semigroup
import org.scalacheck._

trait BreezeSpec[M[_], T] extends PropertySpec {
  val dimension = 10
  val rows = 20
  val cols = 10
  val fRand =
  val m: Gen[M[T]]
  def ms: Gen[List[M[T]]] = Gen.listOf[M[T]](m)
  def plus(x: M[T], y: M[T])(implicit sg: Semigroup[M[T]]): M[T] =, y)
  def sumOption(xs: Iterable[M[T]])(implicit sg: Semigroup[M[T]]): Option[M[T]] = sg.sumOption(xs)

class FloatDenseVectorSpec extends BreezeSpec[DenseVector, Float] {
  val m = Gen.const(dimension).map(DenseVector.rand[Float](_, fRand))

  property("plus") {
    forAll(m, m)((x, y) => plus(x, y) == x + y)
  property("sumOption") {
    forAll(ms)(xs => sumOption(xs) == xs.reduceLeftOption(_ + _))

class DoubleDenseVectorSpec extends BreezeSpec[DenseVector, Double] {
  val m = Gen.const(dimension).map(DenseVector.rand[Double](_))
  property("plus") {
    forAll(m, m)((x, y) => plus(x, y) == x + y)
  property("sumOption") {
    forAll(ms)(xs => sumOption(xs) == xs.reduceLeftOption(_ + _))

class FloatDenseMatrixSpec extends BreezeSpec[DenseMatrix, Float] {
  val m = Gen.const((rows, cols)).map {
    case (r, c) => DenseMatrix.rand[Float](r, c, fRand)
  property("plus") {
    forAll(m, m)((x, y) => plus(x, y) == x + y)
  property("sumOption") {
    forAll(ms)(xs => sumOption(xs) == xs.reduceLeftOption(_ + _))

class DoubleDenseMatrixSpec extends BreezeSpec[DenseMatrix, Double] {
  val m = Gen.const((rows, cols)).map {
    case (r, c) => DenseMatrix.rand[Double](r, c)
  property("plus") {
    forAll(m, m)((x, y) => plus(x, y) == x + y)
  property("sumOption") {
    forAll(ms)(xs => sumOption(xs) == xs.reduceLeftOption(_ + _))

class FloatSparseVectorSpec extends BreezeSpec[SparseVector, Float] {
  val m = Gen
    .map(d => SparseVector(DenseVector.rand[Float](d, fRand).data))

  property("plus") {
    forAll(m, m)((x, y) => plus(x, y) == x + y)
  property("sumOption") {
    forAll(ms)(xs => sumOption(xs) == xs.reduceLeftOption(_ + _))

class DoubleSparseVectorSpec extends BreezeSpec[SparseVector, Double] {
  val m = Gen
    .map(d => SparseVector(DenseVector.rand[Double](d).data))

  property("plus") {
    forAll(m, m)((x, y) => plus(x, y) == x + y)
  property("sumOption") {
    forAll(ms)(xs => sumOption(xs) == xs.reduceLeftOption(_ + _))
Example 4
Source File: PassiveAggressiveBinaryModelEvaluation.scala    From flink-parameter-server   with Apache License 2.0 5 votes vote down vote up

import breeze.linalg.{DenseVector, SparseVector}
import org.slf4j.LoggerFactory

class PassiveAggressiveBinaryModelEvaluation

object PassiveAggressiveBinaryModelEvaluation {

  private val log = LoggerFactory.getLogger(classOf[PassiveAggressiveBinaryModelEvaluation])

  def accuracy(model: DenseVector[Double],
               testLines: Traversable[(SparseVector[Double], Option[Boolean])],
               featureCount: Int,
               pac: PassiveAggressiveBinaryAlgorithm): Double = {

    var tt = 0
    var ff = 0
    var tf = 0
    var ft = 0
    var cnt = 0
    testLines.foreach { case (vector, label) => label match {
      case Some(lab) =>
        val real = lab
        val predicted = pac.predict(vector, model)
        (real, predicted) match {
          case (true, true) => tt +=1
          case (false, false) => ff +=1
          case (true, false) => tf +=1
          case (false, true) => ft +=1
        cnt += 1
      case _ => throw new IllegalStateException("Labels shold not be missing.")
    val percent = ((tt + ff).toDouble / cnt) * 100


Example 5
Source File: PassiveAggressiveMultiModelEvaluation.scala    From flink-parameter-server   with Apache License 2.0 5 votes vote down vote up

import breeze.linalg.{DenseMatrix, SparseVector}
import org.slf4j.LoggerFactory

class PassiveAggressiveMultiModelEvaluation

object PassiveAggressiveMultiModelEvaluation {

  private val log = LoggerFactory.getLogger(classOf[PassiveAggressiveMultiModelEvaluation])

  def accuracy(model: DenseMatrix[Double], testLines: Traversable[(SparseVector[Double], Option[Int])],
               featureCount: Int, pac: PassiveAggressiveMulticlassAlgorithm): Double = {

    var hit = 0
    var cnt = 0
    testLines.foreach{case(vector, label) => label match {
      case Some(l) =>
      if (pac.predict(vector, model) == l) hit += 1
      cnt += 1
      case _ => throw new IllegalStateException("Labels should not be missing.")
    val percent = (hit.toDouble / cnt) * 100

Example 6
Source File: driver.scala    From proxcocoa   with Apache License 2.0 5 votes vote down vote up
package l1distopt

import breeze.linalg.SparseVector
import l1distopt.utils._
import l1distopt.solvers._
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkContext, SparkConf}

object driver {

  def main(args: Array[String]) {

    val options = { arg =>
      arg.dropWhile(_ == '-').split('=') match {
        case Array(opt, v) => (opt -> v)
        case Array(opt) => (opt -> "true")
        case _ => throw new IllegalArgumentException("Invalid argument: " + arg)

    // read in inputs
    val master = options.getOrElse("master", "local[4]")
    val trainFile = options.getOrElse("trainFile", "")
    val numFeatures = options.getOrElse("numFeatures", "0").toInt
    val numSplits = options.getOrElse("numSplits", "1").toInt
    val testFile = options.getOrElse("testFile", "")
    // algorithm-specific inputs
    val eta = options.getOrElse("eta", "1.0").toDouble // elastic net parameter: 1.0 = lasso, 0.0 = ridge regression
    val lambda = options.getOrElse("lambda", "0.01").toDouble // regularization parameter
    val numRounds = options.getOrElse("numRounds", "200").toInt // number of outer iterations, called T in the paper
    val localIterFrac = options.getOrElse("localIterFrac", "1.0").toDouble; // fraction of local points to be processed per round, H = localIterFrac * n
    val debugIter = options.getOrElse("debugIter", "10").toInt // set to -1 to turn off debugging output
    val seed = options.getOrElse("seed", "0").toInt // set seed for debug purposes

    // print out inputs
    println("master:       " + master);          println("trainFile:    " + trainFile);
    println("numFeatures:  " + numFeatures);     println("numSplits:    " + numSplits);      
    println("testfile:     " + testFile);        println("eta           " + eta);       
    println("lambda:       " + lambda);          println("numRounds:    " + numRounds);       
    println("localIterFrac:" + localIterFrac);   println("debugIter     " + debugIter);       
    println("seed          " + seed);            

    // start spark context
    val conf = new SparkConf().setMaster(master)
    val sc = new SparkContext(conf)


    val finalAlphaCoCoA = ProxCoCoAp.runProxCoCoAp(data, labels, params, debug)
Example 7
Source File: NearestNeighbors.scala    From SparkSMOTE   with MIT License 5 votes vote down vote up
package utils

import org.apache.spark.SparkContext
import breeze.linalg._
import breeze.linalg.{DenseVector,Vector,SparseVector}
import com.github.fommil.netlib.BLAS
import scala.util.Random
import org.apache.spark.rdd.RDD
import org.apache.spark.SparkContext
import org.apache.spark.broadcast.Broadcast
import scala.collection.mutable.ArrayBuffer

object NearestNeighbors {

	def runNearestNeighbors(data: RDD[Array[(LabeledPoint,Int,Int)]], 
		kNN: Int, 
		sampleData: Array[(LabeledPoint,Int,Int)]): Array[(String,Array[((Int,Int),Double)])] = {
		val globalNearestNeighborsByIndex = data.mapPartitionsWithIndex(localNearestNeighbors(_,_,kNN,sampleData)).groupByKey().map(x => (x._1,x._2.toArray.sortBy(r => r._2).take(kNN))).collect()		


	private def localNearestNeighbors(partitionIndex: Long,
		iter: Iterator[Array[(LabeledPoint,Int,Int)]],
		kNN: Int,
		sampleData: Array[(LabeledPoint,Int,Int)]): Iterator[(String,((Int,Int),Double))] = { 
			var result = List[(String,((Int,Int),Double))]()
			val dataArr =
			val nLocal = dataArr.size - 1			
			val sampleDataSize = sampleData.size - 1

			val kLocalNeighbors = Array.fill[distanceIndex](sampleDataSize+1)(null)
			for {
			    i1 <- 0 to sampleDataSize
			kLocalNeighbors(i1) = distanceIndex(sampleData(i1)._3.toInt, sampleData(i1)._2.toInt, DenseVector.zeros[Double](kNN) + Int.MaxValue.toDouble, DenseVector.zeros[Int](kNN))

			for (i <- 0 to nLocal) {
				val currentPoint = dataArr(i)
				val features = currentPoint._1.features
				val rowId = currentPoint._3.toInt	
				for (j <- 0 to sampleDataSize) {
					val samplePartitionId = sampleData(j)._2
					val sampleRowId = sampleData(j)._3
					val sampleFeatures = sampleData(j)._1.features
					if (!((rowId == sampleRowId) & (samplePartitionId == partitionIndex))) {
						val distance = Math.sqrt(sum((sampleFeatures - features) :* (sampleFeatures - features)))
						if (distance < max(kLocalNeighbors(j).distanceVector)) {
							val indexToReplace = argmax(kLocalNeighbors(j).distanceVector)
							kLocalNeighbors(j).distanceVector(indexToReplace) = distance
							kLocalNeighbors(j).neighborRowId(indexToReplace) = rowId
			for (m <- 0 to sampleDataSize){
				for (l <-0 to kNN-1) {
					val key = kLocalNeighbors(m).partitionId.toString+","+kLocalNeighbors(m).sampleRowId.toString
					val tup = (partitionIndex.toInt,kLocalNeighbors(m).neighborRowId(l))
Example 8
Source File: loadData.scala    From SparkSMOTE   with MIT License 5 votes vote down vote up
package utils

import org.apache.spark.SparkContext
import breeze.linalg._
import breeze.linalg.{DenseVector,Vector,SparseVector}
import org.apache.spark.rdd.RDD
import org.apache.spark.broadcast.Broadcast

object loadData {

 	def readDelimitedData(sc: SparkContext, path: String, numFeatures: Int, delimiter: String, numPartitions: Int): RDD[(LabeledPoint,Int,Int)] = {
		val data = sc.textFile(path).filter{x => x.split(delimiter)(0).toDouble == 1.0}.repartition(numPartitions).mapPartitions{x => Iterator(x.toArray)}
		val formatData = data.mapPartitionsWithIndex{(partitionId,iter) =>
			var result = List[(LabeledPoint,Int,Int)]()
			val dataArray =
			val dataArraySize = dataArray.size - 1
			var rowCount = dataArraySize
			for (i <- 0 to dataArraySize) {
				val parts = dataArray(i).split(delimiter)
				rowCount = rowCount - 1

Example 9
Source File: SMOTE.scala    From SparkSMOTE   with MIT License 5 votes vote down vote up
package SMOTE

import org.apache.spark.SparkContext
import breeze.linalg._
import breeze.linalg.{DenseVector,Vector,SparseVector}
import com.github.fommil.netlib.BLAS
import scala.util.Random
import org.apache.spark.rdd.RDD
import org.apache.spark.SparkContext
import org.apache.spark.broadcast.Broadcast
import scala.collection.mutable.ArrayBuffer
import utils._

object SMOTE {

	def runSMOTE(sc: SparkContext, 
		inPath: String, 
		outPath: String,
		numFeatures: Int,  
		oversamplingPctg: Double,
        kNN: Int,
		delimiter: String,
        numPartitions: Int): Unit = {

		val rand = new Random()

		val data = loadData.readDelimitedData(sc, inPath, numFeatures, delimiter, numPartitions)
		val dataArray = data.mapPartitions(x => Iterator(x.toArray)).cache()

        val numObs = => x.size).reduce(_+_)

		println("Number of Filtered Observations "+numObs.toString)		

		val roundPctg = oversamplingPctg
        val sampleData = dataArray.flatMap(x => x).sample(withReplacement = false, fraction = roundPctg, seed = 1L).collect().sortBy(r => (r._2,r._3)) //without Replacement

		println("Sample Data Count "+sampleData.size.toString)

	 	val globalNearestNeighbors = NearestNeighbors.runNearestNeighbors(dataArray, kNN, sampleData)
        var randomNearestNeighbor = => (x._1.split(",")(0).toInt,x._1.split(",")(1).toInt,x._2(rand.nextInt(kNN)))).sortBy(r => (r._1,r._2))
        var sampleDataNearestNeighbors = => (x._1._3._1._1, x._1._2, x._1._3._1._2, x._2._1))

		val syntheticData = dataArray.mapPartitionsWithIndex(createSyntheticData(_,_,sampleDataNearestNeighbors,delimiter)).persist()
		println("Synthetic Data Count "+syntheticData.count.toString)
		val newData = syntheticData.union(sc.textFile(inPath))
		println("New Line Count "+newData.count.toString)

	private def createSyntheticData(partitionIndex: Long,
		iter: Iterator[Array[(LabeledPoint,Int,Int)]],
		sampleDataNN: Array[(Int,Int,Int,LabeledPoint)],
		delimiter: String): Iterator[String]  = {
			var result = List[String]()
			val dataArr =
			val nLocal = dataArr.size - 1			
			val sampleDataNNSize = sampleDataNN.size - 1
			val rand = new Random()			

			for (j <- 0 to sampleDataNNSize){
				val partitionId = sampleDataNN(j)._1
				val neighborId = sampleDataNN(j)._3
				val sampleFeatures = sampleDataNN(j)._4.features
				if (partitionId == partitionIndex.toInt){
					val currentPoint = dataArr(neighborId)	
					val features = currentPoint._1.features	
					sampleFeatures += (sampleFeatures - features) * rand.nextDouble
Example 10
Source File: TensorLDAModelTest.scala    From spectrallda-tensorspark   with Apache License 2.0 5 votes vote down vote up
package edu.uci.eecs.spectralLDA.algorithm

import breeze.linalg.{DenseMatrix, DenseVector, SparseVector, norm}
import breeze.numerics.abs
import org.scalatest._
import org.apache.spark.SparkContext
import edu.uci.eecs.spectralLDA.testharness.Context

class TensorLDAModelTest extends FlatSpec with Matchers {

  private val sc: SparkContext = Context.getSparkContext

  "Multinomial log-likelihood" should "be correct" in {
    val p = DenseVector[Double](0.2, 0.5, 0.3)
    val x1 = DenseVector[Double](20, 50, 30)
    val x2 = DenseVector[Double](40, 40, 20)

    abs(TensorLDAModel.multinomialLogLikelihood(p, x1) - (-4.697546)) should be <= 1e-6
    abs(TensorLDAModel.multinomialLogLikelihood(p, x2) - (-15.42038)) should be <= 1e-6
Example 11
Source File: GibbsSample.scala    From glintlda   with MIT License 5 votes vote down vote up
package glintlda

import breeze.linalg.{DenseVector, SparseVector, sum}
import glintlda.util.FastRNG

  def apply(sv: SparseVector[Int], random: FastRNG, topics: Int): GibbsSample = {
    val totalTokens = sum(sv)
    val sample = new GibbsSample(new Array[Int](totalTokens), new Array[Int](totalTokens))

    var i = 0
    var current = 0
    while (i < sv.activeSize) {
      val index = sv.indexAt(i)
      var value = sv.valueAt(i)
      while (value > 0) {
        sample.features(current) = index
        sample.topics(current) = random.nextPositiveInt() % topics
        current += 1
        value -= 1
      i += 1


Example 12
Source File: ScoreTest.scala    From seqspark   with Apache License 2.0 5 votes vote down vote up
package org.dizhang.seqspark.stat

import breeze.linalg.{*, CSCMatrix, DenseMatrix, DenseVector, SparseVector}
import org.dizhang.seqspark.stat.HypoTest.NullModel.{Fitted => SNM}
import org.dizhang.seqspark.util.General._

object ScoreTest {

  def apply(nm: SNM, x: CSCMatrix[Double]): ScoreTest = {
    Sparse(nm, x)

  def apply(nm: SNM, x: DenseMatrix[Double]): ScoreTest = {
    Dense(nm, x)

  def apply(nm: SNM, x: DenseVector[Double]): ScoreTest = {
    Dense(nm, DenseVector.horzcat(x))

  def apply(nm: SNM, x: SparseVector[Double]): ScoreTest = {
    Sparse(nm, SparseVector.horzcat(x))

  def apply(nm: SNM,
            x1: DenseMatrix[Double],
            x2: CSCMatrix[Double]): ScoreTest = {
    Mixed(nm, x1, x2)

  case class Sparse(nm: SNM,
                    x: CSCMatrix[Double]) extends ScoreTest {
    val score = (nm.residuals.toDenseMatrix * x).toDenseVector / nm.a
    lazy val variance = {
      val c = nm.xs
      val IccInv = nm.invInfo * nm.a
      val Igg = (colMultiply(x, nm.b).t * x).toDense
      val Icg = (c(::, *) *:* nm.b).t * x
      val Igc = Icg.t
      (Igg - Igc * IccInv * Icg) / nm.a

  case class Dense(nm: SNM,
                   x: DenseMatrix[Double]) extends ScoreTest {
    val score = x.t * nm.residuals / nm.a
    lazy val variance = {
      val c = nm.xs
      val IccInv = nm.invInfo * nm.a
      val Igg = (x(::, *) *:* nm.b).t * x
      val Icg = (c(::, *) *:* nm.b).t * x
      val Igc = Icg.t
      (Igg - Igc * IccInv * Icg)/nm.a

  case class Mixed(nm: SNM,
                   x1: DenseMatrix[Double],
                   x2: CSCMatrix[Double]) extends ScoreTest {
    private val dense = Dense(nm, x1)
    private val sparse = Sparse(nm, x2)
    val score = DenseVector.vertcat(dense.score, sparse.score)
    lazy val variance = {
      val v1 = dense.variance
      val v4 = sparse.variance
      val v2 = {
        val c = nm.xs
        val IccInv = nm.invInfo * nm.a
        val Igg = (x1(::, *) *:* nm.b).t * x2
        val Icg = (c(::, *) *:* nm.b).t * x2
        val Igc = x1.t * (c(::, *) *:* nm.b).t
        (Igg - Igc * IccInv * Icg) / nm.a
      val v3 = v2.t
      val v12 = DenseMatrix.horzcat(v1, v2)
      val v34 = DenseMatrix.horzcat(v3, v4)
      DenseMatrix.vertcat(v12, v34)

  /**
   * A [[ScoreTest]] whose `score` and `variance` are supplied directly as
   * constructor arguments instead of being computed from a null model and a
   * genotype matrix as the other implementations in this object do.
   *
   * @param score    the score vector exposed through `ScoreTest.score`
   * @param variance the variance matrix exposed through `ScoreTest.variance`
   */
  case class Mock(score: DenseVector[Double],
                  variance: DenseMatrix[Double]) extends ScoreTest

sealed trait ScoreTest extends HypoTest {
  def score: DenseVector[Double]
  def variance: DenseMatrix[Double]
Example 13
Source File: NewsgroupsPipeline.scala    From keystone   with Apache License 2.0 5 votes vote down vote up
package keystoneml.pipelines.text

import breeze.linalg.SparseVector
import keystoneml.evaluation.MulticlassClassifierEvaluator
import keystoneml.loaders.NewsgroupsDataLoader
import keystoneml.nodes.learning.NaiveBayesEstimator
import keystoneml.nodes.nlp._
import keystoneml.nodes.stats.TermFrequency
import keystoneml.nodes.util.{CommonSparseFeatures, MaxClassifier}
import org.apache.spark.{SparkConf, SparkContext}
import keystoneml.pipelines.Logging
import scopt.OptionParser
import keystoneml.workflow.Pipeline

object NewsgroupsPipeline extends Logging {
  val appName = "NewsgroupsPipeline"

  def run(sc: SparkContext, conf: NewsgroupsConfig): Pipeline[String, Int] = {

    val trainData = NewsgroupsDataLoader(sc, conf.trainLocation)
    val numClasses = NewsgroupsDataLoader.classes.length

    // Build the classifier estimator
    logInfo("Training classifier")
    val predictor = Trim andThen
        LowerCase() andThen
        Tokenizer() andThen
        NGramsFeaturizer(1 to conf.nGrams) andThen
        TermFrequency(x => 1) andThen
        (CommonSparseFeatures[Seq[String]](conf.commonFeatures), andThen
        (NaiveBayesEstimator[SparseVector[Double]](numClasses),, trainData.labels) andThen

    // Evaluate the classifier
    logInfo("Evaluating classifier")

    val testData = NewsgroupsDataLoader(sc, conf.testLocation)
    val testLabels = testData.labels
    val testResults = predictor(
    val eval = new MulticlassClassifierEvaluator(numClasses).evaluate(testResults, testLabels)

    logInfo("\n" + eval.summary(NewsgroupsDataLoader.classes))


  /**
   * Settings for the newsgroups pipeline.
   *
   * @param trainLocation  location handed to `NewsgroupsDataLoader` to load the training data
   * @param testLocation   location handed to `NewsgroupsDataLoader` to load the test data
   * @param nGrams         highest n-gram order; `run` featurizes the orders `1 to nGrams`
   * @param commonFeatures argument passed to `CommonSparseFeatures` in `run`
   */
  case class NewsgroupsConfig(
    trainLocation: String = "",
    testLocation: String = "",
    nGrams: Int = 2,
    commonFeatures: Int = 100000)

  /**
   * Builds a [[NewsgroupsConfig]] from command-line arguments.
   *
   * `trainLocation` and `testLocation` are required; `nGrams` and
   * `commonFeatures` are optional and keep the defaults of
   * [[NewsgroupsConfig]] when absent.
   *
   * @param args raw command-line arguments
   * @return the configuration with every supplied option applied
   * @note the parser result is unwrapped with `.get`, so this throws when the
   *       parser yields no configuration.
   */
  def parse(args: Array[String]): NewsgroupsConfig = {
    val parser = new OptionParser[NewsgroupsConfig](appName) {
      head(appName, "0.1")
      opt[String]("trainLocation").required().action((value, cfg) => cfg.copy(trainLocation = value))
      opt[String]("testLocation").required().action((value, cfg) => cfg.copy(testLocation = value))
      opt[Int]("nGrams").action((value, cfg) => cfg.copy(nGrams = value))
      opt[Int]("commonFeatures").action((value, cfg) => cfg.copy(commonFeatures = value))
    }
    parser.parse(args, NewsgroupsConfig()).get
  }

  def main(args: Array[String]) = {
    val conf = new SparkConf().setAppName(appName)
    conf.setIfMissing("spark.master", "local[2]") // This is a fallback if things aren't set via spark submit.

    val sc = new SparkContext(conf)

    val appConfig = parse(args)
    run(sc, appConfig)


Example 14
Source File: AmazonReviewsPipeline.scala    From keystone   with Apache License 2.0 5 votes vote down vote up
package keystoneml.pipelines.text

import breeze.linalg.SparseVector
import keystoneml.evaluation.BinaryClassifierEvaluator
import keystoneml.loaders.{AmazonReviewsDataLoader, LabeledData}
import keystoneml.nodes.learning.LogisticRegressionEstimator
import keystoneml.nodes.nlp._
import keystoneml.nodes.stats.TermFrequency
import keystoneml.nodes.util.CommonSparseFeatures
import org.apache.spark.sql.SparkSession
import org.apache.spark.{SparkConf, SparkContext}
import keystoneml.pipelines.Logging
import scopt.OptionParser
import keystoneml.workflow.Pipeline

object AmazonReviewsPipeline extends Logging {
  val appName = "AmazonReviewsPipeline"

  def run(spark: SparkSession, conf: AmazonReviewsConfig): Pipeline[String, Double] = {
    val amazonTrainData = AmazonReviewsDataLoader(spark, conf.trainLocation, conf.threshold).labeledData
    val trainData = LabeledData(amazonTrainData.repartition(conf.numParts).cache())

    val training =
    val labels = trainData.labels

    // Build the classifier estimator
    val predictor = Trim andThen
        LowerCase() andThen
        Tokenizer() andThen
        NGramsFeaturizer(1 to conf.nGrams) andThen
        TermFrequency(x => 1) andThen
        (CommonSparseFeatures[Seq[String]](conf.commonFeatures), training) andThen
        (LogisticRegressionEstimator[SparseVector[Double]](numClasses = 2, numIters = conf.numIters), training, labels)

    // Evaluate the classifier
    val amazonTestData = AmazonReviewsDataLoader(spark, conf.testLocation, conf.threshold).labeledData
    val testData = LabeledData(amazonTestData.repartition(conf.numParts).cache())
    val testLabels = testData.labels
    val testResults = predictor(
    val eval = BinaryClassifierEvaluator.evaluate( > 0), > 0))

    logInfo("\n" + eval.summary())

  /**
   * Settings for the Amazon reviews pipeline.
   *
   * @param trainLocation  location handed to `AmazonReviewsDataLoader` to load the training data
   * @param testLocation   location handed to `AmazonReviewsDataLoader` to load the test data
   * @param threshold      value passed to `AmazonReviewsDataLoader` together with each location
   * @param nGrams         highest n-gram order; `run` featurizes the orders `1 to nGrams`
   * @param commonFeatures argument passed to `CommonSparseFeatures` in `run`
   * @param numIters       `numIters` argument of the `LogisticRegressionEstimator` built in `run`
   * @param numParts       partition count used when `run` repartitions the train and test data
   */
  case class AmazonReviewsConfig(
    trainLocation: String = "",
    testLocation: String = "",
    threshold: Double = 3.5,
    nGrams: Int = 2,
    commonFeatures: Int = 100000,
    numIters: Int = 20,
    numParts: Int = 512)

  /**
   * Builds an [[AmazonReviewsConfig]] from command-line arguments.
   *
   * `trainLocation` and `testLocation` are required; every other option is
   * optional and keeps the default of [[AmazonReviewsConfig]] when absent.
   *
   * @param args raw command-line arguments
   * @return the configuration with every supplied option applied
   * @note the parser result is unwrapped with `.get`, so this throws when the
   *       parser yields no configuration.
   */
  def parse(args: Array[String]): AmazonReviewsConfig = new OptionParser[AmazonReviewsConfig](appName) {
    head(appName, "0.1")
    // Each option writes to the config field that carries the same name.
    opt[String]("trainLocation") required() action { (x, c) => c.copy(trainLocation = x) }
    opt[String]("testLocation") required() action { (x, c) => c.copy(testLocation = x) }
    opt[Double]("threshold") action { (x, c) => c.copy(threshold = x) }
    opt[Int]("nGrams") action { (x, c) => c.copy(nGrams = x) }
    opt[Int]("commonFeatures") action { (x, c) => c.copy(commonFeatures = x) }
    // numIters must set numIters; writing it to numParts would silently change
    // the partition count and leave the iteration count at its default.
    opt[Int]("numIters") action { (x, c) => c.copy(numIters = x) }
    opt[Int]("numParts") action { (x, c) => c.copy(numParts = x) }
  }.parse(args, AmazonReviewsConfig()).get

  def main(args: Array[String]) = {
    val conf = new SparkConf().setAppName(appName)
    conf.setIfMissing("spark.master", "local[2]") // This is a fallback if things aren't set via spark submit.

    val spark = SparkSession.builder.config(conf).getOrCreate()

    val appConfig = parse(args)
    run(spark, appConfig)

Example 15
Source File: AllSparseFeatures.scala    From keystone   with Apache License 2.0 5 votes vote down vote up
package keystoneml.nodes.util

import breeze.linalg.SparseVector
import org.apache.spark.rdd.RDD
import keystoneml.workflow.Estimator

import scala.reflect.ClassTag

case class AllSparseFeatures[T: ClassTag]() extends Estimator[Seq[(T, Double)], SparseVector[Double]] {
  override def fit(data: RDD[Seq[(T, Double)]]): SparseFeatureVectorizer[T] = {
    val featureOccurrences = data.flatMap(
    // zip with unique ids and take the smallest unique id for a given feature to get
    // a deterministic ordering
    val featuresWithUniqueId = featureOccurrences.zipWithUniqueId().reduceByKey {
      (x, y) => Math.min(x, y)
    val featureSpace = featuresWithUniqueId.sortBy(_._2).map(_._1)
    new SparseFeatureVectorizer(featureSpace)
Example 16
Source File: SparseFeatureVectorizer.scala    From keystone   with Apache License 2.0 5 votes vote down vote up
package keystoneml.nodes.util

import breeze.linalg.SparseVector
import keystoneml.workflow.Transformer

class SparseFeatureVectorizer[T](featureSpace: Map[T, Int]) extends Transformer[Seq[(T, Double)], SparseVector[Double]] {
  private def transformVector(in: Seq[(T, Double)], featureSpaceMap: Map[T, Int]): SparseVector[Double] = {
    val features = => (featureSpaceMap.get(f._1), f._2))
        .map(f => (f._1.get, f._2.toDouble))

  override def apply(in: Seq[(T, Double)]): SparseVector[Double] = {
    transformVector(in, featureSpace)
Example 17
Source File: NGramsHashingTF.scala    From keystone   with Apache License 2.0 5 votes vote down vote up
package keystoneml.nodes.nlp

import java.lang.Integer.{rotateLeft => rotl}

import breeze.linalg.SparseVector
import keystoneml.workflow.Transformer

import scala.collection.mutable

  private final def avalanche(hash: Int): Int = {
    var h = hash

    h ^= h >>> 16
    h *= 0x85ebca6b
    h ^= h >>> 13
    h *= 0xc2b2ae35
    h ^= h >>> 16


  def nonNegativeMod(x: Int, mod: Int): Int = {
    val rawMod = x % mod
    rawMod + (if (rawMod < 0) mod else 0)

  def apply(line: Seq[String]): SparseVector[Double] = {
    val hashes = new Array[Integer](line.length)
    var i = 0
    while (i < line.length) {
      hashes(i) = line(i).##
      i += 1

    var j = 0
    val termFrequencies = mutable.HashMap.empty[Int, Double]
    i = 0
    while (i + minOrder <= line.length) {
      var order = minOrder
      var h = seqSeed

      j = i
      while (j < i + minOrder) {
        h = mix(h, hashes(j))
        j += 1

      val feature = nonNegativeMod(finalizeHash(h, order), numFeatures)
      termFrequencies.put(feature, termFrequencies.getOrElse(feature, 0.0) + 1.0)

      order = minOrder + 1
      while (order <= maxOrder && i + order <= line.length) {
        h = mix(h, hashes(i + order - 1))
        val feature = nonNegativeMod(finalizeHash(h, order), numFeatures)
        termFrequencies.put(feature, termFrequencies.getOrElse(feature, 0.0) + 1.0)
        order += 1
      i += 1


Example 18
Source File: HashingTF.scala    From keystone   with Apache License 2.0 5 votes vote down vote up
package keystoneml.nodes.nlp

import breeze.linalg.SparseVector
import keystoneml.workflow.Transformer

case class HashingTF[T <: Seq[Any]](numFeatures: Int) extends Transformer[T, SparseVector[Double]] {
  def nonNegativeMod(x: Int, mod: Int): Int = {
    val rawMod = x % mod
    rawMod + (if (rawMod < 0) mod else 0)

  def apply(document: T): SparseVector[Double] = {
    val termFrequencies = scala.collection.mutable.HashMap.empty[Int, Double]
    document.foreach { term =>
      val i = nonNegativeMod(term.##, numFeatures)
      termFrequencies.put(i, termFrequencies.getOrElse(i, 0.0) + 1.0)

Example 19
Source File: MLlibUtils.scala    From keystone   with Apache License 2.0 5 votes vote down vote up
package keystoneml.utils

import breeze.linalg.{SparseVector, DenseMatrix, DenseVector}

  def breezeVectorToMLlib(breezeVector: breeze.linalg.Vector[Double]): org.apache.spark.mllib.linalg.Vector = {
    breezeVector match {
      case v: DenseVector[Double] =>
        if (v.offset == 0 && v.stride == 1 && v.length == {
          new org.apache.spark.mllib.linalg.DenseVector(
        } else {
          new org.apache.spark.mllib.linalg.DenseVector(v.toArray)  // Can't use underlying array directly, so make a new one
      case v: SparseVector[Double] =>
        if (v.index.length == v.used) {
          new org.apache.spark.mllib.linalg.SparseVector(v.length, v.index,
        } else {
          new org.apache.spark.mllib.linalg.SparseVector(v.length, v.index.slice(0, v.used),, v.used))
      case v: breeze.linalg.Vector[_] =>
        sys.error("Unsupported Breeze vector type: " + v.getClass.getName)

Example 20
Source File: SparseArray.scala    From scalismo-faces   with Apache License 2.0 5 votes vote down vote up
package scalismo.faces.numerics

import java.util

import breeze.linalg.SparseVector

private[numerics] class SparseArray(var index: Array[Int], var data: Array[Double], var nnz: Int, val length: Int) {
  require(nnz <= length, "too many non zeros")
  require(data.length == index.length, "data and index have different length")
  require(data.length >= nnz, "data array is too short")

  def activeSize = nnz

  def update(i: Int, v: Double): Unit = {
    val offset = findOffset(i)
    if (offset >= 0)
      data(offset) = v
    else {
      val insert = ~offset
      nnz += 1
      if (nnz > index.length) reallocate()

      // insert
      // move right part
      System.arraycopy(index, insert, index, insert + 1, nnz - insert - 1)
      System.arraycopy(data, insert, data, insert + 1, nnz - insert - 1)
      // insert data
      index(insert) = i
      data(insert) = v

  private def reallocate() = {
    val newLength = math.max(nnz + 1, index.length * 2)
    val _index = new Array[Int](newLength)
    val _data = new Array[Double](newLength)
    System.arraycopy(index, 0, _index, 0, index.length)
    System.arraycopy(data, 0, _data, 0, data.length)
    index = _index
    data = _data

  def apply(i: Int): Double = {
    val offset = findOffset(i)
    if (offset >= 0)

  /**
   * Binary-searches the first `nnz` entries of `index` for the logical position `i`.
   *
   * @param i logical index to look up
   * @return the storage offset of `i` when it is present; otherwise the negative
   *         value `-(insertionPoint) - 1` produced by `java.util.Arrays.binarySearch`,
   *         so `~result` is the offset at which `i` would have to be inserted
   */
  def findOffset(i: Int): Int = util.Arrays.binarySearch(index, 0, nnz, i)

  def toDense: Array[Double] = {
    val arr = new Array[Double](length)
    var i = 0
    while (i < nnz) {
      val ind = index(i)
      arr(ind) = data(i)
      i += 1

object SparseArray {
  def apply(vector: SparseVector[Double]): SparseArray = {
    val nnz = vector.activeSize
    val index = new Array[Int](nnz)
    val data = new Array[Double](nnz)
    System.arraycopy(vector.index, 0, index, 0, nnz)
    System.arraycopy(, 0, data, 0, nnz)
    new SparseArray(index, data, nnz, vector.length)