breeze.linalg.SparseVector Scala Examples

The following examples show how to use breeze.linalg.SparseVector. They are collected from open-source projects; the source file, originating project, and license are noted above each example.
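Before the project examples, here is a minimal sketch of the common ways to construct and update a Breeze SparseVector (the values are arbitrary):

import breeze.linalg.SparseVector

val z = SparseVector.zeros[Double](10)            // length-10 vector, no stored entries
val v = SparseVector(5)((0, 1.0), (3, 2.5))       // explicit (index, value) pairs, rest are zero
val d = SparseVector(1.0, 0.0, 2.0)               // from listed values (explicit zeros may be stored)

val x0 = v(0)                                     // 1.0
v(4) = 7.0                                        // in-place update inserts a new active entry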
Example 1
Source File: Kinship.scala    From seqspark   with Apache License 2.0
package org.dizhang.seqspark.stat

import org.apache.spark.rdd.RDD
import org.dizhang.seqspark.ds._
import breeze.linalg.{DenseVector, SparseVector, Vector}
import org.apache.spark.SparkContext

import scala.collection.mutable.ArrayBuffer

// Excerpt: only the removeNums helper is shown here; the enclosing object declaration
// (restored below from the file name) and its other members are omitted.
object Kinship {

  /** Returns the indices 0 until size, with the (sorted) indices listed in `nums` removed. */
  def removeNums(size: Int, nums: IndexedSeq[Int]): IndexedSeq[Int] = {
    var j: Int = 0
    var i: Int = 0
    val res = ArrayBuffer[Int]()
    while (i < size) {
      if (j >= nums.length) {
        res.+=(i)
      } else if (i == nums(j)) {
        j += 1
      } else {
        res.+=(i)
      }
      i += 1
    }
    res.toIndexedSeq
  }

} 
Example 2
Source File: LibSvmTest.scala    From scio   with Apache License 2.0
package com.spotify.scio.extra.libsvm

import breeze.linalg.SparseVector
import com.spotify.scio.testing.PipelineSpec

class LibSvmTest extends PipelineSpec {
  val expected = List(
    (0.0, SparseVector[Double](34)((0, 1), (8, 1), (18, 1), (20, 1), (23, 1), (33, 1))),
    (1.0, SparseVector[Double](34)((2, 1), (8, 1), (18, 1), (20, 1), (29, 1), (33, 1))),
    (0.0, SparseVector[Double](34)((0, 1), (8, 1), (19, 1), (20, 1), (23, 1), (33, 1)))
  )

  val data = List(
    "0 1:1 9:1 19:1 21:1 24:1 34:1",
    "1 3:1 9:1 19:1 21:1 30:1 34:1",
    "0 1:1 9:1 20:1 21:1 24:1 34:1"
  )

  "libSVMCollection" should "parse libsvm files" in {
    runWithContext { sc =>
      val res = libSVMCollection(sc.parallelize(data))
      res should containInAnyOrder(expected)
    }
  }

  it should "parse libsvm files with length" in {
    runWithContext { sc =>
      val res = libSVMCollection(sc.parallelize(data), 34)
      res should containInAnyOrder(expected)
    }
  }
} 
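The expected values above show the index convention: libsvm features are 1-based ("1:1") while Breeze's SparseVector is 0-based (index 0). A minimal, hypothetical parser sketch of that mapping (not scio's actual libSVMCollection implementation):

import breeze.linalg.SparseVector

def parseLibSvmLine(line: String, dim: Int): (Double, SparseVector[Double]) = {
  val tokens = line.trim.split("\\s+")
  val label = tokens.head.toDouble
  val entries = tokens.tail.map { t =>
    val Array(i, v) = t.split(":")
    (i.toInt - 1, v.toDouble)                     // shift 1-based libsvm index to 0-based
  }
  (label, SparseVector(dim)(entries: _*))
}

// parseLibSvmLine("0 1:1 9:1 19:1 21:1 24:1 34:1", 34) reproduces the first pair in `expected`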
Example 3
Source File: BreezeSpec.scala    From scio   with Apache License 2.0
package com.spotify.scio.extra

import breeze.linalg.{DenseMatrix, DenseVector, SparseVector}
import breeze.stats.distributions.Rand
import com.spotify.scio.extra.Breeze._
import com.twitter.algebird.Semigroup
import org.scalacheck._

trait BreezeSpec[M[_], T] extends PropertySpec {
  val dimension = 10
  val rows = 20
  val cols = 10
  val fRand = Rand.uniform.map(_.toFloat)
  val m: Gen[M[T]]
  def ms: Gen[List[M[T]]] = Gen.listOf[M[T]](m)
  def plus(x: M[T], y: M[T])(implicit sg: Semigroup[M[T]]): M[T] = sg.plus(x, y)
  def sumOption(xs: Iterable[M[T]])(implicit sg: Semigroup[M[T]]): Option[M[T]] = sg.sumOption(xs)
}

class FloatDenseVectorSpec extends BreezeSpec[DenseVector, Float] {
  val m = Gen.const(dimension).map(DenseVector.rand[Float](_, fRand))

  property("plus") {
    forAll(m, m)((x, y) => plus(x, y) == x + y)
  }
  property("sumOption") {
    forAll(ms)(xs => sumOption(xs) == xs.reduceLeftOption(_ + _))
  }
}

class DoubleDenseVectorSpec extends BreezeSpec[DenseVector, Double] {
  val m = Gen.const(dimension).map(DenseVector.rand[Double](_))
  property("plus") {
    forAll(m, m)((x, y) => plus(x, y) == x + y)
  }
  property("sumOption") {
    forAll(ms)(xs => sumOption(xs) == xs.reduceLeftOption(_ + _))
  }
}

class FloatDenseMatrixSpec extends BreezeSpec[DenseMatrix, Float] {
  val m = Gen.const((rows, cols)).map {
    case (r, c) => DenseMatrix.rand[Float](r, c, fRand)
  }
  property("plus") {
    forAll(m, m)((x, y) => plus(x, y) == x + y)
  }
  property("sumOption") {
    forAll(ms)(xs => sumOption(xs) == xs.reduceLeftOption(_ + _))
  }
}

class DoubleDenseMatrixSpec extends BreezeSpec[DenseMatrix, Double] {
  val m = Gen.const((rows, cols)).map {
    case (r, c) => DenseMatrix.rand[Double](r, c)
  }
  property("plus") {
    forAll(m, m)((x, y) => plus(x, y) == x + y)
  }
  property("sumOption") {
    forAll(ms)(xs => sumOption(xs) == xs.reduceLeftOption(_ + _))
  }
}

class FloatSparseVectorSpec extends BreezeSpec[SparseVector, Float] {
  val m = Gen
    .const(dimension)
    .map(d => SparseVector(DenseVector.rand[Float](d, fRand).data))

  property("plus") {
    forAll(m, m)((x, y) => plus(x, y) == x + y)
  }
  property("sumOption") {
    forAll(ms)(xs => sumOption(xs) == xs.reduceLeftOption(_ + _))
  }
}

class DoubleSparseVectorSpec extends BreezeSpec[SparseVector, Double] {
  val m = Gen
    .const(dimension)
    .map(d => SparseVector(DenseVector.rand[Double](d).data))

  property("plus") {
    forAll(m, m)((x, y) => plus(x, y) == x + y)
  }
  property("sumOption") {
    forAll(ms)(xs => sumOption(xs) == xs.reduceLeftOption(_ + _))
  }
} 
Example 4
Source File: PassiveAggressiveBinaryModelEvaluation.scala    From flink-parameter-server   with Apache License 2.0
package hu.sztaki.ilab.ps.test.utils

import breeze.linalg.{DenseVector, SparseVector}
import hu.sztaki.ilab.ps.passive.aggressive.algorithm.PassiveAggressiveBinaryAlgorithm
import org.slf4j.LoggerFactory

class PassiveAggressiveBinaryModelEvaluation

object PassiveAggressiveBinaryModelEvaluation {

  private val log = LoggerFactory.getLogger(classOf[PassiveAggressiveBinaryModelEvaluation])


  def accuracy(model: DenseVector[Double],
               testLines: Traversable[(SparseVector[Double], Option[Boolean])],
               featureCount: Int,
               pac: PassiveAggressiveBinaryAlgorithm): Double = {

    var tt = 0
    var ff = 0
    var tf = 0
    var ft = 0
    var cnt = 0
    testLines.foreach { case (vector, label) => label match {
      case Some(lab) =>
        val real = lab
        val predicted = pac.predict(vector, model)
        (real, predicted) match {
          case (true, true) => tt +=1
          case (false, false) => ff +=1
          case (true, false) => tf +=1
          case (false, true) => ft +=1
        }
        cnt += 1
      case _ => throw new IllegalStateException("Labels should not be missing.")
    }
    }
    val percent = ((tt + ff).toDouble / cnt) * 100

    percent
  }


} 
Example 5
Source File: PassiveAggressiveMultiModelEvaluation.scala    From flink-parameter-server   with Apache License 2.0
package hu.sztaki.ilab.ps.test.utils

import breeze.linalg.{DenseMatrix, SparseVector}
import hu.sztaki.ilab.ps.passive.aggressive.algorithm.PassiveAggressiveMulticlassAlgorithm
import org.slf4j.LoggerFactory

class PassiveAggressiveMultiModelEvaluation

object PassiveAggressiveMultiModelEvaluation {

  private val log = LoggerFactory.getLogger(classOf[PassiveAggressiveMultiModelEvaluation])

  def accuracy(model: DenseMatrix[Double], testLines: Traversable[(SparseVector[Double], Option[Int])],
               featureCount: Int, pac: PassiveAggressiveMulticlassAlgorithm): Double = {

    var hit = 0
    var cnt = 0
    testLines.foreach{case(vector, label) => label match {
      case Some(l) =>
      if (pac.predict(vector, model) == l) hit += 1
      cnt += 1
      case _ => throw new IllegalStateException("Labels should not be missing.")
    }}
    val percent = (hit.toDouble / cnt) * 100
    percent
  }

} 
Example 6
Source File: driver.scala    From proxcocoa   with Apache License 2.0
package l1distopt

import breeze.linalg.SparseVector
import l1distopt.utils._
import l1distopt.solvers._
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkContext, SparkConf}


object driver {

  def main(args: Array[String]) {

    val options =  args.map { arg =>
      arg.dropWhile(_ == '-').split('=') match {
        case Array(opt, v) => (opt -> v)
        case Array(opt) => (opt -> "true")
        case _ => throw new IllegalArgumentException("Invalid argument: " + arg)
      }
      }.toMap

    // read in inputs
    val master = options.getOrElse("master", "local[4]")
    val trainFile = options.getOrElse("trainFile", "")
    val numFeatures = options.getOrElse("numFeatures", "0").toInt
    val numSplits = options.getOrElse("numSplits", "1").toInt
    val testFile = options.getOrElse("testFile", "")
    
    // algorithm-specific inputs
    val eta = options.getOrElse("eta", "1.0").toDouble // elastic net parameter: 1.0 = lasso, 0.0 = ridge regression
    val lambda = options.getOrElse("lambda", "0.01").toDouble // regularization parameter
    val numRounds = options.getOrElse("numRounds", "200").toInt // number of outer iterations, called T in the paper
    val localIterFrac = options.getOrElse("localIterFrac", "1.0").toDouble; // fraction of local points to be processed per round, H = localIterFrac * n
    val debugIter = options.getOrElse("debugIter", "10").toInt // set to -1 to turn off debugging output
    val seed = options.getOrElse("seed", "0").toInt // set seed for debug purposes

    // print out inputs
    println("master:       " + master);          println("trainFile:    " + trainFile);
    println("numFeatures:  " + numFeatures);     println("numSplits:    " + numSplits);      
    println("testfile:     " + testFile);        println("eta           " + eta);       
    println("lambda:       " + lambda);          println("numRounds:    " + numRounds);       
    println("localIterFrac:" + localIterFrac);   println("debugIter     " + debugIter);       
    println("seed          " + seed);            

    // start spark context
    val conf = new SparkConf().setMaster(master)
    .setAppName("Sparse-CoCoA")
    .setJars(SparkContext.jarOfObject(this).toSeq)
    val sc = new SparkContext(conf)

    // NOTE: the original file's data loading and parameter setup (which define `data`,
    // `labels`, `params`, and `debug`) are elided in this excerpt.

    val finalAlphaCoCoA = ProxCoCoAp.runProxCoCoAp(data, labels, params, debug)
    sc.stop()
  }
} 
Example 7
Source File: NearestNeighbors.scala    From SparkSMOTE   with MIT License
package utils

import org.apache.spark.SparkContext
import breeze.linalg._
import breeze.linalg.{DenseVector,Vector,SparseVector}
import com.github.fommil.netlib.BLAS
import scala.util.Random
import org.apache.spark.rdd.RDD
import org.apache.spark.broadcast.Broadcast
import scala.collection.mutable.ArrayBuffer

object NearestNeighbors {

	def runNearestNeighbors(data: RDD[Array[(LabeledPoint,Int,Int)]], 
		kNN: Int, 
		sampleData: Array[(LabeledPoint,Int,Int)]): Array[(String,Array[((Int,Int),Double)])] = {
		
		val globalNearestNeighborsByIndex = data.mapPartitionsWithIndex(localNearestNeighbors(_,_,kNN,sampleData)).groupByKey().map(x => (x._1,x._2.toArray.sortBy(r => r._2).take(kNN))).collect()		

		globalNearestNeighborsByIndex 
	}


	private def localNearestNeighbors(partitionIndex: Long,
		iter: Iterator[Array[(LabeledPoint,Int,Int)]],
		kNN: Int,
		sampleData: Array[(LabeledPoint,Int,Int)]): Iterator[(String,((Int,Int),Double))] = { 
			
			var result = List[(String,((Int,Int),Double))]()
			val dataArr = iter.next
			val nLocal = dataArr.size - 1			
			val sampleDataSize = sampleData.size - 1


			val kLocalNeighbors = Array.fill[distanceIndex](sampleDataSize+1)(null)
			for {
			    i1 <- 0 to sampleDataSize
			} 
			kLocalNeighbors(i1) = distanceIndex(sampleData(i1)._3.toInt, sampleData(i1)._2.toInt, DenseVector.zeros[Double](kNN) + Int.MaxValue.toDouble, DenseVector.zeros[Int](kNN))

			for (i <- 0 to nLocal) {
				val currentPoint = dataArr(i)
				val features = currentPoint._1.features
				val rowId = currentPoint._3.toInt	
				for (j <- 0 to sampleDataSize) {
					val samplePartitionId = sampleData(j)._2
					val sampleRowId = sampleData(j)._3
					val sampleFeatures = sampleData(j)._1.features
					if (!((rowId == sampleRowId) & (samplePartitionId == partitionIndex))) {
						val distance = Math.sqrt(sum((sampleFeatures - features) :* (sampleFeatures - features)))
						if (distance < max(kLocalNeighbors(j).distanceVector)) {
							val indexToReplace = argmax(kLocalNeighbors(j).distanceVector)
							kLocalNeighbors(j).distanceVector(indexToReplace) = distance
							kLocalNeighbors(j).neighborRowId(indexToReplace) = rowId
						}
					}
				}
			}
			for (m <- 0 to sampleDataSize){
				for (l <-0 to kNN-1) {
					
					val key = kLocalNeighbors(m).partitionId.toString+","+kLocalNeighbors(m).sampleRowId.toString
					val tup = (partitionIndex.toInt,kLocalNeighbors(m).neighborRowId(l))
					result.::=(key,(tup,kLocalNeighbors(m).distanceVector(l)))
				}
			}			
		result.iterator 
	}	
} 
Example 8
Source File: loadData.scala    From SparkSMOTE   with MIT License
package utils

import org.apache.spark.SparkContext
import breeze.linalg._
import breeze.linalg.{DenseVector,Vector,SparseVector}
import org.apache.spark.rdd.RDD
import org.apache.spark.broadcast.Broadcast

object loadData {

 	def readDelimitedData(sc: SparkContext, path: String, numFeatures: Int, delimiter: String, numPartitions: Int): RDD[(LabeledPoint,Int,Int)] = {
		val data = sc.textFile(path).filter{x => x.split(delimiter)(0).toDouble == 1.0}.repartition(numPartitions).mapPartitions{x => Iterator(x.toArray)}
		val formatData = data.mapPartitionsWithIndex{(partitionId,iter) =>
			var result = List[(LabeledPoint,Int,Int)]()
			val dataArray = iter.next
			val dataArraySize = dataArray.size - 1
			var rowCount = dataArraySize
			for (i <- 0 to dataArraySize) {
				val parts = dataArray(i).split(delimiter)
				result.::=((LabeledPoint(parts(0).toDouble,DenseVector(parts.slice(1,numFeatures+1)).map(_.toDouble)),partitionId.toInt,rowCount))
				rowCount = rowCount - 1
			}
			result.iterator
		}

		formatData
	}
	
} 
Example 9
Source File: SMOTE.scala    From SparkSMOTE   with MIT License
package SMOTE

import org.apache.spark.SparkContext
import breeze.linalg._
import breeze.linalg.{DenseVector,Vector,SparseVector}
import com.github.fommil.netlib.BLAS
import scala.util.Random
import org.apache.spark.rdd.RDD
import org.apache.spark.broadcast.Broadcast
import scala.collection.mutable.ArrayBuffer
import utils._

object SMOTE {

	def runSMOTE(sc: SparkContext, 
		inPath: String, 
		outPath: String,
		numFeatures: Int,  
		oversamplingPctg: Double,
        kNN: Int,
		delimiter: String,
        numPartitions: Int): Unit = {

		val rand = new Random()

		val data = loadData.readDelimitedData(sc, inPath, numFeatures, delimiter, numPartitions)
		
		val dataArray = data.mapPartitions(x => Iterator(x.toArray)).cache()

        val numObs = dataArray.map(x => x.size).reduce(_+_)

		println("Number of Filtered Observations "+numObs.toString)		

		val roundPctg = oversamplingPctg
        val sampleData = dataArray.flatMap(x => x).sample(withReplacement = false, fraction = roundPctg, seed = 1L).collect().sortBy(r => (r._2,r._3)) //without Replacement

		println("Sample Data Count "+sampleData.size.toString)

	 	val globalNearestNeighbors = NearestNeighbors.runNearestNeighbors(dataArray, kNN, sampleData)
		
        var randomNearestNeighbor = globalNearestNeighbors.map(x => (x._1.split(",")(0).toInt,x._1.split(",")(1).toInt,x._2(rand.nextInt(kNN)))).sortBy(r => (r._1,r._2))
		
        var sampleDataNearestNeighbors = randomNearestNeighbor.zip(sampleData).map(x => (x._1._3._1._1, x._1._2, x._1._3._1._2, x._2._1))

		val syntheticData = dataArray.mapPartitionsWithIndex(createSyntheticData(_,_,sampleDataNearestNeighbors,delimiter)).persist()
		println("Synthetic Data Count "+syntheticData.count.toString)
		val newData = syntheticData.union(sc.textFile(inPath))
		println("New Line Count "+newData.count.toString)
		newData.saveAsTextFile(outPath)
	
	}

	private def createSyntheticData(partitionIndex: Long,
		iter: Iterator[Array[(LabeledPoint,Int,Int)]],
		sampleDataNN: Array[(Int,Int,Int,LabeledPoint)],
		delimiter: String): Iterator[String]  = {
			
			var result = List[String]()
			val dataArr = iter.next
			val nLocal = dataArr.size - 1			
			val sampleDataNNSize = sampleDataNN.size - 1
			val rand = new Random()			

			for (j <- 0 to sampleDataNNSize){
				val partitionId = sampleDataNN(j)._1
				val neighborId = sampleDataNN(j)._3
				val sampleFeatures = sampleDataNN(j)._4.features
				if (partitionId == partitionIndex.toInt){
					val currentPoint = dataArr(neighborId)	
					val features = currentPoint._1.features	
					sampleFeatures += (sampleFeatures - features) * rand.nextDouble
					result.::=("1.0"+delimiter+sampleFeatures.toArray.mkString(delimiter))	
				}
			}
		result.iterator
	}		
} 
Example 10
Source File: TensorLDAModelTest.scala    From spectrallda-tensorspark   with Apache License 2.0
package edu.uci.eecs.spectralLDA.algorithm

import breeze.linalg.{DenseMatrix, DenseVector, SparseVector, norm}
import breeze.numerics.abs
import org.scalatest._
import org.apache.spark.SparkContext
import edu.uci.eecs.spectralLDA.testharness.Context

class TensorLDAModelTest extends FlatSpec with Matchers {

  private val sc: SparkContext = Context.getSparkContext

  "Multinomial log-likelihood" should "be correct" in {
    val p = DenseVector[Double](0.2, 0.5, 0.3)
    val x1 = DenseVector[Double](20, 50, 30)
    val x2 = DenseVector[Double](40, 40, 20)

    abs(TensorLDAModel.multinomialLogLikelihood(p, x1) - (-4.697546)) should be <= 1e-6
    abs(TensorLDAModel.multinomialLogLikelihood(p, x2) - (-15.42038)) should be <= 1e-6
  }
} 
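For reference, the quantity under test is the full multinomial log-likelihood, including the multinomial coefficient: log L(x | p) = log(n! / (x_1! ... x_k!)) + sum_i x_i log p_i, with n = sum_i x_i. For p = (0.2, 0.5, 0.3) and x = (20, 50, 30) this works out to roughly 98.27 - 102.97 ≈ -4.6975, matching the first assertion above.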
Example 11
Source File: GibbsSample.scala    From glintlda   with MIT License
package glintlda

import breeze.linalg.{DenseVector, SparseVector, sum}
import glintlda.util.FastRNG

// Excerpt: the companion object's apply method is shown; the enclosing object declaration
// (restored below) and the GibbsSample class itself are omitted.
object GibbsSample {

  /** Expands a sparse word-count vector into per-token arrays with randomly initialised topics. */
  def apply(sv: SparseVector[Int], random: FastRNG, topics: Int): GibbsSample = {
    val totalTokens = sum(sv)
    val sample = new GibbsSample(new Array[Int](totalTokens), new Array[Int](totalTokens))

    var i = 0
    var current = 0
    while (i < sv.activeSize) {
      val index = sv.indexAt(i)
      var value = sv.valueAt(i)
      while (value > 0) {
        sample.features(current) = index
        sample.topics(current) = random.nextPositiveInt() % topics
        current += 1
        value -= 1
      }
      i += 1
    }

    sample
  }

} 
Example 12
Source File: ScoreTest.scala    From seqspark   with Apache License 2.0
package org.dizhang.seqspark.stat

import breeze.linalg.{*, CSCMatrix, DenseMatrix, DenseVector, SparseVector}
import org.dizhang.seqspark.stat.HypoTest.NullModel.{Fitted => SNM}
import org.dizhang.seqspark.util.General._


object ScoreTest {

  def apply(nm: SNM, x: CSCMatrix[Double]): ScoreTest = {
    Sparse(nm, x)
  }

  def apply(nm: SNM, x: DenseMatrix[Double]): ScoreTest = {
    Dense(nm, x)
  }

  def apply(nm: SNM, x: DenseVector[Double]): ScoreTest = {
    Dense(nm, DenseVector.horzcat(x))
  }

  def apply(nm: SNM, x: SparseVector[Double]): ScoreTest = {
    Sparse(nm, SparseVector.horzcat(x))
  }

  def apply(nm: SNM,
            x1: DenseMatrix[Double],
            x2: CSCMatrix[Double]): ScoreTest = {
    Mixed(nm, x1, x2)
  }

  case class Sparse(nm: SNM,
                    x: CSCMatrix[Double]) extends ScoreTest {
    val score = (nm.residuals.toDenseMatrix * x).toDenseVector / nm.a
    lazy val variance = {
      val c = nm.xs
      val IccInv = nm.invInfo * nm.a
      val Igg = (colMultiply(x, nm.b).t * x).toDense
      val Icg = (c(::, *) *:* nm.b).t * x
      val Igc = Icg.t
      (Igg - Igc * IccInv * Icg) / nm.a
    }
  }

  case class Dense(nm: SNM,
                   x: DenseMatrix[Double]) extends ScoreTest {
    val score = x.t * nm.residuals / nm.a
    lazy val variance = {
      val c = nm.xs
      val IccInv = nm.invInfo * nm.a
      val Igg = (x(::, *) *:* nm.b).t * x
      val Icg = (c(::, *) *:* nm.b).t * x
      val Igc = Icg.t
      (Igg - Igc * IccInv * Icg)/nm.a
    }
  }

  case class Mixed(nm: SNM,
                   x1: DenseMatrix[Double],
                   x2: CSCMatrix[Double]) extends ScoreTest {
    private val dense = Dense(nm, x1)
    private val sparse = Sparse(nm, x2)
    val score = DenseVector.vertcat(dense.score, sparse.score)
    lazy val variance = {
      val v1 = dense.variance
      val v4 = sparse.variance
      val v2 = {
        val c = nm.xs
        val IccInv = nm.invInfo * nm.a
        val Igg = (x1(::, *) *:* nm.b).t * x2
        val Icg = (c(::, *) *:* nm.b).t * x2
        val Igc = x1.t * (c(::, *) *:* nm.b).t
        (Igg - Igc * IccInv * Icg) / nm.a
      }
      val v3 = v2.t
      val v12 = DenseMatrix.horzcat(v1, v2)
      val v34 = DenseMatrix.horzcat(v3, v4)
      DenseMatrix.vertcat(v12, v34)
    }
  }

  case class Mock(score: DenseVector[Double],
                  variance: DenseMatrix[Double]) extends ScoreTest
}

@SerialVersionUID(7778780001L)
sealed trait ScoreTest extends HypoTest {
  def score: DenseVector[Double]
  def variance: DenseMatrix[Double]
} 
Example 13
Source File: NewsgroupsPipeline.scala    From keystone   with Apache License 2.0
package keystoneml.pipelines.text

import breeze.linalg.SparseVector
import keystoneml.evaluation.MulticlassClassifierEvaluator
import keystoneml.loaders.NewsgroupsDataLoader
import keystoneml.nodes.learning.NaiveBayesEstimator
import keystoneml.nodes.nlp._
import keystoneml.nodes.stats.TermFrequency
import keystoneml.nodes.util.{CommonSparseFeatures, MaxClassifier}
import org.apache.spark.{SparkConf, SparkContext}
import keystoneml.pipelines.Logging
import scopt.OptionParser
import keystoneml.workflow.Pipeline

object NewsgroupsPipeline extends Logging {
  val appName = "NewsgroupsPipeline"

  def run(sc: SparkContext, conf: NewsgroupsConfig): Pipeline[String, Int] = {

    val trainData = NewsgroupsDataLoader(sc, conf.trainLocation)
    val numClasses = NewsgroupsDataLoader.classes.length

    // Build the classifier estimator
    logInfo("Training classifier")
    val predictor = Trim andThen
        LowerCase() andThen
        Tokenizer() andThen
        NGramsFeaturizer(1 to conf.nGrams) andThen
        TermFrequency(x => 1) andThen
        (CommonSparseFeatures[Seq[String]](conf.commonFeatures), trainData.data) andThen
        (NaiveBayesEstimator[SparseVector[Double]](numClasses), trainData.data, trainData.labels) andThen
        MaxClassifier

    // Evaluate the classifier
    logInfo("Evaluating classifier")

    val testData = NewsgroupsDataLoader(sc, conf.testLocation)
    val testLabels = testData.labels
    val testResults = predictor(testData.data)
    val eval = new MulticlassClassifierEvaluator(numClasses).evaluate(testResults, testLabels)

    logInfo("\n" + eval.summary(NewsgroupsDataLoader.classes))

    predictor
  }

  case class NewsgroupsConfig(
    trainLocation: String = "",
    testLocation: String = "",
    nGrams: Int = 2,
    commonFeatures: Int = 100000)

  def parse(args: Array[String]): NewsgroupsConfig = new OptionParser[NewsgroupsConfig](appName) {
    head(appName, "0.1")
    opt[String]("trainLocation") required() action { (x,c) => c.copy(trainLocation=x) }
    opt[String]("testLocation") required() action { (x,c) => c.copy(testLocation=x) }
    opt[Int]("nGrams") action { (x,c) => c.copy(nGrams=x) }
    opt[Int]("commonFeatures") action { (x,c) => c.copy(commonFeatures=x) }
  }.parse(args, NewsgroupsConfig()).get

  
  def main(args: Array[String]) = {
    val conf = new SparkConf().setAppName(appName)
    conf.setIfMissing("spark.master", "local[2]") // This is a fallback if things aren't set via spark submit.

    val sc = new SparkContext(conf)

    val appConfig = parse(args)
    run(sc, appConfig)

    sc.stop()
  }

} 
Example 14
Source File: AmazonReviewsPipeline.scala    From keystone   with Apache License 2.0
package keystoneml.pipelines.text

import breeze.linalg.SparseVector
import keystoneml.evaluation.BinaryClassifierEvaluator
import keystoneml.loaders.{AmazonReviewsDataLoader, LabeledData}
import keystoneml.nodes.learning.LogisticRegressionEstimator
import keystoneml.nodes.nlp._
import keystoneml.nodes.stats.TermFrequency
import keystoneml.nodes.util.CommonSparseFeatures
import org.apache.spark.sql.SparkSession
import org.apache.spark.{SparkConf, SparkContext}
import keystoneml.pipelines.Logging
import scopt.OptionParser
import keystoneml.workflow.Pipeline

object AmazonReviewsPipeline extends Logging {
  val appName = "AmazonReviewsPipeline"

  def run(spark: SparkSession, conf: AmazonReviewsConfig): Pipeline[String, Double] = {
    val amazonTrainData = AmazonReviewsDataLoader(spark, conf.trainLocation, conf.threshold).labeledData
    val trainData = LabeledData(amazonTrainData.repartition(conf.numParts).cache())

    val training = trainData.data
    val labels = trainData.labels

    // Build the classifier estimator
    val predictor = Trim andThen
        LowerCase() andThen
        Tokenizer() andThen
        NGramsFeaturizer(1 to conf.nGrams) andThen
        TermFrequency(x => 1) andThen
        (CommonSparseFeatures[Seq[String]](conf.commonFeatures), training) andThen
        (LogisticRegressionEstimator[SparseVector[Double]](numClasses = 2, numIters = conf.numIters), training, labels)

    // Evaluate the classifier
    val amazonTestData = AmazonReviewsDataLoader(spark, conf.testLocation, conf.threshold).labeledData
    val testData = LabeledData(amazonTestData.repartition(conf.numParts).cache())
    val testLabels = testData.labels
    val testResults = predictor(testData.data)
    val eval = BinaryClassifierEvaluator.evaluate(testResults.get.map(_ > 0), testLabels.map(_ > 0))

    logInfo("\n" + eval.summary())
    predictor
  }

  case class AmazonReviewsConfig(
    trainLocation: String = "",
    testLocation: String = "",
    threshold: Double = 3.5,
    nGrams: Int = 2,
    commonFeatures: Int = 100000,
    numIters: Int = 20,
    numParts: Int = 512)

  def parse(args: Array[String]): AmazonReviewsConfig = new OptionParser[AmazonReviewsConfig](appName) {
    head(appName, "0.1")
    opt[String]("trainLocation") required() action { (x,c) => c.copy(trainLocation=x) }
    opt[String]("testLocation") required() action { (x,c) => c.copy(testLocation=x) }
    opt[Double]("threshold") action { (x,c) => c.copy(threshold=x)}
    opt[Int]("nGrams") action { (x,c) => c.copy(nGrams=x) }
    opt[Int]("commonFeatures") action { (x,c) => c.copy(commonFeatures=x) }
    opt[Int]("numIters") action { (x,c) => c.copy(numParts=x) }
    opt[Int]("numParts") action { (x,c) => c.copy(numParts=x) }
  }.parse(args, AmazonReviewsConfig()).get

  
  def main(args: Array[String]) = {
    val conf = new SparkConf().setAppName(appName)
    conf.setIfMissing("spark.master", "local[2]") // This is a fallback if things aren't set via spark submit.

    val spark = SparkSession.builder.config(conf).getOrCreate()

    val appConfig = parse(args)
    run(spark, appConfig)

    spark.stop()
  }
} 
Example 15
Source File: AllSparseFeatures.scala    From keystone   with Apache License 2.0
package keystoneml.nodes.util

import breeze.linalg.SparseVector
import org.apache.spark.rdd.RDD
import keystoneml.workflow.Estimator

import scala.reflect.ClassTag


case class AllSparseFeatures[T: ClassTag]() extends Estimator[Seq[(T, Double)], SparseVector[Double]] {
  override def fit(data: RDD[Seq[(T, Double)]]): SparseFeatureVectorizer[T] = {
    val featureOccurrences = data.flatMap(_.map(_._1))
    // zip with unique ids and take the smallest unique id for a given feature to get
    // a deterministic ordering
    val featuresWithUniqueId = featureOccurrences.zipWithUniqueId().reduceByKey {
      (x, y) => Math.min(x, y)
    }
    val featureSpace = featuresWithUniqueId.sortBy(_._2).map(_._1)
        .collect().zipWithIndex.toMap
    new SparseFeatureVectorizer(featureSpace)
  }
} 
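A usage sketch for the estimator above, assuming a live SparkContext `sc` and hypothetical data:

val docs: RDD[Seq[(String, Double)]] = sc.parallelize(Seq(
  Seq(("cat", 1.0), ("dog", 2.0)),
  Seq(("dog", 1.0), ("fish", 3.0))
))
val vectorizer = AllSparseFeatures[String]().fit(docs)
vectorizer(Seq(("dog", 1.0), ("bird", 1.0)))      // SparseVector with one active entry, at dog's index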
Example 16
Source File: SparseFeatureVectorizer.scala    From keystone   with Apache License 2.0
package keystoneml.nodes.util

import breeze.linalg.SparseVector
import keystoneml.workflow.Transformer


class SparseFeatureVectorizer[T](featureSpace: Map[T, Int]) extends Transformer[Seq[(T, Double)], SparseVector[Double]] {
  private def transformVector(in: Seq[(T, Double)], featureSpaceMap: Map[T, Int]): SparseVector[Double] = {
    val features = in.map(f => (featureSpaceMap.get(f._1), f._2))
        .filter(_._1.isDefined)
        .map(f => (f._1.get, f._2.toDouble))
    SparseVector(featureSpaceMap.size)(features:_*)
  }

  override def apply(in: Seq[(T, Double)]): SparseVector[Double] = {
    transformVector(in, featureSpace)
  }
} 
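A small usage sketch of the transformer above, with a hypothetical three-feature space (not taken from the keystone tests):

val featureSpace = Map("cat" -> 0, "dog" -> 1, "fish" -> 2)
val vectorizer = new SparseFeatureVectorizer(featureSpace)

// Known features land at their mapped index; unknown ones ("bird") are silently dropped
val vec = vectorizer(Seq(("dog", 2.0), ("bird", 1.0), ("cat", 0.5)))
// vec == SparseVector(3)((0, 0.5), (1, 2.0))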
Example 17
Source File: NGramsHashingTF.scala    From keystone   with Apache License 2.0
package keystoneml.nodes.nlp

import java.lang.Integer.{rotateLeft => rotl}

import breeze.linalg.SparseVector
import keystoneml.workflow.Transformer

import scala.collection.mutable

// Excerpt: the class declaration for NGramsHashingTF and its MurmurHash3-style helpers
// (mix, finalizeHash, seqSeed) are omitted, which is why they, along with minOrder,
// maxOrder and numFeatures, appear unbound in the methods below.

  private final def avalanche(hash: Int): Int = {
    var h = hash

    h ^= h >>> 16
    h *= 0x85ebca6b
    h ^= h >>> 13
    h *= 0xc2b2ae35
    h ^= h >>> 16

    h
  }

  def nonNegativeMod(x: Int, mod: Int): Int = {
    val rawMod = x % mod
    rawMod + (if (rawMod < 0) mod else 0)
  }

  def apply(line: Seq[String]): SparseVector[Double] = {
    val hashes = new Array[Integer](line.length)
    var i = 0
    while (i < line.length) {
      hashes(i) = line(i).##
      i += 1
    }

    var j = 0
    val termFrequencies = mutable.HashMap.empty[Int, Double]
    i = 0
    while (i + minOrder <= line.length) {
      var order = minOrder
      var h = seqSeed

      j = i
      while (j < i + minOrder) {
        h = mix(h, hashes(j))
        j += 1
      }

      val feature = nonNegativeMod(finalizeHash(h, order), numFeatures)
      termFrequencies.put(feature, termFrequencies.getOrElse(feature, 0.0) + 1.0)

      order = minOrder + 1
      while (order <= maxOrder && i + order <= line.length) {
        h = mix(h, hashes(i + order - 1))
        val feature = nonNegativeMod(finalizeHash(h, order), numFeatures)
        termFrequencies.put(feature, termFrequencies.getOrElse(feature, 0.0) + 1.0)
        order += 1
      }
      i += 1
    }

    SparseVector(numFeatures)(termFrequencies.toSeq:_*)
  }

} 
Example 18
Source File: HashingTF.scala    From keystone   with Apache License 2.0
package keystoneml.nodes.nlp

import breeze.linalg.SparseVector
import keystoneml.workflow.Transformer


case class HashingTF[T <: Seq[Any]](numFeatures: Int) extends Transformer[T, SparseVector[Double]] {
  def nonNegativeMod(x: Int, mod: Int): Int = {
    val rawMod = x % mod
    rawMod + (if (rawMod < 0) mod else 0)
  }

  def apply(document: T): SparseVector[Double] = {
    val termFrequencies = scala.collection.mutable.HashMap.empty[Int, Double]
    document.foreach { term =>
      val i = nonNegativeMod(term.##, numFeatures)
      termFrequencies.put(i, termFrequencies.getOrElse(i, 0.0) + 1.0)
    }

    SparseVector(numFeatures)(termFrequencies.toSeq:_*)
  }
} 
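A brief usage sketch (hypothetical values): each term is hashed into a bounded index space, so repeated terms accumulate counts and distinct terms can collide.

val tf = HashingTF[Seq[String]](numFeatures = 16)
val vec = tf(Seq("spark", "breeze", "spark"))
// vec is a SparseVector[Double] of length 16; the bucket for "spark" holds 2.0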
Example 19
Source File: MLlibUtils.scala    From keystone   with Apache License 2.0
package keystoneml.utils

import breeze.linalg.{SparseVector, DenseMatrix, DenseVector}

// Excerpt: the enclosing object (presumably `object MLlibUtils`, per the file name) is
// restored below; its other members are omitted.
object MLlibUtils {

  /** Converts a Breeze dense or sparse vector into the corresponding MLlib vector type. */
  def breezeVectorToMLlib(breezeVector: breeze.linalg.Vector[Double]): org.apache.spark.mllib.linalg.Vector = {
    breezeVector match {
      case v: DenseVector[Double] =>
        if (v.offset == 0 && v.stride == 1 && v.length == v.data.length) {
          new org.apache.spark.mllib.linalg.DenseVector(v.data)
        } else {
          new org.apache.spark.mllib.linalg.DenseVector(v.toArray)  // Can't use underlying array directly, so make a new one
        }
      case v: SparseVector[Double] =>
        if (v.index.length == v.used) {
          new org.apache.spark.mllib.linalg.SparseVector(v.length, v.index, v.data)
        } else {
          new org.apache.spark.mllib.linalg.SparseVector(v.length, v.index.slice(0, v.used), v.data.slice(0, v.used))
        }
      case v: breeze.linalg.Vector[_] =>
        sys.error("Unsupported Breeze vector type: " + v.getClass.getName)
    }
  }

} 
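A short conversion sketch (the MLlibUtils object name is inferred from the file name):

import breeze.linalg.SparseVector

val bv = SparseVector(4)((1, 2.0), (3, 4.0))
val mllibVec = MLlibUtils.breezeVectorToMLlib(bv)
// mllibVec is an org.apache.spark.mllib.linalg.SparseVector of size 4
// with indices [1, 3] and values [2.0, 4.0]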
Example 20
Source File: SparseArray.scala    From scalismo-faces   with Apache License 2.0
package scalismo.faces.numerics

import java.util

import breeze.linalg.SparseVector


private[numerics] class SparseArray(var index: Array[Int], var data: Array[Double], var nnz: Int, val length: Int) {
  require(nnz <= length, "too many non zeros")
  require(data.length == index.length, "data and index have different length")
  require(data.length >= nnz, "data array is too short")

  def activeSize = nnz

  def update(i: Int, v: Double): Unit = {
    val offset = findOffset(i)
    if (offset >= 0)
      data(offset) = v
    else {
      val insert = ~offset
      nnz += 1
      if (nnz > index.length) reallocate()

      // insert
      // move right part
      System.arraycopy(index, insert, index, insert + 1, nnz - insert - 1)
      System.arraycopy(data, insert, data, insert + 1, nnz - insert - 1)
      // insert data
      index(insert) = i
      data(insert) = v
    }
  }

  private def reallocate() = {
    val newLength = math.max(nnz + 1, index.length * 2)
    val _index = new Array[Int](newLength)
    val _data = new Array[Double](newLength)
    System.arraycopy(index, 0, _index, 0, index.length)
    System.arraycopy(data, 0, _data, 0, data.length)
    index = _index
    data = _data
  }

  def apply(i: Int): Double = {
    val offset = findOffset(i)
    if (offset >= 0)
      data(offset)
    else
      0.0
  }

  def findOffset(i: Int): Int = util.Arrays.binarySearch(index, 0, nnz, i)

  def toDense: Array[Double] = {
    val arr = new Array[Double](length)
    var i = 0
    while (i < nnz) {
      val ind = index(i)
      arr(ind) = data(i)
      i += 1
    }
    arr
  }
}

object SparseArray {
  def apply(vector: SparseVector[Double]): SparseArray = {
    val nnz = vector.activeSize
    val index = new Array[Int](nnz)
    val data = new Array[Double](nnz)
    System.arraycopy(vector.index, 0, index, 0, nnz)
    System.arraycopy(vector.data, 0, data, 0, nnz)
    new SparseArray(index, data, nnz, vector.length)
  }
}
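A short usage sketch of the companion apply and the update/toDense methods above (values are arbitrary):

import breeze.linalg.SparseVector

val sa = SparseArray(SparseVector(5)((1, 2.0), (4, 3.0)))
sa.update(3, 7.0)          // inserts a new non-zero, reallocating the backing arrays if needed
sa(2)                      // 0.0 (indices that are not stored read back as zero)
sa.toDense                 // Array(0.0, 2.0, 0.0, 7.0, 3.0)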