org.apache.spark.ml.feature.CountVectorizerModel Scala Examples

The following examples show how to use org.apache.spark.ml.feature.CountVectorizerModel, the spark.ml model that maps arrays of tokens to sparse vectors of token counts. Each example's header names the open-source project and license it was taken from.
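Every example obtains a CountVectorizerModel in one of two ways: by fitting a CountVectorizer on a corpus, or by constructing the model directly from an a-priori vocabulary. A fitted model can also be saved and reloaded through the standard spark.ml persistence API; a minimal sketch, assuming a fitted model named cvModel and an illustrative path:

import org.apache.spark.ml.feature.CountVectorizerModel

// persist the fitted model (the path is illustrative)
cvModel.write.overwrite().save("/tmp/count-vectorizer-model")
// reload it later with the same vocabulary and parameters
val reloaded = CountVectorizerModel.load("/tmp/count-vectorizer-model")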
Example 1
Source File: CountVectorizerExample.scala    From drizzle-spark   with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.feature.{CountVectorizer, CountVectorizerModel}
// $example off$
import org.apache.spark.sql.SparkSession

object CountVectorizerExample {
  def main(args: Array[String]) {
    val spark = SparkSession
      .builder
      .appName("CountVectorizerExample")
      .getOrCreate()

    // $example on$
    val df = spark.createDataFrame(Seq(
      (0, Array("a", "b", "c")),
      (1, Array("a", "b", "b", "c", "a"))
    )).toDF("id", "words")

    // fit a CountVectorizerModel from the corpus
    val cvModel: CountVectorizerModel = new CountVectorizer()
      .setInputCol("words")
      .setOutputCol("features")
      .setVocabSize(3)
      .setMinDF(2)
      .fit(df)

    // alternatively, define a CountVectorizerModel with an a-priori vocabulary
    val cvm = new CountVectorizerModel(Array("a", "b", "c"))
      .setInputCol("words")
      .setOutputCol("features")

    cvModel.transform(df).show(false)
    // $example off$

    spark.stop()
  }
}
// scalastyle:on println 
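For reference, on this toy corpus the fitted vocabulary is ["a", "b", "c"] (with minDF = 2 every term survives, since each appears in both rows), and cvModel.transform(df).show(false) prints a table along these lines (index order may differ where term counts tie):

+---+---------------+-------------------------+
|id |words          |features                 |
+---+---------------+-------------------------+
|0  |[a, b, c]      |(3,[0,1,2],[1.0,1.0,1.0])|
|1  |[a, b, b, c, a]|(3,[0,1,2],[2.0,2.0,1.0])|
+---+---------------+-------------------------+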
Example 2
Source File: LocalCountVectorizerModel.scala    From spark-ml-serving   with Apache License 2.0
package io.hydrosphere.spark_ml_serving.preprocessors

import io.hydrosphere.spark_ml_serving.TypedTransformerConverter
import io.hydrosphere.spark_ml_serving.common._
import org.apache.spark.ml.feature.CountVectorizerModel
import org.apache.spark.ml.linalg.Vectors

import scala.collection.mutable

class LocalCountVectorizerModel(override val sparkTransformer: CountVectorizerModel)
  extends LocalTransformer[CountVectorizerModel] {
  override def transform(localData: LocalData): LocalData = {
    import io.hydrosphere.spark_ml_serving.common.utils.DataUtils._
    val dict  = sparkTransformer.vocabulary.zipWithIndex.toMap
    val minTf = sparkTransformer.getMinTF

    localData.column(sparkTransformer.getInputCol) match {
      case Some(column) =>
        val newCol = column.data.map(_.asInstanceOf[List[String]]).map { arr =>
          val termCounts = mutable.HashMap.empty[Int, Double]
          var tokenCount = 0L
          arr.foreach { token =>
            dict.get(token) foreach { index =>
              val storedValue = termCounts.getOrElseUpdate(index, 0.0)
              termCounts.update(index, storedValue + 1.0)
            }
            tokenCount += 1
          }
          val eTF = if (minTf >= 1.0) minTf else tokenCount * minTf
          val eCounts = if (sparkTransformer.getBinary) {
            termCounts.filter(_._2 >= eTF).map(_._1 -> 1.0).toSeq
          } else {
            termCounts.filter(_._2 >= eTF).toSeq
          }

          Vectors.sparse(dict.size, eCounts.toList).toList
        }
        localData.withColumn(LocalDataColumn(sparkTransformer.getOutputCol, newCol))
      case None => localData
    }
  }
}

object LocalCountVectorizerModel
  extends SimpleModelLoader[CountVectorizerModel]
  with TypedTransformerConverter[CountVectorizerModel] {

  override def build(metadata: Metadata, data: LocalData): CountVectorizerModel = {
    val vocabulary = data.column("vocabulary").get.data.head.asInstanceOf[Seq[String]].toArray
    val inst       = new CountVectorizerModel(metadata.uid, vocabulary)
    inst
      .setInputCol(metadata.paramMap("inputCol").toString)
      .setOutputCol(metadata.paramMap("outputCol").toString)
      .set(inst.binary, metadata.paramMap("binary").asInstanceOf[Boolean])
      .set(inst.minDF, metadata.paramMap("minDF").toString.toDouble)
      .set(inst.minTF, metadata.paramMap("minTF").toString.toDouble)
      .set(inst.vocabSize, metadata.paramMap("vocabSize").asInstanceOf[Number].intValue())
  }

  override implicit def toLocal(
    transformer: CountVectorizerModel
  ) = new LocalCountVectorizerModel(transformer)
} 
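The heart of this local transform is plain term counting against the model's fixed vocabulary, followed by the minTF cutoff: an absolute count when minTF >= 1.0, otherwise a fraction of the document's token count. A self-contained sketch of the counting step, with illustrative names:

import scala.collection.mutable

// Count occurrences of in-vocabulary tokens; unknown tokens are skipped.
def countTerms(vocab: Map[String, Int], tokens: Seq[String]): Map[Int, Double] = {
  val counts = mutable.HashMap.empty[Int, Double]
  tokens.foreach { token =>
    vocab.get(token).foreach { idx =>
      counts.update(idx, counts.getOrElse(idx, 0.0) + 1.0)
    }
  }
  counts.toMap
}

// countTerms(Map("a" -> 0, "b" -> 1, "c" -> 2), Seq("a", "b", "b", "c", "a"))
// returns Map(0 -> 2.0, 1 -> 2.0, 2 -> 1.0)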
Example 3
Source File: CountVectorizerOp.scala    From mleap   with Apache License 2.0
package org.apache.spark.ml.bundle.ops.feature

import ml.combust.bundle.BundleContext
import ml.combust.bundle.dsl._
import ml.combust.bundle.op.{OpModel, OpNode}
import org.apache.spark.ml.bundle.{ParamSpec, SimpleParamSpec, SimpleSparkOp, SparkBundleContext}
import org.apache.spark.ml.feature.CountVectorizerModel
import org.apache.spark.ml.param.Param


class CountVectorizerOp extends SimpleSparkOp[CountVectorizerModel] {
  override val Model: OpModel[SparkBundleContext, CountVectorizerModel] = new OpModel[SparkBundleContext, CountVectorizerModel] {
    override val klazz: Class[CountVectorizerModel] = classOf[CountVectorizerModel]

    override def opName: String = Bundle.BuiltinOps.feature.count_vectorizer

    override def store(model: Model, obj: CountVectorizerModel)
                      (implicit context: BundleContext[SparkBundleContext]): Model = {
      model.withValue("vocabulary", Value.stringList(obj.vocabulary)).
        withValue("binary", Value.boolean(obj.getBinary)).
        withValue("min_tf", Value.double(obj.getMinTF))
    }

    override def load(model: Model)
                     (implicit context: BundleContext[SparkBundleContext]): CountVectorizerModel = {
      new CountVectorizerModel(uid = "",
        vocabulary = model.value("vocabulary").getStringList.toArray).
        setBinary(model.value("binary").getBoolean).
        setMinTF(model.value("min_tf").getDouble)
    }
  }

  override def sparkLoad(uid: String, shape: NodeShape, model: CountVectorizerModel): CountVectorizerModel = {
    new CountVectorizerModel(uid = uid,
      vocabulary = model.vocabulary)
      .setBinary(model.getBinary)
      .setMinTF(model.getMinTF)
  }

  override def sparkInputs(obj: CountVectorizerModel): Seq[ParamSpec] = {
    Seq("input" -> obj.inputCol)
  }

  override def sparkOutputs(obj: CountVectorizerModel): Seq[SimpleParamSpec] = {
    Seq("output" -> obj.outputCol)
  }
} 
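Note that store serializes only the vocabulary, binary, and min_tf values. Parameters such as minDF and vocabSize shape the vocabulary while fitting but play no role at transform time, so the reloaded model behaves identically without them.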
Example 4
Source File: CountVectorizerExample.scala    From sparkoscope   with Apache License 2.0
This example is identical, line for line, to Example 1.
Example 5
Source File: CountVectorizerExample.scala    From multi-tenancy-spark   with Apache License 2.0
This example is identical, line for line, to Example 1.
Example 6
Source File: OpCountVectorizer.scala    From TransmogrifAI   with BSD 3-Clause "New" or "Revised" License
package com.salesforce.op.stages.impl.feature

import com.salesforce.op.UID
import com.salesforce.op.features.types._
import com.salesforce.op.stages.sparkwrappers.generic.SwUnaryModel
import com.salesforce.op.stages.sparkwrappers.specific.OpEstimatorWrapper
import com.salesforce.op.utils.spark.{OpVectorColumnMetadata, OpVectorMetadata}
import org.apache.spark.ml.feature.{CountVectorizer, CountVectorizerModel}
import org.apache.spark.sql.Dataset


class OpCountVectorizer(uid: String = UID[OpCountVectorizer])
  extends OpEstimatorWrapper[TextList, OPVector, CountVectorizer, CountVectorizerModel](
    estimator = new CountVectorizer(), uid = uid
  ) {

  def setVocabSize(value: Int): this.type = {
    getSparkMlStage().get.setVocabSize(value)
    this
  }

  override def fit(dataset: Dataset[_]): SwUnaryModel[TextList, OPVector, CountVectorizerModel] = {
    val model = super.fit(dataset)
    val vocab = model.getSparkMlStage().map(_.vocabulary).getOrElse(Array.empty[String])
    val tf = getTransientFeatures()

    val metadataCols = for {
      f <- tf
      word <- vocab
    } yield OpVectorColumnMetadata(
      parentFeatureName = Seq(f.name),
      parentFeatureType = Seq(f.typeName),
      grouping = None, // TODO do we want to test each word for label pred?
      indicatorValue = Option(word)
    )

    model.setMetadata(
      OpVectorMetadata(getOutputFeatureName, metadataCols,
        Transmogrifier.inputFeaturesToHistory(tf, stageName)).toMetadata
    )
    model
  }
} 
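The fit override is what makes this wrapper feature-aware: after fitting it reads the learned vocabulary off the underlying Spark stage and records one OpVectorColumnMetadata entry per (input feature, vocabulary word) pair, so each column of the output vector can be traced back to the token it counts.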
Example 7
Source File: CountVectorizerExample.scala    From spark1.52   with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.feature.{CountVectorizer, CountVectorizerModel}
// $example off$
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SQLContext

object CountVectorizerExample {
  def main(args: Array[String]) {
    val conf = new SparkConf().setAppName("CountVectorizerExample")
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)

    // $example on$
    val df = sqlContext.createDataFrame(Seq(
      (0, Array("a", "b", "c")),
      (1, Array("a", "b", "b", "c", "a"))
    )).toDF("id", "words")

    // define a CountVectorizerModel with an a-priori vocabulary
    val cvm = new CountVectorizerModel(Array("a", "b", "c"))
      .setInputCol("words")
      .setOutputCol("features")

    // transform() converts one DataFrame into another DataFrame
    cvm.transform(df).select("features").show()
    // $example off$

    sc.stop()
  }
}
// scalastyle:on println
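Because cvm is built from the fixed vocabulary ["a", "b", "c"], no fitting pass is required: tokens map directly to their vocabulary indices, and the second row comes out as (3,[0,1,2],[2.0,2.0,1.0]).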
Example 8
Source File: CountVectorizerExample.scala    From Spark-2.3.1   with Apache License 2.0
This example is identical, line for line, to Example 1.
Example 9
Source File: CountVectorizerExample.scala    From BigDatalog   with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.feature.{CountVectorizer, CountVectorizerModel}
// $example off$
import org.apache.spark.sql.SQLContext
import org.apache.spark.{SparkConf, SparkContext}


object CountVectorizerExample {
  def main(args: Array[String]) {
    val conf = new SparkConf().setAppName("CountVectorizerExample")
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)

    // $example on$
    val df = sqlContext.createDataFrame(Seq(
      (0, Array("a", "b", "c")),
      (1, Array("a", "b", "b", "c", "a"))
    )).toDF("id", "words")

    // fit a CountVectorizerModel from the corpus
    val cvModel: CountVectorizerModel = new CountVectorizer()
      .setInputCol("words")
      .setOutputCol("features")
      .setVocabSize(3)
      .setMinDF(2)
      .fit(df)

    // alternatively, define CountVectorizerModel with a-priori vocabulary
    val cvm = new CountVectorizerModel(Array("a", "b", "c"))
      .setInputCol("words")
      .setOutputCol("features")

    cvModel.transform(df).select("features").show()
    // $example off$

    sc.stop()
  }
}
// scalastyle:on println 
Example 10
package com.chapter11.SparkMachineLearning
import org.apache.spark.sql.SparkSession
import org.apache.spark.ml.feature.{ CountVectorizer, CountVectorizerModel }

object CountVectorizerDemo {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .master("local[*]")
      .config("spark.sql.warehouse.dir", "E:/Exp/")
      .appName("CountVectorizerDemo")
      .getOrCreate()

    val df = spark.createDataFrame(
      Seq((0, Array("Jason", "David")),
        (1, Array("David", "Martin")),
        (2, Array("Martin", "Jason")),
        (3, Array("Jason", "Daiel")),
        (4, Array("Daiel", "Martin")),
        (5, Array("Moahmed", "Jason")),
        (6, Array("David", "David")),
        (7, Array("Jason", "Martin")))).toDF("id", "name")

    df.show(false)

    // fit a CountVectorizerModel from the corpus
    val cvModel: CountVectorizerModel = new CountVectorizer()
      .setInputCol("name")
      .setOutputCol("features")
      .setVocabSize(3)
      .setMinDF(2)
      .fit(df)

    val feature = cvModel.transform(df)
    feature.show(false)

    spark.stop()
  }
}
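Here minDF = 2 drops names appearing in fewer than two rows (such as Moahmed), and vocabSize = 3 then keeps only the three most frequent survivors (Jason, Martin, and David), so less frequent names like Daiel contribute nothing to the feature vectors.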