org.apache.spark.ml.feature.CountVectorizer Scala Examples

The following examples show how to use org.apache.spark.ml.feature.CountVectorizer. Each example comes from an open-source project; the header above each one names the source file, the project, and its license.
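Before the project examples, here is a minimal self-contained sketch of the two ways to obtain a CountVectorizerModel: fitting a CountVectorizer estimator on a corpus, or constructing the model directly from a fixed vocabulary. Object and app names are illustrative; the defaults noted in comments follow the Spark ML documentation.

import org.apache.spark.ml.feature.{CountVectorizer, CountVectorizerModel}
import org.apache.spark.sql.SparkSession

object CountVectorizerQuickRef {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder.appName("CountVectorizerQuickRef").getOrCreate()

    // a corpus is a DataFrame with a column of Array[String] tokens
    val docs = spark.createDataFrame(Seq(
      (0, Array("a", "b", "c")),
      (1, Array("a", "b", "b", "c", "a"))
    )).toDF("id", "words")

    // Estimator route: learn the vocabulary from the corpus
    val learned: CountVectorizerModel = new CountVectorizer()
      .setInputCol("words")
      .setOutputCol("features")
      .setVocabSize(262144) // cap on vocabulary size; 2^18 is the Spark default
      .setMinDF(1.0)        // minimum number of documents a term must appear in
      .fit(docs)

    // Model route: supply an a-priori vocabulary, no fitting required
    val fixed = new CountVectorizerModel(Array("a", "b", "c"))
      .setInputCol("words")
      .setOutputCol("features")

    learned.transform(docs).show(false) // appends a sparse count Vector column
    fixed.transform(docs).show(false)

    spark.stop()
  }
}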
Example 1
Source File: CountVectorizerExample.scala    From drizzle-spark   with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.feature.{CountVectorizer, CountVectorizerModel}
// $example off$
import org.apache.spark.sql.SparkSession

object CountVectorizerExample {
  def main(args: Array[String]) {
    val spark = SparkSession
      .builder
      .appName("CountVectorizerExample")
      .getOrCreate()

    // $example on$
    val df = spark.createDataFrame(Seq(
      (0, Array("a", "b", "c")),
      (1, Array("a", "b", "b", "c", "a"))
    )).toDF("id", "words")

    // fit a CountVectorizerModel from the corpus
    val cvModel: CountVectorizerModel = new CountVectorizer()
      .setInputCol("words")
      .setOutputCol("features")
      .setVocabSize(3)
      .setMinDF(2)
      .fit(df)

    // alternatively, define CountVectorizerModel with a-priori vocabulary
    val cvm = new CountVectorizerModel(Array("a", "b", "c"))
      .setInputCol("words")
      .setOutputCol("features")

    cvModel.transform(df).show(false)
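    // with this corpus the fitted vocabulary is (a, b, c): row 0 yields the
    // count vector (1.0, 1.0, 1.0) and row 1 yields (2.0, 2.0, 1.0)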
    // $example off$

    spark.stop()
  }
}
// scalastyle:on println 
Example 2
Source File: TokenizerSuite.scala    From spark-nkp   with Apache License 2.0
package com.github.uosdmlab.nkp

import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.feature.{CountVectorizer, IDF}
import org.apache.spark.sql.SparkSession
import org.scalatest.{BeforeAndAfterAll, BeforeAndAfter, FunSuite}


class TokenizerSuite extends FunSuite with BeforeAndAfterAll with BeforeAndAfter {

  private var tokenizer: Tokenizer = _

  private val spark: SparkSession =
    SparkSession.builder()
      .master("local[2]")
      .appName("Tokenizer Suite")
      .getOrCreate

  spark.sparkContext.setLogLevel("WARN")

  import spark.implicits._

  override protected def afterAll(): Unit = {
    try {
      spark.stop
    } finally {
      super.afterAll()
    }
  }

  before {
    tokenizer = new Tokenizer()
      .setInputCol("text")
      .setOutputCol("words")
  }

  private val df = spark.createDataset(
    Seq(
      "아버지가방에들어가신다.",
      "사랑해요 제플린!",
      "스파크는 재밌어",
      "나는야 데이터과학자",
      "데이터야~ 놀자~"
    )
  ).toDF("text")

  test("Default parameters") {
    assert(tokenizer.getFilter sameElements Array.empty[String])
  }

  test("Basic operation") {
    val words = tokenizer.transform(df)

    assert(df.count == words.count)
    assert(words.schema.fieldNames.contains(tokenizer.getOutputCol))
  }

  test("POS filter") {
    val nvTokenizer = new Tokenizer()
      .setInputCol("text")
      .setOutputCol("nvWords")
      .setFilter("N", "V")

    val words = tokenizer.transform(df).join(nvTokenizer.transform(df), "text")

    assert(df.count == words.count)
    assert(words.schema.fieldNames.contains(nvTokenizer.getOutputCol))
    assert(words.where(s"SIZE(${tokenizer.getOutputCol}) < SIZE(${nvTokenizer.getOutputCol})").count == 0)
  }

  test("TF-IDF pipeline") {
    tokenizer.setFilter("N")

    val cntVec = new CountVectorizer()
      .setInputCol("words")
      .setOutputCol("tf")

    val idf = new IDF()
      .setInputCol("tf")
      .setOutputCol("tfidf")

    val pipe = new Pipeline()
      .setStages(Array(tokenizer, cntVec, idf))

    val pipeModel = pipe.fit(df)

    val result = pipeModel.transform(df)

    assert(result.count == df.count)

    val fields = result.schema.fieldNames
    assert(fields.contains(tokenizer.getOutputCol))
    assert(fields.contains(cntVec.getOutputCol))
    assert(fields.contains(idf.getOutputCol))

    result.show
  }
} 
Example 3
Source File: CountVectorizerParitySpec.scala    From mleap   with Apache License 2.0
package org.apache.spark.ml.parity.feature

import org.apache.spark.ml.{Pipeline, Transformer}
import org.apache.spark.ml.feature.{CountVectorizer, Tokenizer}
import org.apache.spark.ml.parity.SparkParityBase
import org.apache.spark.sql._


class CountVectorizerParitySpec extends SparkParityBase {
  override val dataset: DataFrame = baseDataset.select("loan_title")
  override val sparkTransformer: Transformer = new Pipeline().setStages(Array(
    new Tokenizer()
      .setInputCol("loan_title")
      .setOutputCol("loan_title_tokens"),
    new CountVectorizer()
      .setInputCol("loan_title_tokens")
      .setOutputCol("loan_title_token_counts")
      .setMinTF(2)
  )).fit(dataset)
} 
Example 4
Source File: MinMaxScalerPipelineParitySpec.scala    From mleap   with Apache License 2.0
package org.apache.spark.ml.parity.feature

import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.feature.{CountVectorizer, MinMaxScaler, QuantileDiscretizer, VectorAssembler}
import org.apache.spark.ml.parity.SparkParityBase
import org.apache.spark.sql._

class MinMaxScalerPipelineParitySpec extends SparkParityBase {

  private val getKeys: Map[String, Double] => Seq[String] = _.keySet.toSeq

  val keyUdf = functions.udf(getKeys)

  override val dataset = spark.createDataFrame(Seq(
    (Array("1"), 1.0, Map("a" -> 0.1, "b" -> 0.2, "c" -> 0.3), 1),
    (Array("2"), 10.0, Map("d" -> 0.1, "e" -> 0.2, "c" -> 0.3), 0),
    (Array("3"), 20.0, Map("x" -> 0.1, "a" -> 0.2, "b" -> 0.3), 0),
    (Array("4"), 15.0, Map("c" -> 0.1, "b" -> 0.2, "w" -> 0.3), 0),
    (Array("5"), 18.0, Map("c" -> 0.1, "b" -> 0.2, "w" -> 0.3), 0),
    (Array("6"), 25.0, Map("c" -> 0.1, "b" -> 0.2, "w" -> 0.3), 1),
    (Array("6"), 5.0, Map("a" -> 0.1, "b" -> 0.2, "d" -> 0.3), 0),
    (Array("7"), 30.0, Map("c" -> 0.1, "b" -> 0.2, "w" -> 0.3), 0))
  )
    .toDF("book_id", "pv", "myInputCol0", "label")
    .withColumn("myInputCol", keyUdf(functions.col("myInputCol0")))
    .drop("myInputCol0")

  override val sparkTransformer = new Pipeline()
    .setStages(Array(
      new CountVectorizer()
        .setInputCol("book_id")
        .setOutputCol("book_id_vec")
        .setMinDF(1)
        .setMinTF(1)
        .setBinary(true),
      new QuantileDiscretizer()
        .setInputCol("pv")
        .setOutputCol("pv_bucket")
        .setNumBuckets(3),
      new CountVectorizer()
        .setInputCol("myInputCol")
        .setOutputCol("myInputCol1_vec")
        .setMinDF(1)
        .setMinTF(1)
        .setBinary(true),
      new VectorAssembler()
        .setInputCols(Array("pv_bucket", "book_id_vec", "myInputCol1_vec"))
        .setOutputCol("vectorFeature"),
      new MinMaxScaler()
        .setInputCol("vectorFeature")
        .setOutputCol("scaledFeatures")
    )).fit(dataset)
} 
Example 5
Source File: LDAParitySpec.scala    From mleap   with Apache License 2.0
package org.apache.spark.ml.parity.clustering

import org.apache.spark.ml.{Pipeline, Transformer}
import org.apache.spark.ml.clustering.LDA
import org.apache.spark.ml.feature.{CountVectorizer, StopWordsRemover, Tokenizer}
import org.apache.spark.ml.linalg.Vector
import org.apache.spark.ml.parity.SparkParityBase
import org.apache.spark.sql.DataFrame
import org.scalatest.Ignore


@Ignore
class LDAParitySpec extends SparkParityBase {
  override val dataset: DataFrame = textDataset.select("text")

  val tokenizer = new Tokenizer().setInputCol("text").setOutputCol("words")

  val remover = new StopWordsRemover()
    .setInputCol(tokenizer.getOutputCol)
    .setOutputCol("words_filtered")

  val cv = new CountVectorizer().setInputCol("words_filtered").setOutputCol("features").setVocabSize(50000)

  val lda = new LDA().setK(5).setMaxIter(2)

  override val sparkTransformer: Transformer = new Pipeline().setStages(Array(tokenizer, remover, cv, lda)).fit(dataset)

  override def equalityTest(sparkDataset: DataFrame,
                            mleapDataset: DataFrame): Unit = {
    val sparkPredictionCol = sparkDataset.schema.fieldIndex("topicDistribution")
    val mleapPredictionCol = mleapDataset.schema.fieldIndex("topicDistribution")

    sparkDataset.collect().zip(mleapDataset.collect()).foreach {
      case (sv, mv) =>
        val sparkPrediction = sv.getAs[Vector](sparkPredictionCol)
        val mleapPrediction = mv.getAs[Vector](mleapPredictionCol)

        sparkPrediction.toArray.zip(mleapPrediction.toArray).foreach {
          case (s, m) => assert(Math.abs(m - s) < 0.001)
        }
    }
  }
} 
Example 6
Source File: CountVectorizerExample.scala    From sparkoscope   with Apache License 2.0

(Code identical to Example 1; this fork carries the same upstream Spark example verbatim.)
Example 7
Source File: Preprocessor.scala    From CkoocNLP   with Apache License 2.0
package functions

import config.paramconf.PreprocessParams
import functions.clean.Cleaner
import functions.segment.Segmenter
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.feature.{CountVectorizer, IDF, StopWordsRemover, StringIndexer}
import org.apache.spark.sql.DataFrame


// NOTE: the enclosing declaration was truncated in the original listing; an
// object wrapper is assumed here so the snippet compiles as shown.
object Preprocessor {

  def preprocess(data: DataFrame): Pipeline = {
    val spark = data.sparkSession
    val params = new PreprocessParams

    val indexModel = new StringIndexer()
      .setHandleInvalid(params.handleInvalid)
      .setInputCol("label")
      .setOutputCol("indexedLabel")
      .fit(data)

    val cleaner = new Cleaner()
      .setFanJian(params.fanjian)
      .setQuanBan(params.quanban)
      .setMinLineLen(params.minLineLen)
      .setInputCol("content")
      .setOutputCol("cleand")

    val segmenter = new Segmenter()
      .isAddNature(params.addNature)
      .isDelEn(params.delEn)
      .isDelNum(params.delNum)
      .isNatureFilter(params.natureFilter)
      .setMinTermLen(params.minTermLen)
      .setMinTermNum(params.minTermNum)
      .setSegType(params.segmentType)
      .setInputCol(cleaner.getOutputCol)
      .setOutputCol("segmented")

    val stopwords = spark.sparkContext.textFile(params.stopwordFilePath).collect()
    val remover = new StopWordsRemover()
      .setStopWords(stopwords)
      .setInputCol(segmenter.getOutputCol)
      .setOutputCol("removed")

    val vectorizer = new CountVectorizer()
      .setMinTF(params.minTF)
      .setVocabSize(params.vocabSize)
      .setInputCol(remover.getOutputCol)
      .setOutputCol("vectorized")

    val idf = new IDF()
      .setMinDocFreq(params.minDocFreq)
      .setInputCol(vectorizer.getOutputCol)
      .setOutputCol("features")

    val stages = Array(cleaner, indexModel, segmenter, remover, vectorizer, idf)
    new Pipeline().setStages(stages)
  }
} 
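preprocess returns an unfitted Pipeline, so callers fit it on their corpus. A minimal usage sketch, assuming the object wrapper reconstructed above and a corpus DataFrame that has the "label" and "content" columns the stages expect:

val model = Preprocessor.preprocess(corpus).fit(corpus)
val prepared = model.transform(corpus)  // adds "indexedLabel", "features", etc.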
Example 8
Source File: CountVectorizerExample.scala    From multi-tenancy-spark   with Apache License 2.0

(Code identical to Example 1; this fork carries the same upstream Spark example verbatim.)
Example 9
Source File: OpCountVectorizer.scala    From TransmogrifAI   with BSD 3-Clause "New" or "Revised" License
package com.salesforce.op.stages.impl.feature

import com.salesforce.op.UID
import com.salesforce.op.features.types._
import com.salesforce.op.stages.sparkwrappers.generic.SwUnaryModel
import com.salesforce.op.stages.sparkwrappers.specific.OpEstimatorWrapper
import com.salesforce.op.utils.spark.{OpVectorColumnMetadata, OpVectorMetadata}
import org.apache.spark.ml.feature.{CountVectorizer, CountVectorizerModel}
import org.apache.spark.sql.Dataset


// NOTE: the class declaration was truncated in the original listing. In
// TransmogrifAI this stage wraps Spark's CountVectorizer via OpEstimatorWrapper;
// the signature below is reconstructed to match the types in the overridden
// fit and may differ in detail from the source.
class OpCountVectorizer(uid: String = UID[OpCountVectorizer])
  extends OpEstimatorWrapper[TextList, OPVector, CountVectorizer, CountVectorizerModel](
    estimator = new CountVectorizer(), uid = uid) {

  def setVocabSize(value: Int): this.type = {
    getSparkMlStage().get.setVocabSize(value)
    this
  }

  override def fit(dataset: Dataset[_]): SwUnaryModel[TextList, OPVector, CountVectorizerModel] = {
    val model = super.fit(dataset)
    val vocab = model.getSparkMlStage().map(_.vocabulary).getOrElse(Array.empty[String])
    val tf = getTransientFeatures()

    val metadataCols = for {
      f <- tf
      word <- vocab
    } yield OpVectorColumnMetadata(
      parentFeatureName = Seq(f.name),
      parentFeatureType = Seq(f.typeName),
      grouping = None, // TODO do we want to test each word for label pred?
      indicatorValue = Option(word)
    )

    model.setMetadata(
      OpVectorMetadata(getOutputFeatureName, metadataCols,
        Transmogrifier.inputFeaturesToHistory(tf, stageName)).toMetadata
    )
    model
  }
} 
Example 10
Source File: CountVectorizerExample.scala    From spark1.52   with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.feature.{CountVectorizer, CountVectorizerModel}
// $example off$
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.sql.types.StringType
import org.apache.spark.sql.{SQLContext, DataFrame}

// NOTE: the object declaration and setup were truncated in the original
// listing; they are restored here to match the identical Spark 1.x example
// shown in Example 12.
object CountVectorizerExample {
  def main(args: Array[String]) {
    val conf = new SparkConf().setAppName("CountVectorizerExample")
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)

    // $example on$
    val df = sqlContext.createDataFrame(Seq(
      (0, Array("a", "b", "c")),
      (1, Array("a", "b", "b", "c", "a"))
    )).toDF("id", "words")

    // define a CountVectorizerModel with an a-priori vocabulary
    val cvm = new CountVectorizerModel(Array("a", "b", "c"))
      .setInputCol("words")
      .setOutputCol("features")

    // transform() converts one DataFrame into another DataFrame
    cvm.transform(df).select("features").show()
    // $example off$

    sc.stop()
  }
}
// scalastyle:on println 
Example 11
Source File: CountVectorizerExample.scala    From Spark-2.3.1   with Apache License 2.0

(Code identical to Example 1; this fork carries the same upstream Spark example verbatim.)
Example 12
Source File: CountVectorizerExample.scala    From BigDatalog   with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.feature.{CountVectorizer, CountVectorizerModel}
// $example off$
import org.apache.spark.sql.SQLContext
import org.apache.spark.{SparkConf, SparkContext}


object CountVectorizerExample {
  def main(args: Array[String]) {
    val conf = new SparkConf().setAppName("CountVectorizerExample")
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)

    // $example on$
    val df = sqlContext.createDataFrame(Seq(
      (0, Array("a", "b", "c")),
      (1, Array("a", "b", "b", "c", "a"))
    )).toDF("id", "words")

    // fit a CountVectorizerModel from the corpus
    val cvModel: CountVectorizerModel = new CountVectorizer()
      .setInputCol("words")
      .setOutputCol("features")
      .setVocabSize(3)
      .setMinDF(2)
      .fit(df)

    // alternatively, define CountVectorizerModel with a-priori vocabulary
    val cvm = new CountVectorizerModel(Array("a", "b", "c"))
      .setInputCol("words")
      .setOutputCol("features")

    cvModel.transform(df).select("features").show()
    // $example off$

    sc.stop()
  }
}
// scalastyle:on println 
Example 13
package com.chapter11.SparkMachineLearning
import org.apache.spark.sql.SparkSession
import org.apache.spark.ml.feature.{ CountVectorizer, CountVectorizerModel }

object CountVectorizerDemo {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .master("local[*]")
      .config("spark.sql.warehouse.dir", "E:/Exp/")
      .appName("CountVectorizerDemo") // the original listing reused "OneVsRestExample" here
      .getOrCreate()

    val df = spark.createDataFrame(
      Seq((0, Array("Jason", "David")),
        (1, Array("David", "Martin")),
        (2, Array("Martin", "Jason")),
        (3, Array("Jason", "Daiel")),
        (4, Array("Daiel", "Martin")),
        (5, Array("Moahmed", "Jason")),
        (6, Array("David", "David")),
        (7, Array("Jason", "Martin")))).toDF("id", "name")

    df.show(false)

    // fit a CountVectorizerModel from the corpus
    val cvModel: CountVectorizerModel = new CountVectorizer()
      .setInputCol("name")
      .setOutputCol("features")
      .setVocabSize(3)
      .setMinDF(2)
      .fit(df)
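    // vocabSize(3) keeps only the three most frequent names, and minDF(2)
    // drops "Moahmed", which appears in just one row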

    val feature = cvModel.transform(df)
    feature.show(false)

    spark.stop()
  }
} 
Example 14
Source File: CountVectorizerSuite.scala    From aardpfark   with Apache License 2.0
package com.ibm.aardpfark.spark.ml.feature

import com.ibm.aardpfark.pfa.{Result, SparkFeaturePFASuiteBase}
import org.apache.spark.ml.feature.CountVectorizer

class CountVectorizerSuite extends SparkFeaturePFASuiteBase[CountVectorizerResult] {

  val df = spark.createDataFrame(Seq(
    (0, Array("a", "b", "c", "d", "e", "f")),
    (1, Array("a", "b", "b", "c", "a"))
  )).toDF("id", "words")

  val cv = new CountVectorizer()
    .setInputCol("words")
    .setOutputCol("features")

  override val sparkTransformer = cv.fit(df)

  val result = sparkTransformer.transform(df)
  override val input = result.select(cv.getInputCol).toJSON.collect()
  override val expectedOutput = withColumnAsArray(result, cv.getOutputCol).toJSON.collect()

  // Additional test for MinTF
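  // MinTF below 1.0 is treated as a fraction of each document's token count,
  // while values >= 1.0 are absolute counts: 0.3 here means a term must make
  // up at least 30% of a row's tokens; 2.0 below means at least two occurrences.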
  test("CountVectorizer with MinTF = 0.3") {
    val cv = new CountVectorizer()
      .setInputCol("words")
      .setOutputCol("features")
      .setMinTF(0.3)

    val sparkTransformer = cv.fit(df)

    val result = sparkTransformer.transform(df)
    val input = result.select(cv.getInputCol).toJSON.collect()
    val expectedOutput = withColumnAsArray(result, cv.getOutputCol).toJSON.collect()

    parityTest(sparkTransformer, input, expectedOutput)
  }

  test("CountVectorizer with MinTF = 2.0") {
    val cv = new CountVectorizer()
      .setInputCol("words")
      .setOutputCol("features")
      .setMinTF(2.0)

    val sparkTransformer = cv.fit(df)

    val result = sparkTransformer.transform(df)
    val input = result.select(cv.getInputCol).toJSON.collect()
    val expectedOutput = withColumnAsArray(result, cv.getOutputCol).toJSON.collect()

    parityTest(sparkTransformer, input, expectedOutput)
  }

  // Additional test for binary
  test("CountVectorizer with binary") {
    val cv = new CountVectorizer()
      .setInputCol("words")
      .setOutputCol("features")
      .setBinary(true)

    val sparkTransformer = cv.fit(df)

    val result = sparkTransformer.transform(df)
    val input = result.select(cv.getInputCol).toJSON.collect()
    val expectedOutput = withColumnAsArray(result, cv.getOutputCol).toJSON.collect()

    parityTest(sparkTransformer, input, expectedOutput)
  }

}

case class CountVectorizerResult(features: Seq[Double]) extends Result