org.apache.spark.ml.feature.StopWordsRemover Scala Examples

The following examples show how to use org.apache.spark.ml.feature.StopWordsRemover in real open-source projects. Each listing notes the source file, project, and license it was taken from.
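Before the project examples, here is a minimal, self-contained sketch of the core API: the default English stop-word list, a custom word list, and case sensitivity. The object and column names here are illustrative only.

import org.apache.spark.ml.feature.StopWordsRemover
import org.apache.spark.sql.SparkSession

object StopWordsQuickStart {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder.appName("StopWordsQuickStart").getOrCreate()

    // Default behaviour: case-insensitive filtering against the built-in English list
    val remover = new StopWordsRemover()
      .setInputCol("tokens")
      .setOutputCol("cleaned")

    // The default list can be loaded directly and extended with custom words
    val custom = StopWordsRemover.loadDefaultStopWords("english") ++ Array("spark")
    val customRemover = new StopWordsRemover()
      .setStopWords(custom)
      .setCaseSensitive(false)
      .setInputCol("tokens")
      .setOutputCol("cleaned")

    val df = spark.createDataFrame(Seq(
      (0, Seq("Spark", "removes", "the", "stop", "words"))
    )).toDF("id", "tokens")

    remover.transform(df).show(false)       // keeps "Spark"
    customRemover.transform(df).show(false) // also drops "Spark" (case-insensitive)
    spark.stop()
  }
}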
Example 1
Source File: StopWordsRemoverExample.scala    From multi-tenancy-spark   with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.feature.StopWordsRemover
// $example off$
import org.apache.spark.sql.SparkSession

object StopWordsRemoverExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .appName("StopWordsRemoverExample")
      .getOrCreate()

    // $example on$
    val remover = new StopWordsRemover()
      .setInputCol("raw")
      .setOutputCol("filtered")

    val dataSet = spark.createDataFrame(Seq(
      (0, Seq("I", "saw", "the", "red", "balloon")),
      (1, Seq("Mary", "had", "a", "little", "lamb"))
    )).toDF("id", "raw")

    remover.transform(dataSet).show(false)
    // $example off$

    spark.stop()
  }
}
// scalastyle:on println 
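For reference, with Spark's default English stop-word list the show(false) call above should print something like:

+---+----------------------------+--------------------+
|id |raw                         |filtered            |
+---+----------------------------+--------------------+
|0  |[I, saw, the, red, balloon] |[saw, red, balloon] |
|1  |[Mary, had, a, little, lamb]|[Mary, little, lamb]|
+---+----------------------------+--------------------+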
Example 2
package spark.ml.cookbook.chapter12

import org.apache.log4j.{Level, Logger}
import org.apache.spark.ml.feature.{RegexTokenizer, StopWordsRemover, Word2Vec}
import org.apache.spark.sql.SparkSession

object ProcessWord2Vec20 {

  def main(args: Array[String]) {

    val input = "../data/sparkml2/chapter12/pg62.txt"

    val spark = SparkSession
      .builder
      .master("local[*]")
      .appName("Process Word2Vec  App")
      .config("spark.sql.warehouse.dir", ".")
      .getOrCreate()

    //import spark.implicits._

    Logger.getRootLogger.setLevel(Level.WARN)

    val df = spark.read.text(input).toDF("text")

    val tokenizer = new RegexTokenizer()
      .setPattern("\\W+")
      .setToLowercase(true)
      .setMinTokenLength(4)
      .setInputCol("text")
      .setOutputCol("raw")
    val rawWords = tokenizer.transform(df)

    val stopWords = new StopWordsRemover()
      .setInputCol("raw")
      .setOutputCol("terms")
      .setCaseSensitive(false)

    val wordTerms = stopWords.transform(rawWords)

    wordTerms.show(false)

    val word2Vec = new Word2Vec()
      .setInputCol("terms")
      .setOutputCol("result")
      .setVectorSize(3)
      .setMinCount(0)
    val model = word2Vec.fit(wordTerms)

    val synonyms = model.findSynonyms("martian", 10)

    synonyms.show(false)

    spark.stop()
  }
} 
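The three stages above can also be wired together with Spark's Pipeline API instead of calling transform and fit by hand. A brief sketch reusing the example's stage and DataFrame names:

import org.apache.spark.ml.Pipeline

// tokenizer and stopWords are plain Transformers; word2Vec is an Estimator,
// so fit() trains it on the output of the earlier stages.
val pipeline = new Pipeline().setStages(Array(tokenizer, stopWords, word2Vec))
val pipelineModel = pipeline.fit(df)
pipelineModel.transform(df).select("terms", "result").show(false)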
Example 3
Source File: StopWordsRemoverSuite.scala    From aardpfark   with Apache License 2.0
package com.ibm.aardpfark.spark.ml.feature

import com.ibm.aardpfark.pfa.{Result, SparkFeaturePFASuiteBase}
import org.apache.spark.ml.feature.StopWordsRemover

class StopWordsRemoverSuite extends SparkFeaturePFASuiteBase[StopWordsResult] {

  val remover = new StopWordsRemover()
    .setInputCol("raw")
    .setOutputCol("filtered")

  val dataset = spark.createDataFrame(Seq(
    (0, Seq("I", "saw", "the", "red", "balloon")),
    (1, Seq("Mary", "had", "a", "little", "lamb")),
    (2, Seq("The", "the"))
  )).toDF("id", "raw")

  override val sparkTransformer = remover

  val result = sparkTransformer.transform(dataset)
  override val input = result.select(remover.getInputCol).toJSON.collect()
  override val expectedOutput = result.select(remover.getOutputCol).toJSON.collect()

  test("StopWordsRemover case sensitive") {
    val transformer = remover.setCaseSensitive(true)
    val result = transformer.transform(dataset)
    val input = result.select(remover.getInputCol).toJSON.collect()
    val expectedOutput = result.select(remover.getOutputCol).toJSON.collect()

    parityTest(transformer, input, expectedOutput)
  }
}

case class StopWordsResult(filtered: Seq[String]) extends Result 
Example 4
Source File: StopWordsRemover.scala    From aardpfark   with Apache License 2.0
package com.ibm.aardpfark.spark.ml.feature

import com.ibm.aardpfark.pfa.document.{Cell, PFABuilder, PFADocument}
import com.ibm.aardpfark.pfa.expression.PFAExpression
import com.ibm.aardpfark.pfa.types.WithSchema
import com.ibm.aardpfark.spark.ml.PFAModel
import com.sksamuel.avro4s.{AvroNamespace, AvroSchema}
import org.apache.avro.{Schema, SchemaBuilder}
import org.apache.spark.ml.feature.StopWordsRemover

@AvroNamespace("com.ibm.aardpfark.exec.spark.spark.ml.feature")
case class StopWords(words: Seq[String]) extends WithSchema {
  def schema = AvroSchema[this.type]
}

class PFAStopWordsRemover(override val sparkTransformer: StopWordsRemover) extends PFAModel[StopWords] {
  import com.ibm.aardpfark.pfa.dsl._

  private val inputCol = sparkTransformer.getInputCol
  private val outputCol = sparkTransformer.getOutputCol
  private val inputExpr = StringExpr(s"input.${inputCol}")

  private val stopWords = sparkTransformer.getStopWords
  private val caseSensitive = sparkTransformer.getCaseSensitive

  private def filterFn = FunctionDef[String, Boolean]("word") { w =>
    Seq(core.not(a.contains(wordsRef, if (caseSensitive) w else s.lower(w))))
  }

  override def inputSchema: Schema = {
    SchemaBuilder.record(withUid(inputBaseName)).fields()
      .name(inputCol).`type`().array().items().stringType().noDefault()
      .endRecord()
  }

  override def outputSchema: Schema = {
    SchemaBuilder.record(withUid(outputBaseName)).fields()
      .name(outputCol).`type`().array().items().stringType().noDefault()
      .endRecord()
  }

  override protected def cell = {
    Cell(StopWords(stopWords))
  }

  private val wordsRef = modelCell.ref("words")

  override def action: PFAExpression = {
    NewRecord(outputSchema, Map(outputCol -> a.filter(inputExpr, filterFn)))
  }

  override def pfa: PFADocument =
    PFABuilder()
      .withName(sparkTransformer.uid)
      .withMetadata(getMetadata)
      .withInput(inputSchema)
      .withOutput(outputSchema)
      .withCell(modelCell)
      .withAction(action)
      .pfa
} 
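For readers unfamiliar with PFA, the predicate emitted by filterFn above has simple Scala semantics. A rough plain-Scala equivalent (a sketch, not part of aardpfark) is shown below; note that it lower-cases only the incoming token, exactly as the PFA expression does, which works because the default stop-word lists are already lower-cased.

// Mirrors filterFn: drop a token when the stop-word list contains it,
// lower-casing the token first if the remover is case-insensitive.
def keepWord(stopWords: Seq[String], caseSensitive: Boolean)(word: String): Boolean =
  !stopWords.contains(if (caseSensitive) word else word.toLowerCase)

// usage: tokens.filter(keepWord(stopWords, caseSensitive))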
Example 5
package com.chapter11.SparkMachineLearning

import org.apache.spark.ml.feature.{ RegexTokenizer, Tokenizer }
import org.apache.spark.sql.functions._
import org.apache.spark.sql.SparkSession
import org.apache.spark.ml.feature.StopWordsRemover

object StopWordsRemoverExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .master("local[*]")
      .config("spark.sql.warehouse.dir", "E:/Exp/")
      .appName(s"OneVsRestExample")
      .getOrCreate()

    val sentence = spark.createDataFrame(Seq(
      (0, "Tokenization,is the process of extracting words,from the raw text"),
      (1, " If you want,to have more advanced tokenization,RegexTokenizer,is a good option"),
      (2, " Here,we will provide a sample example on how to tokenize sentences"),
      (3, "This way,you can find all matching occurrences"))).toDF("id", "sentence")

    val regexTokenizer = new RegexTokenizer()
      .setInputCol("sentence")
      .setOutputCol("words")
      .setPattern("\\W+")
      .setGaps(true)

    val countTokens = udf { (words: Seq[String]) => words.length }
    val regexTokenized = regexTokenizer.transform(sentence)

    val remover = new StopWordsRemover()
      .setInputCol("words")
      .setOutputCol("filtered")

    val newDF = remover.transform(regexTokenized)
    // Show the filtered tokens along with a count of the tokens that survived removal
    newDF.select("id", "filtered")
      .withColumn("tokens", countTokens(col("filtered")))
      .show(false)

    spark.stop()
  }
} 
Example 6
Source File: StopWordsRemoverExample.scala    From BigDatalog   with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.feature.StopWordsRemover
// $example off$
import org.apache.spark.sql.SQLContext
import org.apache.spark.{SparkConf, SparkContext}

object StopWordsRemoverExample {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("StopWordsRemoverExample")
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)

    // $example on$
    val remover = new StopWordsRemover()
      .setInputCol("raw")
      .setOutputCol("filtered")

    val dataSet = sqlContext.createDataFrame(Seq(
      (0, Seq("I", "saw", "the", "red", "baloon")),
      (1, Seq("Mary", "had", "a", "little", "lamb"))
    )).toDF("id", "raw")

    remover.transform(dataSet).show()
    // $example off$
    sc.stop()
  }
}
// scalastyle:on println 
Example 7
Source File: StopWordsRemoverExample.scala    From Spark-2.3.1   with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.feature.StopWordsRemover
// $example off$
import org.apache.spark.sql.SparkSession

object StopWordsRemoverExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .appName("StopWordsRemoverExample")
      .getOrCreate()

    // $example on$
    val remover = new StopWordsRemover()
      .setInputCol("raw")
      .setOutputCol("filtered")

    val dataSet = spark.createDataFrame(Seq(
      (0, Seq("I", "saw", "the", "red", "balloon")),
      (1, Seq("Mary", "had", "a", "little", "lamb"))
    )).toDF("id", "raw")

    remover.transform(dataSet).show(false)
    // $example off$

    spark.stop()
  }
}
// scalastyle:on println 
Example 8
Source File: StopWordsRemoverExample.scala    From spark1.52   with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.feature.StopWordsRemover
// $example off$
import org.apache.spark.sql.Row
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.sql.types.StringType
import org.apache.spark.sql.{SQLContext, DataFrame}

object StopWordsRemoverExample {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("StopWordsRemoverExample")
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)

    // $example on$
    val remover = new StopWordsRemover()
      .setInputCol("raw")
      .setOutputCol("filtered")

    val dataSet = sqlContext.createDataFrame(Seq(
      (0, Seq("I", "saw", "the", "red", "balloon")),
      (1, Seq("Mary", "had", "a", "little", "lamb"))
    )).toDF("id", "raw")

    // transform() applies the algorithm, converting the input DataFrame into a new DataFrame
    remover.transform(dataSet).show()
    // $example off$

    sc.stop()
  }
}
// scalastyle:on println 
Example 9
Source File: OpStopWordsRemoverTest.scala    From TransmogrifAI   with BSD 3-Clause "New" or "Revised" License
package com.salesforce.op.stages.impl.feature

import com.salesforce.op._
import com.salesforce.op.features.types._
import com.salesforce.op.utils.spark.RichDataset._
import com.salesforce.op.test.{SwTransformerSpec, TestFeatureBuilder}
import org.apache.spark.ml.feature.StopWordsRemover
import org.junit.runner.RunWith
import org.scalatest.junit.JUnitRunner


@RunWith(classOf[JUnitRunner])
class OpStopWordsRemoverTest extends SwTransformerSpec[TextList, StopWordsRemover, OpStopWordsRemover] {
  val data = Seq(
    "I AM groot", "Groot call me human", "or I will crush you"
  ).map(_.split(" ").toSeq.toTextList)

  val (inputData, textListFeature) = TestFeatureBuilder(data)

  val bigrams = textListFeature.removeStopWords()
  val transformer = bigrams.originStage.asInstanceOf[OpStopWordsRemover]

  val expectedResult = Seq(Seq("groot"), Seq("Groot", "call", "human"), Seq("crush")).map(_.toTextList)

  it should "allow case sensitivity" in {
    val noStopWords = textListFeature.removeStopWords(caseSensitive = true)
    val res = noStopWords.originStage.asInstanceOf[OpStopWordsRemover].transform(inputData)
    res.collect(noStopWords) shouldBe Seq(
      Seq("I", "AM", "groot"), Seq("Groot", "call", "human"), Seq("I", "crush")).map(_.toTextList)
  }

  it should "set custom stop words" in {
    val noStopWords = textListFeature.removeStopWords(stopWords = Array("Groot", "I"))
    val res = noStopWords.originStage.asInstanceOf[OpStopWordsRemover].transform(inputData)
    res.collect(noStopWords) shouldBe Seq(
      Seq("AM"), Seq("call", "me", "human"), Seq("or", "will", "crush", "you")).map(_.toTextList)
  }
} 
Example 10
Source File: OpTransformerWrapperTest.scala    From TransmogrifAI   with BSD 3-Clause "New" or "Revised" License
package com.salesforce.op.stages.sparkwrappers.specific

import com.salesforce.op.features.types._
import com.salesforce.op.test.{TestFeatureBuilder, TestSparkContext}
import com.salesforce.op.utils.spark.RichDataset._
import org.apache.spark.ml.feature.{Normalizer, StopWordsRemover}
import org.apache.spark.ml.linalg.{Vector, Vectors}
import org.apache.spark.sql._
import org.apache.spark.sql.functions._
import org.junit.runner.RunWith
import org.scalatest.FlatSpec
import org.scalatest.junit.JUnitRunner

@RunWith(classOf[JUnitRunner])
class OpTransformerWrapperTest extends FlatSpec with TestSparkContext {

  val (testData, featureVector) = TestFeatureBuilder(
    Seq[MultiPickList](
      Set("I", "saw", "the", "red", "balloon").toMultiPickList,
      Set("Mary", "had", "a", "little", "lamb").toMultiPickList
    )
  )

  val (testDataNorm, _, _) = TestFeatureBuilder("label", "features",
    Seq[(Real, OPVector)](
      0.0.toReal -> Vectors.dense(1.0, 0.5, -1.0).toOPVector,
      1.0.toReal -> Vectors.dense(2.0, 1.0, 1.0).toOPVector,
      2.0.toReal -> Vectors.dense(4.0, 10.0, 2.0).toOPVector
    )
  )
  val (targetDataNorm, targetLabelNorm, featureVectorNorm) = TestFeatureBuilder("label", "features",
    Seq[(Real, OPVector)](
      0.0.toReal -> Vectors.dense(0.4, 0.2, -0.4).toOPVector,
      1.0.toReal -> Vectors.dense(0.5, 0.25, 0.25).toOPVector,
      2.0.toReal -> Vectors.dense(0.25, 0.625, 0.125).toOPVector
    )
  )

  Spec[OpTransformerWrapper[_, _, _]] should "remove stop words with caseSensitivity=true" in {
    val remover = new StopWordsRemover().setCaseSensitive(true)
    val swFilter =
      new OpTransformerWrapper[MultiPickList, MultiPickList, StopWordsRemover](remover).setInput(featureVector)
    val output = swFilter.transform(testData)

    output.collect(swFilter.getOutput()) shouldBe Array(
      Seq("I", "saw", "red", "balloon").toMultiPickList,
      Seq("Mary", "little", "lamb").toMultiPickList
    )
  }

  it should "should properly normalize each feature vector instance with non-default norm of 1" in {
    val baseNormalizer = new Normalizer().setP(1.0)
    val normalizer =
      new OpTransformerWrapper[OPVector, OPVector, Normalizer](baseNormalizer).setInput(featureVectorNorm)
    val output = normalizer.transform(testDataNorm)

    val sumSqDist = validateDataframeDoubleColumn(output, normalizer.getOutput().name, targetDataNorm, "features")
    assert(sumSqDist <= 1E-6, "==> the sum of squared distances between actual and expected should be below tolerance.")
  }

  def validateDataframeDoubleColumn(
    normalizedFeatureDF: DataFrame, normalizedFeatureName: String, targetFeatureDF: DataFrame, targetColumnName: String
  ): Double = {
    val sqDistUdf = udf { (leftColVec: Vector, rightColVec: Vector) => Vectors.sqdist(leftColVec, rightColVec) }

    val targetColRename = "targetFeatures"
    val renamedTargedDF = targetFeatureDF.withColumnRenamed(targetColumnName, targetColRename)
    val joinedDF = normalizedFeatureDF.join(renamedTargedDF, Seq("label"))

    // compute sum of squared distances between expected and actual
    val finalDF = joinedDF.withColumn("sqDist", sqDistUdf(joinedDF(normalizedFeatureName), joinedDF(targetColRename)))
    val sumSqDist: Double = finalDF.agg(sum(finalDF("sqDist"))).first().getDouble(0)
    sumSqDist
  }
} 
Example 11
Source File: StopWordsRemoverExample.scala    From drizzle-spark   with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.feature.StopWordsRemover
// $example off$
import org.apache.spark.sql.SparkSession

object StopWordsRemoverExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .appName("StopWordsRemoverExample")
      .getOrCreate()

    // $example on$
    val remover = new StopWordsRemover()
      .setInputCol("raw")
      .setOutputCol("filtered")

    val dataSet = spark.createDataFrame(Seq(
      (0, Seq("I", "saw", "the", "red", "balloon")),
      (1, Seq("Mary", "had", "a", "little", "lamb"))
    )).toDF("id", "raw")

    remover.transform(dataSet).show(false)
    // $example off$

    spark.stop()
  }
}
// scalastyle:on println 
Example 12
Source File: Preprocessor.scala    From CkoocNLP   with Apache License 2.0
package functions

import config.paramconf.PreprocessParams
import functions.clean.Cleaner
import functions.segment.Segmenter
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.feature.{CountVectorizer, IDF, StopWordsRemover, StringIndexer}
import org.apache.spark.sql.DataFrame

// NOTE: the enclosing declaration was dropped from the original listing;
// an `object Preprocessor` wrapper is assumed here so the snippet compiles.
object Preprocessor {

  def preprocess(data: DataFrame): Pipeline = {
    val spark = data.sparkSession
    val params = new PreprocessParams

    val indexModel = new StringIndexer()
      .setHandleInvalid(params.handleInvalid)
      .setInputCol("label")
      .setOutputCol("indexedLabel")
      .fit(data)

    val cleaner = new Cleaner()
      .setFanJian(params.fanjian)
      .setQuanBan(params.quanban)
      .setMinLineLen(params.minLineLen)
      .setInputCol("content")
      .setOutputCol("cleand")

    val segmenter = new Segmenter()
      .isAddNature(params.addNature)
      .isDelEn(params.delEn)
      .isDelNum(params.delNum)
      .isNatureFilter(params.natureFilter)
      .setMinTermLen(params.minTermLen)
      .setMinTermNum(params.minTermNum)
      .setSegType(params.segmentType)
      .setInputCol(cleaner.getOutputCol)
      .setOutputCol("segmented")

    val stopwords = spark.sparkContext.textFile(params.stopwordFilePath).collect()
    val remover = new StopWordsRemover()
      .setStopWords(stopwords)
      .setInputCol(segmenter.getOutputCol)
      .setOutputCol("removed")

    val vectorizer = new CountVectorizer()
      .setMinTF(params.minTF)
      .setVocabSize(params.vocabSize)
      .setInputCol(remover.getOutputCol)
      .setOutputCol("vectorized")

    val idf = new IDF()
      .setMinDocFreq(params.minDocFreq)
      .setInputCol(vectorizer.getOutputCol)
      .setOutputCol("features")

    val stages = Array(cleaner, indexModel, segmenter, remover, vectorizer, idf)
    new Pipeline().setStages(stages)
  }
} 
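A hypothetical call site for the method above (assuming the object wrapper noted in the listing, and a DataFrame with "label" and "content" columns, as the stage wiring implies):

// trainingData: DataFrame with "label" and "content" columns (assumed)
val pipeline = Preprocessor.preprocess(trainingData)
val model = pipeline.fit(trainingData)
val prepared = model.transform(trainingData).select("indexedLabel", "features")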
Example 13
Source File: StopWordsRemoverExample.scala    From sparkoscope   with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.feature.StopWordsRemover
// $example off$
import org.apache.spark.sql.SparkSession

object StopWordsRemoverExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .appName("StopWordsRemoverExample")
      .getOrCreate()

    // $example on$
    val remover = new StopWordsRemover()
      .setInputCol("raw")
      .setOutputCol("filtered")

    val dataSet = spark.createDataFrame(Seq(
      (0, Seq("I", "saw", "the", "red", "balloon")),
      (1, Seq("Mary", "had", "a", "little", "lamb"))
    )).toDF("id", "raw")

    remover.transform(dataSet).show(false)
    // $example off$

    spark.stop()
  }
}
// scalastyle:on println 
Example 14
Source File: LDAParitySpec.scala    From mleap   with Apache License 2.0
package org.apache.spark.ml.parity.clustering

import org.apache.spark.ml.{Pipeline, Transformer}
import org.apache.spark.ml.clustering.LDA
import org.apache.spark.ml.feature.{CountVectorizer, StopWordsRemover, Tokenizer}
import org.apache.spark.ml.linalg.Vector
import org.apache.spark.ml.parity.SparkParityBase
import org.apache.spark.sql.DataFrame
import org.scalatest.Ignore


@Ignore
class LDAParitySpec extends SparkParityBase {
  override val dataset: DataFrame = textDataset.select("text")

  val tokenizer = new Tokenizer().setInputCol("text").setOutputCol("words")

  val remover = new StopWordsRemover()
    .setInputCol(tokenizer.getOutputCol)
    .setOutputCol("words_filtered")

  val cv = new CountVectorizer().setInputCol("words_filtered").setOutputCol("features").setVocabSize(50000)

  val lda = new LDA().setK(5).setMaxIter(2)

  override val sparkTransformer: Transformer = new Pipeline().setStages(Array(tokenizer, remover, cv, lda)).fit(dataset)

  override def equalityTest(sparkDataset: DataFrame,
                            mleapDataset: DataFrame): Unit = {
    val sparkPredictionCol = sparkDataset.schema.fieldIndex("topicDistribution")
    val mleapPredictionCol = mleapDataset.schema.fieldIndex("topicDistribution")

    sparkDataset.collect().zip(mleapDataset.collect()).foreach {
      case (sv, mv) =>
        val sparkPrediction = sv.getAs[Vector](sparkPredictionCol)
        val mleapPrediction = mv.getAs[Vector](mleapPredictionCol)

        sparkPrediction.toArray.zip(mleapPrediction.toArray).foreach {
          case (s, m) => assert(Math.abs(m - s) < 0.001)
        }
    }
  }
} 
Example 15
Source File: StopWordsRemoverParitySpec.scala    From mleap   with Apache License 2.0
package org.apache.spark.ml.parity.feature

import org.apache.spark.ml.parity.SparkParityBase
import org.apache.spark.ml.{Pipeline, Transformer}
import org.apache.spark.ml.feature.{StopWordsRemover, Tokenizer}
import org.apache.spark.sql.DataFrame


class StopWordsRemoverParitySpec extends SparkParityBase {
  override val dataset: DataFrame = baseDataset.select("loan_title")
  override val sparkTransformer: Transformer = new Pipeline().setStages(Array(new Tokenizer().
    setInputCol("loan_title").
    setOutputCol("loan_title_tokens"),
    new StopWordsRemover().
      setInputCol("loan_title_tokens").
      setOutputCol("loan_title_stop").
      setStopWords(Array("loan")))).fit(dataset)
} 
Example 16
Source File: StopWordsRemoverOp.scala    From mleap   with Apache License 2.0
package org.apache.spark.ml.bundle.ops.feature

import ml.combust.bundle.BundleContext
import ml.combust.bundle.dsl._
import ml.combust.bundle.op.{OpModel, OpNode}
import org.apache.spark.ml.bundle.{ParamSpec, SimpleParamSpec, SimpleSparkOp, SparkBundleContext}
import org.apache.spark.ml.feature.StopWordsRemover


class StopWordsRemoverOp extends SimpleSparkOp[StopWordsRemover] {
  override val Model: OpModel[SparkBundleContext, StopWordsRemover] = new OpModel[SparkBundleContext, StopWordsRemover] {
    override val klazz: Class[StopWordsRemover] = classOf[StopWordsRemover]

    override def opName: String = Bundle.BuiltinOps.feature.stopwords_remover

    override def store(model: Model, obj: StopWordsRemover)
                      (implicit context: BundleContext[SparkBundleContext]): Model = {
      model.withValue("stop_words", Value.stringList(obj.getStopWords)).
        withValue("case_sensitive", Value.boolean(obj.getCaseSensitive))
    }

    override def load(model: Model)
                     (implicit context: BundleContext[SparkBundleContext]): StopWordsRemover = {
      new StopWordsRemover(uid = "").setStopWords(model.value("stop_words").getStringList.toArray).
        setCaseSensitive(model.value("case_sensitive").getBoolean)
    }

  }

  override def sparkLoad(uid: String, shape: NodeShape, model: StopWordsRemover): StopWordsRemover = {
    new StopWordsRemover(uid = uid).setStopWords(model.getStopWords).setCaseSensitive(model.getCaseSensitive)
  }

  override def sparkInputs(obj: StopWordsRemover): Seq[ParamSpec] = {
    Seq("input" -> obj.inputCol)
  }

  override def sparkOutputs(obj: StopWordsRemover): Seq[SimpleParamSpec] = {
    Seq("output" -> obj.outputCol)
  }
} 
Example 17
Source File: StopWordsRemoverWrapper.scala    From automl   with Apache License 2.0
package com.tencent.angel.spark.automl.feature.preprocess

import com.tencent.angel.spark.automl.feature.InToOutRelation.{InToOutRelation, OneToOne}
import com.tencent.angel.spark.automl.feature.TransformerWrapper
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.feature.StopWordsRemover

class StopWordsRemoverWrapper extends TransformerWrapper {

  override val transformer: Transformer = new StopWordsRemover()
  override var parent: TransformerWrapper = _

  override val hasMultiInputs: Boolean = false
  override val hasMultiOutputs: Boolean = false
  override val needAncestorInputs: Boolean = false

  override val relation: InToOutRelation = OneToOne

  override val requiredInputCols: Array[String] = Array("words")
  override val requiredOutputCols: Array[String] = Array("stopwords")

  override def declareInAndOut(): this.type = {
    transformer.asInstanceOf[StopWordsRemover].setInputCol(getInputCols(0))
    transformer.asInstanceOf[StopWordsRemover].setOutputCol(getOutputCols(0))
    this
  }

} 
Example 18
Source File: Components.scala    From automl   with Apache License 2.0
package com.tencent.angel.spark.automl.feature.preprocess

import org.apache.spark.ml.PipelineStage
import org.apache.spark.ml.feature.{StopWordsRemover, Tokenizer}
import org.apache.spark.sql.DataFrame

import scala.collection.mutable.ArrayBuffer

object Components {

  def sample(data: DataFrame,
             fraction: Double): DataFrame = {
    data.sample(false, fraction)
  }

  def addSampler(components: ArrayBuffer[PipelineStage],
                 inputCol: String,
                 fraction: Double): Unit = {
    val sampler = new Sampler(fraction)
      .setInputCol(inputCol)
    components += sampler
  }

  def addTokenizer(components: ArrayBuffer[PipelineStage],
                   inputCol: String,
                   outputCol: String): Unit = {
    val tokenizer = new Tokenizer()
      .setInputCol(inputCol)
      .setOutputCol(outputCol)
    components += tokenizer
  }

  def addStopWordsRemover(components: ArrayBuffer[PipelineStage],
                          inputCol: String,
                          outputCol: String): Unit = {
    val remover = new StopWordsRemover()
      .setInputCol(inputCol)
      .setOutputCol(outputCol)
    components += remover
  }

} 
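A hypothetical assembly using the helpers above (column names are illustrative):

import org.apache.spark.ml.{Pipeline, PipelineStage}
import scala.collection.mutable.ArrayBuffer

val stages = new ArrayBuffer[PipelineStage]()
Components.addTokenizer(stages, inputCol = "sentence", outputCol = "words")
Components.addStopWordsRemover(stages, inputCol = "words", outputCol = "filtered")
// fit() then runs the tokenizer and the remover in sequence
val pipeline = new Pipeline().setStages(stages.toArray)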
Example 19
Source File: LocalStopWordsRemover.scala    From spark-ml-serving   with Apache License 2.0
package io.hydrosphere.spark_ml_serving.preprocessors

import io.hydrosphere.spark_ml_serving.TypedTransformerConverter
import io.hydrosphere.spark_ml_serving.common._
import org.apache.spark.ml.feature.StopWordsRemover

class LocalStopWordsRemover(override val sparkTransformer: StopWordsRemover)
  extends LocalTransformer[StopWordsRemover] {
  override def transform(localData: LocalData): LocalData = {
    val stopWordsSet   = sparkTransformer.getStopWords
    val toLower        = (s: String) => if (s != null) s.toLowerCase else s
    val lowerStopWords = stopWordsSet.map(toLower(_)).toSet

    localData.column(sparkTransformer.getInputCol) match {
      case Some(column) =>
        val newData = column.data.map(r => {
          if (sparkTransformer.getCaseSensitive) {
            r.asInstanceOf[Seq[String]].filter(s => !stopWordsSet.contains(s))
          } else {
            r.asInstanceOf[Seq[String]].filter(s => !lowerStopWords.contains(toLower(s)))
          }
        })
        localData.withColumn(LocalDataColumn(sparkTransformer.getOutputCol, newData))
      case None => localData
    }
  }
}

object LocalStopWordsRemover
  extends SimpleModelLoader[StopWordsRemover]
  with TypedTransformerConverter[StopWordsRemover] {

  override def build(metadata: Metadata, data: LocalData): StopWordsRemover = {
    new StopWordsRemover(metadata.uid)
      .setInputCol(metadata.paramMap("inputCol").asInstanceOf[String])
      .setOutputCol(metadata.paramMap("outputCol").asInstanceOf[String])
      .setCaseSensitive(metadata.paramMap("caseSensitive").asInstanceOf[Boolean])
      .setStopWords(metadata.paramMap("stopWords").asInstanceOf[Seq[String]].toArray)
  }

  override implicit def toLocal(transformer: StopWordsRemover) =
    new LocalStopWordsRemover(transformer)
}