org.apache.spark.ml.feature.NGram Scala Examples

The following examples show how to use org.apache.spark.ml.feature.NGram. Each example is taken from an open-source project; the header above each one names its source file, project, and license.
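NGram is a Spark ML Transformer that turns a column of token arrays (typically produced by a Tokenizer) into a column of space-delimited n-grams; rows with fewer than n tokens produce an empty array. As a minimal sketch of the API (the DataFrame and column names here are illustrative):

val ngram = new NGram()
  .setN(2)                  // n-gram length; Spark's default is 2 (bigrams)
  .setInputCol("words")     // a column of Seq[String]
  .setOutputCol("ngrams")   // each n-gram is emitted as one space-joined string
val ngramDF = ngram.transform(tokenizedDF)
// Array("Hi", "I", "heard") => Array("Hi I", "I heard")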
Example 1
Source File: NGramExample.scala    From drizzle-spark   with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.feature.NGram
// $example off$
import org.apache.spark.sql.SparkSession

object NGramExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .appName("NGramExample")
      .getOrCreate()

    // $example on$
    val wordDataFrame = spark.createDataFrame(Seq(
      (0, Array("Hi", "I", "heard", "about", "Spark")),
      (1, Array("I", "wish", "Java", "could", "use", "case", "classes")),
      (2, Array("Logistic", "regression", "models", "are", "neat"))
    )).toDF("id", "words")

    val ngram = new NGram().setN(2).setInputCol("words").setOutputCol("ngrams")

    val ngramDataFrame = ngram.transform(wordDataFrame)
    ngramDataFrame.select("ngrams").show(false)
    // $example off$

    spark.stop()
  }
}
// scalastyle:on println 
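With n = 2, the show(false) call prints one array of bigrams per input row, along the lines of:

[Hi I, I heard, heard about, about Spark]
[I wish, wish Java, Java could, could use, use case, case classes]
[Logistic regression, regression models, models are, are neat]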
Example 2
Source File: LocalNGram.scala    From spark-ml-serving   with Apache License 2.0
package io.hydrosphere.spark_ml_serving.preprocessors

import io.hydrosphere.spark_ml_serving.TypedTransformerConverter
import io.hydrosphere.spark_ml_serving.common._
import org.apache.spark.ml.feature.NGram

class LocalNGram(override val sparkTransformer: NGram) extends LocalTransformer[NGram] {
  override def transform(localData: LocalData): LocalData = {
    localData.column(sparkTransformer.getInputCol) match {
      case Some(column) =>
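        // createTransformFunc is protected in Spark's NGram, but Scala compiles
        // protected members to public bytecode, so plain reflection can reach it.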
        val method = classOf[NGram].getMethod("createTransformFunc")
        val f      = method.invoke(sparkTransformer).asInstanceOf[Seq[String] => Seq[String]]
        val data = column.data.map(_.asInstanceOf[Seq[String]]).map { row =>
          f.apply(row).toList
        }
        localData.withColumn(LocalDataColumn(sparkTransformer.getOutputCol, data))
      case None => localData
    }
  }
}

object LocalNGram extends SimpleModelLoader[NGram] with TypedTransformerConverter[NGram] {

  override def build(metadata: Metadata, data: LocalData): NGram = {
    new NGram(metadata.uid)
      .setN(metadata.paramMap("n").asInstanceOf[Number].intValue())
      .setInputCol(metadata.paramMap("inputCol").asInstanceOf[String])
      .setOutputCol(metadata.paramMap("outputCol").asInstanceOf[String])
  }

  override implicit def toLocal(transformer: NGram): LocalNGram =
    new LocalNGram(transformer)
} 
Example 3
Source File: NGramOp.scala    From mleap   with Apache License 2.0
package org.apache.spark.ml.bundle.ops.feature

import ml.combust.bundle.BundleContext
import ml.combust.bundle.dsl._
import ml.combust.bundle.op.{OpModel, OpNode}
import org.apache.spark.ml.bundle.{ParamSpec, SimpleParamSpec, SimpleSparkOp, SparkBundleContext}
import org.apache.spark.ml.feature.NGram


class NGramOp extends SimpleSparkOp[NGram] {
  override val Model: OpModel[SparkBundleContext, NGram] = new OpModel[SparkBundleContext, NGram] {
    override val klazz: Class[NGram] = classOf[NGram]

    override def opName: String = Bundle.BuiltinOps.feature.ngram

    override def store(model: Model, obj: NGram)
                      (implicit context: BundleContext[SparkBundleContext]): Model = {
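      // Only the n parameter needs to be serialized here; the input and output
      // columns are wired up separately via sparkInputs/sparkOutputs below.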
      model.withValue("n", Value.long(obj.getN))
    }

    override def load(model: Model)
                     (implicit context: BundleContext[SparkBundleContext]): NGram = {
      new NGram(uid = "").setN(model.value("n").getLong.toInt)
    }

  }

  override def sparkLoad(uid: String, shape: NodeShape, model: NGram): NGram = {
    new NGram(uid = uid).setN(model.getN)
  }

  override def sparkInputs(obj: NGram): Seq[ParamSpec] = {
    Seq("input" -> obj.inputCol)
  }

  override def sparkOutputs(obj: NGram): Seq[SimpleParamSpec] = {
    Seq("output" -> obj.outputCol)
  }
} 
Example 4
Source File: NGramsParitySpec.scala    From mleap   with Apache License 2.0
package org.apache.spark.ml.parity.feature

import org.apache.spark.ml.parity.SparkParityBase
import org.apache.spark.ml.{Pipeline, Transformer}
import org.apache.spark.ml.feature.{NGram, Tokenizer}
import org.apache.spark.sql.DataFrame


class NGramsParitySpec extends SparkParityBase {
  override val dataset: DataFrame = baseDataset.select("loan_title")
  override val sparkTransformer: Transformer = new Pipeline().setStages(Array(
    new Tokenizer()
      .setInputCol("loan_title")
      .setOutputCol("loan_title_tokens"),
    new NGram()
      .setInputCol("loan_title_tokens")
      .setOutputCol("loan_title_ngram")
      .setN(3)
  )).fit(dataset)

} 
Example 5
Source File: NGramExample.scala    From sparkoscope   with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.feature.NGram
// $example off$
import org.apache.spark.sql.SparkSession

object NGramExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .appName("NGramExample")
      .getOrCreate()

    // $example on$
    val wordDataFrame = spark.createDataFrame(Seq(
      (0, Array("Hi", "I", "heard", "about", "Spark")),
      (1, Array("I", "wish", "Java", "could", "use", "case", "classes")),
      (2, Array("Logistic", "regression", "models", "are", "neat"))
    )).toDF("id", "words")

    val ngram = new NGram().setN(2).setInputCol("words").setOutputCol("ngrams")

    val ngramDataFrame = ngram.transform(wordDataFrame)
    ngramDataFrame.select("ngrams").show(false)
    // $example off$

    spark.stop()
  }
}
// scalastyle:on println 
Example 6
Source File: TextFeaturizerSpec.scala    From mmlspark   with MIT License
// Copyright (C) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See LICENSE in project root for information.

package com.microsoft.ml.spark.featurize.text

import com.microsoft.ml.spark.core.schema.DatasetExtensions._
import com.microsoft.ml.spark.core.test.fuzzing.{EstimatorFuzzing, TestObject}
import org.apache.spark.ml.feature.{NGram, Tokenizer}
import org.apache.spark.ml.util.MLReadable

class TextFeaturizerSpec extends EstimatorFuzzing[TextFeaturizer] {
  lazy val dfRaw = session
    .createDataFrame(Seq((0, "Hi I"),
                         (1, "I wish for snow today"),
                         (2, "we Cant go to the park, because of the snow!"),
                         (3, "")))
    .toDF("label", "sentence")
  lazy val dfTok = new Tokenizer()
    .setInputCol("sentence")
    .setOutputCol("tokens")
    .transform(dfRaw)
  lazy val dfNgram =
    new NGram().setInputCol("tokens").setOutputCol("ngrams").transform(dfTok)

  test("operate on sentences,tokens,or ngrams") {
    val tfRaw = new TextFeaturizer()
      .setInputCol("sentence")
      .setOutputCol("features")
      .setNumFeatures(20)
    val tfTok = new TextFeaturizer()
      .setUseTokenizer(false)
      .setInputCol("tokens")
      .setOutputCol("features")
      .setNumFeatures(20)
    val tfNgram = new TextFeaturizer()
      .setUseTokenizer(false)
      .setUseNGram(false)
      .setInputCol("ngrams")
      .setOutputCol("features")
      .setNumFeatures(20)

    val dfRaw2 = tfRaw.fit(dfRaw).transform(dfRaw)
    val dfTok2 = tfTok.fit(dfTok).transform(dfTok)
    val dfNgram2 = tfNgram.fit(dfNgram).transform(dfNgram)

    val linesRaw = dfRaw2.getSVCol("features")
    val linesTok = dfTok2.getSVCol("features")
    val linesNgram = dfNgram2.getSVCol("features")

    assert(linesRaw.length == 4)
    assert(linesTok.length == 4)
    assert(linesNgram.length == 4)
    assert(linesRaw(0)(0) == 0.9162907318741551)
    assert(linesTok(1)(9) == 0.5108256237659907)
    assert(linesNgram(2)(7) == 1.8325814637483102)
    assert(linesNgram(3)(1) == 0.0)
  }

  test("throw errors if the schema is incorrect") {
    val tfRaw = new TextFeaturizer()
      .setUseTokenizer(true)
      .setInputCol("sentence")
      .setOutputCol("features")
      .setNumFeatures(20)
    val tfTok = new TextFeaturizer()
      .setUseTokenizer(false)
      .setInputCol("tokens")
      .setOutputCol("features")
      .setNumFeatures(20)
    assertSparkException[IllegalArgumentException](tfRaw.setInputCol("tokens"),           dfTok)
    assertSparkException[IllegalArgumentException](tfRaw.setInputCol("ngrams"),           dfNgram)
    assertSparkException[IllegalArgumentException](tfTok.setInputCol("sentence"),         dfRaw)
    assertSparkException[IllegalArgumentException](tfRaw.setInputCol("tokens_incorrect"), dfTok)
    assertSparkException[IllegalArgumentException](tfRaw.setOutputCol("tokens"),          dfTok)
  }

  override def testObjects(): Seq[TestObject[TextFeaturizer]] =
    List(new TestObject(new TextFeaturizer().setInputCol("sentence"), dfRaw))

  override def reader: MLReadable[_] = TextFeaturizer
  override def modelReader: MLReadable[_] = TextFeaturizerModel
} 
Example 7
Source File: NGramSpec.scala    From mmlspark   with MIT License
// Copyright (C) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See LICENSE in project root for information.

package com.microsoft.ml.spark.core.ml

import com.microsoft.ml.spark.core.test.base.TestBase
import org.apache.spark.ml.feature.{NGram, Tokenizer}
import org.apache.spark.sql.DataFrame

import scala.collection.mutable

class NGramSpec extends TestBase {

  def ngramDFToScalaList(dataFrame: DataFrame, outputCol: String = "ngrams"): Array[List[Any]] = {
    dataFrame.select(dataFrame(outputCol)).collect()
      .map(_.getAs[Seq[Any]](0).toList)
  }

  test("operation on tokenized strings") {
    val wordDataFrame = session.createDataFrame(Seq((0, Array("Hi", "I", "can", "not", "foo")),
                                                    (1, Array("I")),
                                                    (2, Array("Logistic", "regression")),
                                                    (3, Array("Log", "f", "reg"))))
      .toDF("label", "words")

    val ngramDF = new NGram().setN(3)
      .setInputCol("words").setOutputCol("ngrams")
      .transform(wordDataFrame)
    val ngrams = ngramDFToScalaList(ngramDF)
    assert(ngrams(0) === Array("Hi I can", "I can not", "can not foo"))
    assert(ngrams(1) === Array())
    assert(ngrams(2) === Array())
    assert(ngrams(3) === Array("Log f reg"))
  }

  test("supporting several values for n") {
    val ns = 1 to 6
    val words = Array("Hi", "I", "can", "not", "foo", "bar", "foo", "afk")
    val wordDataFrame = session.createDataFrame(Seq((0, words))).toDF("label", "words")
    val nGramResults = ns.map { n =>
      ngramDFToScalaList(
        new NGram().setN(n)
          .setInputCol("words").setOutputCol("ngrams")
          .transform(wordDataFrame))
    }
    ns.foreach { n =>
      assert(nGramResults(n-1)(0).head === words.take(n).mkString(" "))
    }
  }

  test("handling empty strings gracefully") {
    val wordDataFrame = session.createDataFrame(Seq((0, "hey you no way"),
                                                    (1, "")))
      .toDF("label", "sentence")

    val tokenized = new Tokenizer().setInputCol("sentence").setOutputCol("tokens").transform(wordDataFrame)
    val ngrams = new NGram().setInputCol("tokens").setOutputCol("ngrams").transform(tokenized)
    assert(ngramDFToScalaList(ngrams)(1) === Nil)
  }

  test("raise an error when applied to a null array") {
    val tokenDataFrame = session.createDataFrame(Seq(
      (0, Some(Array("Hi", "I", "can", "not", "foo"))),
      (1, None))
    ).toDF("label", "tokens")
    assertSparkException[org.apache.spark.SparkException](new NGram().setInputCol("tokens"), tokenDataFrame)
  }

  test("raise an error when given strange values of n") {
    List(0, -1, -10).foreach { n =>
      intercept[IllegalArgumentException] { new NGram().setN(n) }
    }
  }

} 
Example 8
Source File: NGramExample.scala    From multi-tenancy-spark   with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.feature.NGram
// $example off$
import org.apache.spark.sql.SparkSession

object NGramExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .appName("NGramExample")
      .getOrCreate()

    // $example on$
    val wordDataFrame = spark.createDataFrame(Seq(
      (0, Array("Hi", "I", "heard", "about", "Spark")),
      (1, Array("I", "wish", "Java", "could", "use", "case", "classes")),
      (2, Array("Logistic", "regression", "models", "are", "neat"))
    )).toDF("id", "words")

    val ngram = new NGram().setN(2).setInputCol("words").setOutputCol("ngrams")

    val ngramDataFrame = ngram.transform(wordDataFrame)
    ngramDataFrame.select("ngrams").show(false)
    // $example off$

    spark.stop()
  }
}
// scalastyle:on println 
Example 9
Source File: NGramTransformer.scala    From seahorse   with Apache License 2.0
package ai.deepsense.deeplang.doperables.spark.wrappers.transformers

import org.apache.spark.ml.feature.NGram

import ai.deepsense.deeplang.doperables.SparkTransformerAsMultiColumnTransformer
import ai.deepsense.deeplang.params.Param
import ai.deepsense.deeplang.params.validators.RangeValidator
import ai.deepsense.deeplang.params.wrappers.spark.IntParamWrapper

class NGramTransformer extends SparkTransformerAsMultiColumnTransformer[NGram] {

  val n = new IntParamWrapper[NGram](
    name = "n",
    description = Some("The minimum n-gram length."),
    sparkParamGetter = _.n,
    validator = RangeValidator(begin = 1.0, end = Int.MaxValue, step = Some(1.0)))
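  // The wrapper validates the value against a numeric range, hence the Double
  // literal below for the default (2.0, i.e. bigrams, matching Spark's own default).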
  setDefault(n, 2.0)

  override protected def getSpecificParams: Array[Param[_]] = Array(n)

  def setN(value: Int): this.type = {
    set(n -> value)
  }
} 
Example 10
Source File: OpNGramTest.scala    From TransmogrifAI   with BSD 3-Clause "New" or "Revised" License
package com.salesforce.op.stages.impl.feature

import com.salesforce.op._
import com.salesforce.op.features.types._
import com.salesforce.op.stages.sparkwrappers.specific.OpTransformerWrapper
import com.salesforce.op.test.{SwTransformerSpec, TestFeatureBuilder}
import com.salesforce.op.utils.spark.RichDataset._
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.feature.NGram
import org.junit.runner.RunWith
import org.scalatest.junit.JUnitRunner


@RunWith(classOf[JUnitRunner])
class OpNGramTest extends SwTransformerSpec[TextList, NGram, OpNGram] {
  val data = Seq("a b c d e f g").map(_.split(" ").toSeq.toTextList)
  val (inputData, textListFeature) = TestFeatureBuilder(data)

  val expectedResult = Seq(Seq("a b", "b c", "c d", "d e", "e f", "f g").toTextList)

  val bigrams = textListFeature.ngram()
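  // ngram() without an argument keeps Spark's default n = 2, hence the bigram expectedResult above.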
  val transformer = bigrams.originStage.asInstanceOf[OpNGram]

  it should "generate unigrams" in {
    val unigrams = textListFeature.ngram(n = 1)
    val transformedData = unigrams.originStage.asInstanceOf[Transformer].transform(inputData)
    val results = transformedData.collect(unigrams)

    results(0) shouldBe data.head
  }

  it should "generate trigrams" in {
    val trigrams = textListFeature.ngram(n = 3)
    val transformedData = trigrams.originStage.asInstanceOf[Transformer].transform(inputData)
    val results = transformedData.collect(trigrams)

    results(0) shouldBe Seq("a b c", "b c d", "c d e", "d e f", "e f g").toTextList
  }

  it should "not allow n < 1" in {
    the[IllegalArgumentException] thrownBy textListFeature.ngram(n = 0)
    the[IllegalArgumentException] thrownBy textListFeature.ngram(n = -1)
  }

} 
Example 11
Source File: NGramExample.scala    From Spark-2.3.1   with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.feature.NGram
// $example off$
import org.apache.spark.sql.SparkSession

object NGramExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .appName("NGramExample")
      .getOrCreate()

    // $example on$
    val wordDataFrame = spark.createDataFrame(Seq(
      (0, Array("Hi", "I", "heard", "about", "Spark")),
      (1, Array("I", "wish", "Java", "could", "use", "case", "classes")),
      (2, Array("Logistic", "regression", "models", "are", "neat"))
    )).toDF("id", "words")

    val ngram = new NGram().setN(2).setInputCol("words").setOutputCol("ngrams")

    val ngramDataFrame = ngram.transform(wordDataFrame)
    ngramDataFrame.select("ngrams").show(false)
    // $example off$

    spark.stop()
  }
}
// scalastyle:on println 
Example 12
Source File: NGramExample.scala    From BigDatalog   with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.feature.NGram
// $example off$
import org.apache.spark.sql.SQLContext
import org.apache.spark.{SparkConf, SparkContext}

object NGramExample {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("NGramExample")
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)

    // $example on$
    val wordDataFrame = sqlContext.createDataFrame(Seq(
      (0, Array("Hi", "I", "heard", "about", "Spark")),
      (1, Array("I", "wish", "Java", "could", "use", "case", "classes")),
      (2, Array("Logistic", "regression", "models", "are", "neat"))
    )).toDF("label", "words")

    val ngram = new NGram().setInputCol("words").setOutputCol("ngrams")
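    // n is left at its default of 2, so this produces bigrams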
    val ngramDataFrame = ngram.transform(wordDataFrame)
    ngramDataFrame.take(3).map(_.getAs[Stream[String]]("ngrams").toList).foreach(println)
    // $example off$
    sc.stop()
  }
}
// scalastyle:on println 
Example 13
package org.googlielmo.sparknlpbench

import com.johnsnowlabs.nlp.annotator._
import com.johnsnowlabs.nlp.base._
import com.johnsnowlabs.util.Benchmark
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.feature.NGram
import org.apache.spark.sql.SparkSession

object TokenizerWithNGram {
  def main(args: Array[String]): Unit = {
    val sparkSession: SparkSession = SparkSession
      .builder()
      .appName("Tokenize with n-gram example")
      .master("local[*]")
      .config("spark.driver.memory", "1G")
      .config("spark.kryoserializer.buffer.max", "200M")
      .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
      .getOrCreate()

    import sparkSession.implicits._
    sparkSession.sparkContext.setLogLevel("WARN")
  
    val document = new DocumentAssembler()
      .setInputCol("text")
      .setOutputCol("document")
  
    val token = new Tokenizer()
      .setInputCols("document")
      .setOutputCol("token")
  
    val normalizer = new Normalizer()
      .setInputCols("token")
      .setOutputCol("normal")
  
    val finisher = new Finisher()
      .setInputCols("normal")
  
    val ngram = new NGram()
      .setN(3)
      .setInputCol("finished_normal")
      .setOutputCol("3-gram")
  
    val gramAssembler = new DocumentAssembler()
      .setInputCol("3-gram")
      .setOutputCol("3-grams")
  
    val pipeline = new Pipeline().setStages(Array(document, token, normalizer, finisher, ngram, gramAssembler))
  
    val testing = Seq(
      (1, "Packt is a famous publishing company"),
      (2, "Guglielmo is an author")
    ).toDS.toDF("_id", "text")
  
    val result = pipeline.fit(Seq.empty[String].toDS.toDF("text")).transform(testing)
    Benchmark.time("Time to convert and show") { result.show(truncate = false) }

    sparkSession.stop()
  }
} 
Example 14
Source File: NGram.scala    From aardpfark   with Apache License 2.0
package com.ibm.aardpfark.spark.ml.feature

import com.ibm.aardpfark.pfa.document.{PFABuilder, PFADocument}
import com.ibm.aardpfark.pfa.expression.{PFAExpression, PartialFunctionRef}
import com.ibm.aardpfark.spark.ml.PFATransformer
import org.apache.avro.SchemaBuilder

import org.apache.spark.ml.feature.NGram


class PFANGram(override val sparkTransformer: NGram) extends PFATransformer {
  import com.ibm.aardpfark.pfa.dsl._

  private val inputCol = sparkTransformer.getInputCol
  private val outputCol = sparkTransformer.getOutputCol
  private val inputExpr = StringExpr(s"input.${inputCol}")

  private val n = sparkTransformer.getN

  override def inputSchema = {
    SchemaBuilder.record(withUid(inputBaseName)).fields()
      .name(inputCol).`type`().array().items().stringType().noDefault()
      .endRecord()
  }

  override def outputSchema = {
    SchemaBuilder.record(withUid(outputBaseName)).fields()
      .name(outputCol).`type`().array().items().stringType().noDefault()
      .endRecord()
  }

  override def action: PFAExpression = {
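    // Reproduce Spark's NGram in PFA: slide a window of n tokens over the input
    // array and join each window with a single space.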
    // TODO - this partial fn reference is an ugly workaround for now - add support for builtin lib
    val partialFn = new PartialFunctionRef("s.join", Seq(("sep", " ")))
    val mapExpr = a.map(a.slidingWindow(inputExpr, n, 1), partialFn)
    NewRecord(outputSchema, Map(outputCol -> mapExpr))
  }

  override def pfa: PFADocument = {
    PFABuilder()
      .withName(sparkTransformer.uid)
      .withMetadata(getMetadata)
      .withInput(inputSchema)
      .withOutput(outputSchema)
      .withAction(action)
      .pfa
  }
} 
Example 15
Source File: NGramSuite.scala    From aardpfark   with Apache License 2.0
package com.ibm.aardpfark.spark.ml.feature

import com.ibm.aardpfark.pfa.{Result, SparkFeaturePFASuiteBase}

import org.apache.spark.ml.feature.NGram


class NGramSuite extends SparkFeaturePFASuiteBase[NGramResult] {

  val data = spark.createDataFrame(Seq(
    (0, Array("Hi", "I", "heard", "about", "Spark")),
    (1, Array("I", "wish", "Java", "could", "use", "case", "classes")),
    (2, Array("Logistic", "regression", "models", "are", "neat")),
    (3, Array("Logistic", "regression")),
    (4, Array("Logistic")),
    (5, Array[String]())
  )).toDF("id", "words")

  override val sparkTransformer = new NGram()
    .setInputCol("words")
    .setOutputCol("ngrams")

  val result = sparkTransformer.transform(data)
  override val input = result.select(sparkTransformer.getInputCol).toJSON.collect()
  override val expectedOutput = result.select(sparkTransformer.getOutputCol).toJSON.collect()

  test("ngrams = 1") {
    val transformer = new NGram()
      .setInputCol("words")
      .setOutputCol("ngrams")
      .setN(1)

    val result = transformer.transform(data)
    val input = result.select(sparkTransformer.getInputCol).toJSON.collect()
    val expectedOutput = result.select(sparkTransformer.getOutputCol).toJSON.collect()
    parityTest(transformer, input, expectedOutput)
  }

  test("ngrams = 3") {
    val transformer = new NGram()
      .setInputCol("words")
      .setOutputCol("ngrams")
      .setN(3)

    val result = transformer.transform(data)
    val input = result.select(sparkTransformer.getInputCol).toJSON.collect()
    val expectedOutput = result.select(sparkTransformer.getOutputCol).toJSON.collect()
    parityTest(transformer, input, expectedOutput)
  }
}

case class NGramResult(ngrams: Seq[String]) extends Result 
Example 16
Source File: NGramTransformer.scala    From seahorse-workflow-executor   with Apache License 2.0
package io.deepsense.deeplang.doperables.spark.wrappers.transformers

import org.apache.spark.ml.feature.NGram

import io.deepsense.deeplang.doperables.SparkTransformerAsMultiColumnTransformer
import io.deepsense.deeplang.params.Param
import io.deepsense.deeplang.params.validators.RangeValidator
import io.deepsense.deeplang.params.wrappers.spark.IntParamWrapper

class NGramTransformer extends SparkTransformerAsMultiColumnTransformer[NGram] {

  val n = new IntParamWrapper[NGram](
    name = "n",
    description = Some("The minimum n-gram length."),
    sparkParamGetter = _.n,
    validator = RangeValidator(begin = 1.0, end = Int.MaxValue, step = Some(1.0)))
  setDefault(n, 2.0)

  override protected def getSpecificParams: Array[Param[_]] = Array(n)

  def setN(value: Int): this.type = {
    set(n -> value)
  }
}