org.apache.spark.ml.param.BooleanParam Scala Examples

The following examples show how to use org.apache.spark.ml.param.BooleanParam. They are collected from open source projects; the source file and license are noted above each example.
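Before the examples, here is the common lifecycle of a BooleanParam: declare it with a parent, name, and doc string; register a default with setDefault; expose a getter and setter; and read it with $(...). A minimal self-contained sketch (the class and param names are illustrative, not taken from the projects below):

import org.apache.spark.ml.param.{BooleanParam, ParamMap, Params}
import org.apache.spark.ml.util.Identifiable

// Hypothetical Params holder, used only to illustrate the BooleanParam pattern.
class MyParams(override val uid: String) extends Params {

  def this() = this(Identifiable.randomUID("myParams"))

  // Declare the param with its parent, name, and documentation.
  final val verbose: BooleanParam =
    new BooleanParam(this, "verbose", "whether to log extra output")
  setDefault(verbose -> false)

  def getVerbose: Boolean = $(verbose)

  def setVerbose(value: Boolean): this.type = set(verbose, value)

  override def copy(extra: ParamMap): MyParams = defaultCopy(extra)
}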
Example 1
Source File: DCT.scala    From drizzle-spark   with Apache License 2.0
package org.apache.spark.ml.feature

import edu.emory.mathcs.jtransforms.dct._

import org.apache.spark.annotation.Since
import org.apache.spark.ml.UnaryTransformer
import org.apache.spark.ml.linalg.{Vector, Vectors, VectorUDT}
import org.apache.spark.ml.param.BooleanParam
import org.apache.spark.ml.util._
import org.apache.spark.sql.types.DataType


/**
 * A feature transformer that takes the 1D discrete cosine transform of a real vector. No zero
 * padding is performed on the input vector.
 */
@Since("1.5.0")
class DCT @Since("1.5.0") (@Since("1.5.0") override val uid: String)
  extends UnaryTransformer[Vector, Vector, DCT] with DefaultParamsWritable {

  @Since("1.5.0")
  def this() = this(Identifiable.randomUID("dct"))

  // Whether to perform the inverse DCT (true) or forward DCT (false). Default: false.
  @Since("1.5.0")
  val inverse: BooleanParam = new BooleanParam(
    this, "inverse", "Set transformer to perform inverse DCT, default False.")

  @Since("1.5.0")
  def setInverse(value: Boolean): this.type = set(inverse, value)
  @Since("1.5.0")
  def getInverse: Boolean = $(inverse)

  setDefault(inverse -> false)

  override protected def createTransformFunc: Vector => Vector = { vec =>
    val result = vec.toArray
    val jTransformer = new DoubleDCT_1D(result.length)
    if ($(inverse)) jTransformer.inverse(result, true) else jTransformer.forward(result, true)
    Vectors.dense(result)
  }

  override protected def validateInputType(inputType: DataType): Unit = {
    require(inputType.isInstanceOf[VectorUDT], s"Input type must be VectorUDT but got $inputType.")
  }

  override protected def outputDataType: DataType = new VectorUDT
}

@Since("1.6.0")
object DCT extends DefaultParamsReadable[DCT] {

  @Since("1.6.0")
  override def load(path: String): DCT = super.load(path)
} 
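A quick usage sketch for the transformer above (a sketch, assuming a running SparkSession named spark):

import org.apache.spark.ml.feature.DCT
import org.apache.spark.ml.linalg.Vectors

val df = spark.createDataFrame(Seq(
  Tuple1(Vectors.dense(0.0, 1.0, -2.0, 3.0))
)).toDF("features")

val dct = new DCT()
  .setInputCol("features")
  .setOutputCol("featuresDCT")
  .setInverse(false) // the BooleanParam declared above; false selects the forward DCT

dct.transform(df).show(truncate = false)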
Example 2
Source File: BinaryVectorizer.scala    From TransmogrifAI   with BSD 3-Clause "New" or "Revised" License
package com.salesforce.op.stages.impl.feature

import com.salesforce.op.UID
import com.salesforce.op.features.types._
import com.salesforce.op.stages.base.sequence.SequenceTransformer
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.ml.param.BooleanParam

import scala.collection.mutable.ArrayBuffer


class BinaryVectorizer
(
  uid: String = UID[BinaryVectorizer]
) extends SequenceTransformer[Binary, OPVector](operationName = "vecBin", uid = uid)
  with VectorizerDefaults with TrackNullsParam {

  final val fillValue = new BooleanParam(
    parent = this, name = "fillValue", doc = "value to replace nulls"
  )
  setDefault(fillValue, false)

  def setFillValue(v: Boolean): this.type = set(fillValue, v)

  override def transformFn: (Seq[Binary]) => OPVector = in => {
    val (isTrackNulls, theFillValue) = ($(trackNulls), $(fillValue))
    val arr = ArrayBuffer.empty[Double]

    if (isTrackNulls) {
      in.foreach(_.value match {
        case None => arr.append(theFillValue, 1.0)
        case Some(v) => arr.append(v, 0.0)
      })
    } else in.foreach(b => arr.append(b.value.getOrElse[Boolean](theFillValue)))

    Vectors.dense(arr.toArray).toOPVector
  }

  override def onGetMetadata(): Unit = {
    super.onGetMetadata()
    if ($(trackNulls)) {
      setMetadata(vectorMetadataWithNullIndicators.toMetadata)
    } else {
      setMetadata(vectorMetadataFromInputFeatures.toMetadata)
    }

  }
} 
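To make the trackNulls encoding above concrete, here is a dependency-free sketch of the layout it produces: with null tracking, each input contributes a (value, nullIndicator) pair; without it, nulls are simply replaced by the fill value. (Plain Scala with assumed inputs, not the TransmogrifAI API.)

val in: Seq[Option[Boolean]] = Seq(Some(true), None, Some(false))
val fill = 0.0 // corresponds to fillValue = false

// trackNulls = true: two slots per input, (value, nullIndicator)
val tracked = in.flatMap {
  case Some(v) => Seq(if (v) 1.0 else 0.0, 0.0)
  case None    => Seq(fill, 1.0)
}
// tracked == List(1.0, 0.0, 0.0, 1.0, 0.0, 0.0)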
Example 3
Source File: HanLPTokenizer.scala    From albedo   with MIT License
package ws.vinta.albedo.transformers

import java.util

import com.hankcs.hanlp.HanLP
import com.hankcs.hanlp.dictionary.stopword.CoreStopWordDictionary
import com.hankcs.hanlp.seg.common.Term
import org.apache.spark.ml.UnaryTransformer
import org.apache.spark.ml.param.{BooleanParam, ParamMap}
import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable}
import org.apache.spark.sql.types._

import scala.collection.JavaConverters._

class HanLPTokenizer(override val uid: String)
  extends UnaryTransformer[String, Seq[String], HanLPTokenizer] with DefaultParamsWritable {

  def this() = {
    this(Identifiable.randomUID("hanLPTokenizer"))
  }

  val shouldRemoveStopWords = new BooleanParam(this, "shouldRemoveStopWords", "Whether to remove stop words")

  def getShouldRemoveStopWords: Boolean = $(shouldRemoveStopWords)

  def setShouldRemoveStopWords(value: Boolean): this.type = set(shouldRemoveStopWords, value)
  setDefault(shouldRemoveStopWords -> true)

  override def createTransformFunc: String => Seq[String] = { originStr =>
    HanLP.Config.ShowTermNature = false
    HanLP.Config.Normalization = false
    val segment = HanLP.newSegment()
    val termList: util.List[Term] = segment.seg(HanLP.convertToSimplifiedChinese(originStr.toLowerCase))

    if ($(shouldRemoveStopWords)) {
      CoreStopWordDictionary.apply(termList)
    }

    val LanguageRE = """(c|r|c\+\+|c#|f#)""".r
    val OneCharExceptCJKRE = """([^\p{InHiragana}\p{InKatakana}\p{InBopomofo}\p{InCJKCompatibilityIdeographs}\p{InCJKUnifiedIdeographs}])""".r
    termList
      .asScala
      .flatMap((term: Term) => {
        val word = term.word
        word match {
          case LanguageRE(language) => Array(language)
          case OneCharExceptCJKRE(_) => Array.empty[String]
          case _ => """([\w\.\-_\p{InHiragana}\p{InKatakana}\p{InBopomofo}\p{InCJKCompatibilityIdeographs}\p{InCJKUnifiedIdeographs}]+)""".r.findAllIn(word).toList
        }
      })
  }

  override def validateInputType(inputType: DataType): Unit = {
    require(inputType == StringType, s"Input type must be string type but got $inputType.")
  }

  override def outputDataType: DataType = {
    new ArrayType(StringType, false)
  }

  override def copy(extra: ParamMap): HanLPTokenizer = {
    defaultCopy(extra)
  }
}

object HanLPTokenizer extends DefaultParamsReadable[HanLPTokenizer] 
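A usage sketch for the tokenizer above (a sketch, assuming a SparkSession named spark and the HanLP dependency on the classpath):

val df = spark.createDataFrame(Seq(
  Tuple1("Spark是一个快速的集群计算系统")
)).toDF("text")

val tokenizer = new HanLPTokenizer()
  .setInputCol("text")
  .setOutputCol("tokens")
  .setShouldRemoveStopWords(true) // the BooleanParam defined above

tokenizer.transform(df).show(truncate = false)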
Example 4
Source File: DCT.scala    From BigDatalog   with Apache License 2.0
package org.apache.spark.ml.feature

import edu.emory.mathcs.jtransforms.dct._

import org.apache.spark.annotation.{Since, Experimental}
import org.apache.spark.ml.UnaryTransformer
import org.apache.spark.ml.param.BooleanParam
import org.apache.spark.ml.util._
import org.apache.spark.mllib.linalg.{Vector, VectorUDT, Vectors}
import org.apache.spark.sql.types.DataType


/**
 * :: Experimental ::
 * A feature transformer that takes the 1D discrete cosine transform of a real vector. No zero
 * padding is performed on the input vector.
 */
@Experimental
class DCT(override val uid: String)
  extends UnaryTransformer[Vector, Vector, DCT] with DefaultParamsWritable {

  def this() = this(Identifiable.randomUID("dct"))

  // Whether to perform the inverse DCT (true) or forward DCT (false). Default: false.
  val inverse: BooleanParam = new BooleanParam(
    this, "inverse", "Set transformer to perform inverse DCT, default False.")

  def setInverse(value: Boolean): this.type = set(inverse, value)
  def getInverse: Boolean = $(inverse)

  setDefault(inverse -> false)

  override protected def createTransformFunc: Vector => Vector = { vec =>
    val result = vec.toArray
    val jTransformer = new DoubleDCT_1D(result.length)
    if ($(inverse)) jTransformer.inverse(result, true) else jTransformer.forward(result, true)
    Vectors.dense(result)
  }

  override protected def validateInputType(inputType: DataType): Unit = {
    require(inputType.isInstanceOf[VectorUDT], s"Input type must be VectorUDT but got $inputType.")
  }

  override protected def outputDataType: DataType = new VectorUDT
}

@Since("1.6.0")
object DCT extends DefaultParamsReadable[DCT] {

  @Since("1.6.0")
  override def load(path: String): DCT = super.load(path)
} 
Example 5
Source File: DCT.scala    From Spark-2.3.1   with Apache License 2.0
package org.apache.spark.ml.feature

import edu.emory.mathcs.jtransforms.dct._

import org.apache.spark.annotation.Since
import org.apache.spark.ml.UnaryTransformer
import org.apache.spark.ml.linalg.{Vector, Vectors, VectorUDT}
import org.apache.spark.ml.param.BooleanParam
import org.apache.spark.ml.util._
import org.apache.spark.sql.types.DataType


/**
 * A feature transformer that takes the 1D discrete cosine transform of a real vector. No zero
 * padding is performed on the input vector.
 */
@Since("1.5.0")
class DCT @Since("1.5.0") (@Since("1.5.0") override val uid: String)
  extends UnaryTransformer[Vector, Vector, DCT] with DefaultParamsWritable {

  @Since("1.5.0")
  def this() = this(Identifiable.randomUID("dct"))

  // Whether to perform the inverse DCT (true) or forward DCT (false). Default: false.
  @Since("1.5.0")
  val inverse: BooleanParam = new BooleanParam(
    this, "inverse", "Set transformer to perform inverse DCT, default False.")

  @Since("1.5.0")
  def setInverse(value: Boolean): this.type = set(inverse, value)
  @Since("1.5.0")
  def getInverse: Boolean = $(inverse)

  setDefault(inverse -> false)

  override protected def createTransformFunc: Vector => Vector = { vec =>
    val result = vec.toArray
    val jTransformer = new DoubleDCT_1D(result.length)
    if ($(inverse)) jTransformer.inverse(result, true) else jTransformer.forward(result, true)
    Vectors.dense(result)
  }

  override protected def validateInputType(inputType: DataType): Unit = {
    require(inputType.isInstanceOf[VectorUDT], s"Input type must be VectorUDT but got $inputType.")
  }

  override protected def outputDataType: DataType = new VectorUDT
}

@Since("1.6.0")
object DCT extends DefaultParamsReadable[DCT] {

  @Since("1.6.0")
  override def load(path: String): DCT = super.load(path)
} 
Example 6
Source File: RegressionEvaluator.scala    From pravda-ml   with Apache License 2.0
package org.apache.spark.ml.odkl

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.ml.param.{BooleanParam, ParamMap}
import org.apache.spark.ml.util.Identifiable
import org.apache.spark.mllib.evaluation.RegressionMetrics
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, Dataset, Row}
import org.apache.spark.sql.types.{DoubleType, StringType, StructType}

import scala.util.Try

// Evaluator and SparkSqlUtils are defined elsewhere in the pravda-ml org.apache.spark.ml.odkl package.

class RegressionEvaluator(override val uid: String) extends Evaluator[RegressionEvaluator](uid) {

  val throughOrigin = new BooleanParam(this, "throughOrigin",
    "True if the regression is through the origin. For example, in " +
      "linear regression, this is true when the intercept is not fitted.")

  def setThroughOrigin(value: Boolean): this.type = set(throughOrigin, value)

  def getThroughOrigin: Boolean = $(throughOrigin)

  def this() = this(Identifiable.randomUID("regressionEvaluator"))


  override def transform(dataset: Dataset[_]): DataFrame = {

    try {
      val predictions: RDD[(Double, Double)] = dataset.select($(predictionCol), $(labelCol))
        .rdd.map { case Row(score: Double, label: Double) => (score, label) }

      val metrics = Try(new RegressionMetrics(predictions))


      val rows = metrics.toOption.map(m => Seq(
        "r2" -> m.r2,
        "rmse" -> m.rootMeanSquaredError,
        "explainedVariance" -> m.explainedVariance,
        "meanAbsoluteError" -> m.meanAbsoluteError,
        "meanSquaredError" -> m.meanSquaredError
      ).map(Row.fromTuple)).getOrElse(Seq())

      SparkSqlUtils.reflectionLock.synchronized(
        dataset.sqlContext.createDataFrame(
          dataset.sparkSession.sparkContext.parallelize(rows, 1), transformSchema(dataset.schema)))
    } catch {
      // Most probably evaluation dataset is empty
      case e: Exception =>
        logWarning("Failed to calculate metrics due to " + e.getMessage)
        SparkSqlUtils.reflectionLock.synchronized(
          dataset.sqlContext.createDataFrame(
            dataset.sparkSession.sparkContext.emptyRDD[Row], transformSchema(dataset.schema)))
    }
  }

  override def copy(extra: ParamMap): RegressionEvaluator = {
    copyValues(new RegressionEvaluator(), extra)
  }

  @DeveloperApi
  override def transformSchema(schema: StructType): StructType = {
    new StructType()
      .add("metric", StringType, nullable = false)
      .add("value", DoubleType, nullable = false)
  }
} 
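A usage sketch for the evaluator above (a sketch, assuming a DataFrame named predictions whose prediction and label columns match the evaluator's defaults; setThroughOrigin is the BooleanParam setter defined above):

val evaluator = new RegressionEvaluator()
  .setThroughOrigin(false)

// Produces a two-column DataFrame of (metric, value) rows: r2, rmse, etc.
evaluator.transform(predictions).show()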
Example 7
Source File: XGBoostUtils.scala    From pravda-ml   with Apache License 2.0
package ml.dmlc.xgboost4j.scala.spark

import ml.dmlc.xgboost4j.scala.Booster
import org.apache.spark.ml.linalg.Vector
import org.apache.spark.ml.param.{BooleanParam, Params}
import org.apache.spark.ml.param.shared.{HasFeaturesCol, HasPredictionCol}
import org.apache.spark.sql.{Dataset, functions}



object XGBoostUtils {
  def getBooster(x: XGBoostClassificationModel): Booster = x._booster

  def getBooster(x: XGBoostRegressionModel): Booster = x._booster
}

trait OkXGBoostParams  extends HasFeaturesCol with HasPredictionCol {
  this: Params =>

  val densifyInput = new BooleanParam(this, "densifyInput",
    "In order to fix the difference between Spark and XGBoost sparsity treatment")
  val predictAsDouble = new BooleanParam(this, "predictAsDouble",
    "Whether to cast XGBoost prediction to double, matching common behavior for other predictors.")
  val addRawTrees = new BooleanParam(this, "addRawTrees",
    "Whether to add raw trees block to model summary.")
  val addSignificance = new BooleanParam(this, "addSignificance",
    "Whether to add feature significance block to model summary.")

  def setAddSignificance(value: Boolean): this.type = set(addSignificance, value)

  def setAddRawTrees(value: Boolean): this.type = set(addRawTrees, value)

  def setDensifyInput(value: Boolean): this.type = set(densifyInput, value)

  def setPredictAsDouble(value: Boolean): this.type = set(predictAsDouble, value)

  protected def densifyIfNeeded(dataset: Dataset[_]) : Dataset[_] = {
    if ($(densifyInput)) {
      val densify = functions.udf((x: Vector) => x.toDense)
      val col = getFeaturesCol
      val metadata = dataset.schema(col).metadata

      dataset.withColumn(
        col,
        densify(dataset(col)).as(col, metadata))
    } else {
      dataset
    }
  }
}

trait OkXGBoostClassifierParams extends XGBoostClassifierParams with OkXGBoostParams

trait OkXGBoostRegressorParams extends XGBoostRegressorParams with OkXGBoostParams 
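The densifyInput flag above exists because Spark and XGBoost treat the implicit zeros of sparse vectors differently, so inputs may need to be densified before training or scoring. A standalone sketch of the same densify pattern used in densifyIfNeeded (assuming a SparkSession named spark):

import org.apache.spark.ml.linalg.{Vector, Vectors}
import org.apache.spark.sql.functions

val densify = functions.udf((x: Vector) => x.toDense)
val df = spark.createDataFrame(Seq(
  Tuple1(Vectors.sparse(3, Array(1), Array(5.0)))
)).toDF("features")

df.withColumn("features", densify(df("features"))).show(truncate = false)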
Example 8
Source File: DCT.scala    From spark1.52   with Apache License 2.0
package org.apache.spark.ml.feature

import edu.emory.mathcs.jtransforms.dct._

import org.apache.spark.annotation.Experimental
import org.apache.spark.ml.UnaryTransformer
import org.apache.spark.ml.param.BooleanParam
import org.apache.spark.ml.util.Identifiable
import org.apache.spark.mllib.linalg.{Vector, VectorUDT, Vectors}
import org.apache.spark.sql.types.DataType


/**
 * :: Experimental ::
 * A feature transformer that takes the 1D discrete cosine transform of a real vector. No zero
 * padding is performed on the input vector.
 */
@Experimental
class DCT(override val uid: String)
  extends UnaryTransformer[Vector, Vector, DCT] {

  def this() = this(Identifiable.randomUID("dct"))

  // Whether to perform the inverse DCT (true) or forward DCT (false). Default: false.
  val inverse: BooleanParam = new BooleanParam(
    this, "inverse", "Set transformer to perform inverse DCT, default False.")

  def setInverse(value: Boolean): this.type = set(inverse, value)
  def getInverse: Boolean = $(inverse)

  setDefault(inverse -> false)

  override protected def createTransformFunc: Vector => Vector = { vec =>
    val result = vec.toArray
    val jTransformer = new DoubleDCT_1D(result.length)
    if ($(inverse)) jTransformer.inverse(result, true) else jTransformer.forward(result, true)
    Vectors.dense(result)
  }

  override protected def validateInputType(inputType: DataType): Unit = {
    require(inputType.isInstanceOf[VectorUDT], s"Input type must be VectorUDT but got $inputType.")
  }

  override protected def outputDataType: DataType = new VectorUDT
} 
Example 9
Source File: IntegralVectorizer.scala    From TransmogrifAI   with BSD 3-Clause "New" or "Revised" License
package com.salesforce.op.stages.impl.feature

import com.salesforce.op.UID
import com.salesforce.op.features.types._
import com.salesforce.op.stages.base.sequence.{SequenceEstimator, SequenceModel}
import com.salesforce.op.utils.spark.SequenceAggregators
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.ml.param.{BooleanParam, LongParam}
import org.apache.spark.sql.Dataset

import scala.reflect.runtime.universe.TypeTag


class IntegralVectorizer[T <: Integral]
(
  uid: String = UID[IntegralVectorizer[_]],
  operationName: String = "vecInt"
) (implicit tti: TypeTag[T], ttiv: TypeTag[T#Value])
  extends SequenceEstimator[T, OPVector](operationName = operationName, uid = uid)
  with VectorizerDefaults with TrackNullsParam {

  final val fillValue = new LongParam(this, "fillValue", "default value for FillWithConstant")
  setDefault(fillValue, 0L)

  final val withConstant = new BooleanParam(this, "fillWithConstant",
    "whether to fill nulls with a constant value")
  setDefault(withConstant, true)

  def setFillWithConstant(value: Long): this.type = {
    set(fillValue, value)
    set(withConstant, true)
  }
  def setFillWithMode: this.type = set(withConstant, false)

  private def constants(): Seq[Long] = {
    val size = getInputFeatures().length
    val defValue = $(fillValue)
    val constants = List.fill(size)(defValue)
    constants
  }

  private def mode(dataset: Dataset[Seq[T#Value]]): Seq[Long] = {
    val size = getInputFeatures().length
    dataset.select(SequenceAggregators.ModeSeqNullInt(size = size).toColumn).first()
  }

  def fitFn(dataset: Dataset[Seq[T#Value]]): SequenceModel[T, OPVector] = {
    if ($(trackNulls)) setMetadata(vectorMetadataWithNullIndicators.toMetadata)

    val fillValues = if ($(withConstant)) constants() else mode(dataset)

    new IntegralVectorizerModel(
      fillValues = fillValues, trackNulls = $(trackNulls), operationName = operationName, uid = uid)

  }

}

final class IntegralVectorizerModel[T <: Integral] private[op]
(
  val fillValues: Seq[Long],
  val trackNulls: Boolean,
  operationName: String,
  uid: String
)(implicit tti: TypeTag[T])
  extends SequenceModel[T, OPVector](operationName = operationName, uid = uid)
    with VectorizerDefaults {

  def transformFn: Seq[T] => OPVector = row => {
    val replaced = if (!trackNulls) {
      row.zip(fillValues).
        map { case (i, m) => i.value.getOrElse(m).toDouble }
    }
    else {
      row.zip(fillValues).
        flatMap { case (i, m) => i.value.getOrElse(m).toDouble :: booleanToDouble(i.value.isEmpty) :: Nil }
    }
    Vectors.dense(replaced.toArray).toOPVector
  }

} 
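To illustrate the choice made in fitFn above (constant fill vs. mode fill), here is a plain-Scala sketch of computing a per-feature mode over nullable integral inputs (assumed data, not the SequenceAggregators implementation; assumes each feature position has at least one non-null value):

val rows: Seq[Seq[Option[Long]]] = Seq(
  Seq(Some(1L), None),
  Seq(Some(1L), Some(5L)),
  Seq(Some(2L), Some(5L))
)

// For each feature position, take the most frequent non-null value.
val modes: Seq[Long] = rows.transpose.map { col =>
  col.flatten.groupBy(identity).maxBy(_._2.size)._1
}
// modes == List(1, 5)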
Example 10
Source File: RealVectorizer.scala    From TransmogrifAI   with BSD 3-Clause "New" or "Revised" License
package com.salesforce.op.stages.impl.feature

import com.salesforce.op.UID
import com.salesforce.op.features.types._
import com.salesforce.op.stages.base.sequence.{SequenceEstimator, SequenceModel}
import com.salesforce.op.utils.spark.SequenceAggregators
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.ml.param.{BooleanParam, DoubleParam}
import org.apache.spark.sql.Dataset

import scala.reflect.runtime.universe.TypeTag


class RealVectorizer[T <: Real]
(
  uid: String = UID[RealVectorizer[_]],
  operationName: String = "vecReal"
)(implicit tti: TypeTag[T], ttiv: TypeTag[T#Value])
  extends SequenceEstimator[T, OPVector](operationName = operationName, uid = uid)
    with VectorizerDefaults with TrackNullsParam {

  final val fillValue = new DoubleParam(this, "fillValue", "default value for FillWithConstant")
  setDefault(fillValue, 0.0)

  final val withConstant = new BooleanParam(this, "fillWithConstant",
    "whether to fill nulls with a constant value")

  setDefault(withConstant, true)


  def setFillWithConstant(value: Double): this.type = {
    set(fillValue, value)
    set(withConstant, true)
  }

  def setFillWithMean: this.type = {
    set(withConstant, false)
  }

  private def constants(): Seq[Double] = {
    val size = getInputFeatures().length
    val defValue = $(fillValue)
    val constants = List.fill(size)(defValue)
    constants
  }

  private def means(dataset: Dataset[Seq[T#Value]]): Seq[Double] = {
    val size = getInputFeatures().length
    val means = dataset.select(SequenceAggregators.MeanSeqNullNum(size = size).toColumn).first()
    means
  }

  def fitFn(dataset: Dataset[Seq[T#Value]]): SequenceModel[T, OPVector] = {
    if ($(trackNulls)) setMetadata(vectorMetadataWithNullIndicators.toMetadata)

    val fillValues = if ($(withConstant)) constants() else means(dataset)
    new RealVectorizerModel[T](
      fillValues = fillValues, trackNulls = $(trackNulls), operationName = operationName, uid = uid)
  }

}

final class RealVectorizerModel[T <: Real] private[op]
(
  val fillValues: Seq[Double],
  val trackNulls: Boolean,
  operationName: String,
  uid: String
)(implicit tti: TypeTag[T])
  extends SequenceModel[T, OPVector](operationName = operationName, uid = uid)
    with VectorizerDefaults {

  def transformFn: Seq[T] => OPVector = row => {
    val replaced =
      if (!trackNulls) {
        row.zip(fillValues).
          map { case (r, m) => r.value.getOrElse(m) }
      }
      else {
        row.zip(fillValues).
          flatMap { case (r, m) => r.value.getOrElse(m) :: booleanToDouble(r.isEmpty) :: Nil }
      }
    Vectors.dense(replaced.toArray).toOPVector
  }

} 
Example 11
Source File: DCT.scala    From multi-tenancy-spark   with Apache License 2.0
package org.apache.spark.ml.feature

import edu.emory.mathcs.jtransforms.dct._

import org.apache.spark.annotation.Since
import org.apache.spark.ml.UnaryTransformer
import org.apache.spark.ml.linalg.{Vector, Vectors, VectorUDT}
import org.apache.spark.ml.param.BooleanParam
import org.apache.spark.ml.util._
import org.apache.spark.sql.types.DataType


/**
 * A feature transformer that takes the 1D discrete cosine transform of a real vector. No zero
 * padding is performed on the input vector.
 */
@Since("1.5.0")
class DCT @Since("1.5.0") (@Since("1.5.0") override val uid: String)
  extends UnaryTransformer[Vector, Vector, DCT] with DefaultParamsWritable {

  @Since("1.5.0")
  def this() = this(Identifiable.randomUID("dct"))

  // Whether to perform the inverse DCT (true) or forward DCT (false). Default: false.
  @Since("1.5.0")
  val inverse: BooleanParam = new BooleanParam(
    this, "inverse", "Set transformer to perform inverse DCT, default False.")

  @Since("1.5.0")
  def setInverse(value: Boolean): this.type = set(inverse, value)
  @Since("1.5.0")
  def getInverse: Boolean = $(inverse)

  setDefault(inverse -> false)

  override protected def createTransformFunc: Vector => Vector = { vec =>
    val result = vec.toArray
    val jTransformer = new DoubleDCT_1D(result.length)
    if ($(inverse)) jTransformer.inverse(result, true) else jTransformer.forward(result, true)
    Vectors.dense(result)
  }

  override protected def validateInputType(inputType: DataType): Unit = {
    require(inputType.isInstanceOf[VectorUDT], s"Input type must be VectorUDT but got $inputType.")
  }

  override protected def outputDataType: DataType = new VectorUDT
}

@Since("1.6.0")
object DCT extends DefaultParamsReadable[DCT] {

  @Since("1.6.0")
  override def load(path: String): DCT = super.load(path)
} 
Example 12
Source File: TokenAssembler.scala    From spark-nlp   with Apache License 2.0
package com.johnsnowlabs.nlp

import com.johnsnowlabs.nlp.annotators.common.NerTagged
import com.johnsnowlabs.nlp.annotators.ner.NerTagsEncoding
import org.apache.spark.ml.param.BooleanParam
import org.apache.spark.ml.util.{DefaultParamsReadable, Identifiable}

import scala.collection.mutable.ArrayBuffer

class TokenAssembler(override val uid: String) extends AnnotatorModel[TokenAssembler] {

  import com.johnsnowlabs.nlp.AnnotatorType._

  override val outputAnnotatorType: AnnotatorType = DOCUMENT

  override val inputAnnotatorTypes: Array[String] = Array(DOCUMENT, TOKEN)

  val preservePosition: BooleanParam = new BooleanParam(this, "preservePosition", "Whether to preserve the actual position of the tokens or reduce them to one space")

  def setPreservePosition(value: Boolean): this.type = set(preservePosition, value)

  setDefault(
    preservePosition -> false
  )

  def this() = this(Identifiable.randomUID("TOKEN_ASSEMBLER"))

  override def annotate(annotations: Seq[Annotation]): Seq[Annotation] = {

    val result = ArrayBuffer[Annotation]()

    val sentences_init = annotations.filter(_.annotatorType == AnnotatorType.DOCUMENT)


    sentences_init.zipWithIndex.foreach { case (sentence, sentenceIndex) =>

      val tokens = annotations.filter(token =>
        token.annotatorType == AnnotatorType.TOKEN &&
          token.begin >= sentence.begin &&
          token.end <= sentence.end)

      var fullSentence: String = ""
      var lastEnding: Int = 0

      tokens.foreach { token =>
        if (token.begin > lastEnding && token.begin - lastEnding != 1 && lastEnding != 0) {
          if ($(preservePosition)) {
            val tokenBreaks = sentence.result.substring(lastEnding + 1 - sentence.begin, token.begin - sentence.begin)
            val matches = ("[\\r\\t\\f\\v\\n ]+".r).findAllIn(tokenBreaks).mkString
            if (matches.length > 0) {
              fullSentence = fullSentence ++ matches ++ token.result
            } else {
              fullSentence = fullSentence ++ " " ++ token.result
            }
          } else {
            fullSentence = fullSentence ++ " " ++ token.result
          }
        } else {
          fullSentence = fullSentence ++ token.result
        }
        lastEnding = token.end
      }

      val beginIndex = sentence.begin
      val endIndex = fullSentence.length - 1

      val annotation = Annotation(
        DOCUMENT,
        beginIndex,
        beginIndex + endIndex,
        fullSentence,
        Map("sentence" -> sentenceIndex.toString)
      )

      result.append(annotation)
    }
    result
  }

}

object TokenAssembler extends DefaultParamsReadable[TokenAssembler] 
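A pipeline-style usage sketch for the assembler above (a sketch, assuming spark-nlp's DocumentAssembler and Tokenizer stages and a SparkSession named spark):

import com.johnsnowlabs.nlp.{DocumentAssembler, TokenAssembler}
import com.johnsnowlabs.nlp.annotators.Tokenizer
import org.apache.spark.ml.Pipeline

val documentAssembler = new DocumentAssembler()
  .setInputCol("text")
  .setOutputCol("document")

val tokenizer = new Tokenizer()
  .setInputCols("document")
  .setOutputCol("token")

val tokenAssembler = new TokenAssembler()
  .setInputCols("document", "token")
  .setOutputCol("assembled")
  .setPreservePosition(false) // the BooleanParam defined above

val df = spark.createDataFrame(Seq(Tuple1("Hello   world !"))).toDF("text")

new Pipeline()
  .setStages(Array(documentAssembler, tokenizer, tokenAssembler))
  .fit(df)
  .transform(df)
  .select("assembled")
  .show(truncate = false)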
Example 13
Source File: NerConverter.scala    From spark-nlp   with Apache License 2.0
package com.johnsnowlabs.nlp.annotators.ner

import com.johnsnowlabs.nlp.AnnotatorType.{CHUNK, DOCUMENT, NAMED_ENTITY, TOKEN}
import com.johnsnowlabs.nlp.annotators.common.NerTagged
import com.johnsnowlabs.nlp.{Annotation, AnnotatorModel, AnnotatorType, ParamsAndFeaturesReadable}
import org.apache.spark.ml.param.{BooleanParam, StringArrayParam}
import org.apache.spark.ml.util.Identifiable

import scala.collection.Map


class NerConverter(override val uid: String) extends AnnotatorModel[NerConverter] {

  def this() = this(Identifiable.randomUID("NER_CONVERTER"))

  override val inputAnnotatorTypes: Array[String] = Array(DOCUMENT, TOKEN, NAMED_ENTITY)

  override val outputAnnotatorType: AnnotatorType = CHUNK

  // If defined, only entities in this list are kept; the rest are ignored.
  val whiteList: StringArrayParam = new StringArrayParam(
    this, "whiteList", "If defined, list of entities to process. The rest will be ignored")

  def setWhiteList(list: String*): this.type = set(whiteList, list.toArray)

  // Whether to keep the original token positions when converting IOB tags to chunks.
  val preservePosition: BooleanParam = new BooleanParam(
    this, "preservePosition", "Whether to preserve the original position of the tokens in the original document")

  def setPreservePosition(value: Boolean): this.type = set(preservePosition, value)

  setDefault(
    preservePosition -> true
  )

  override def annotate(annotations: Seq[Annotation]): Seq[Annotation] = {
    val sentences = NerTagged.unpack(annotations)
    val docs = annotations.filter(a => a.annotatorType == AnnotatorType.DOCUMENT)
    val entities = sentences.zip(docs.zipWithIndex).flatMap { case (sentence, doc) =>
      NerTagsEncoding.fromIOB(sentence, doc._1, sentenceIndex=doc._2, $(preservePosition))
    }

    entities.filter(entity => get(whiteList).forall(validEntity => validEntity.contains(entity.entity))).
      zipWithIndex.map{case (entity, idx) =>
      Annotation(
        outputAnnotatorType,
        entity.start,
        entity.end,
        entity.text,
        Map("entity" -> entity.entity, "sentence" -> entity.sentenceId, "chunk" -> idx.toString)
      )
    }
  }

}

object NerConverter extends ParamsAndFeaturesReadable[NerConverter] 
Example 14
Source File: BigTextMatcher.scala    From spark-nlp   with Apache License 2.0
package com.johnsnowlabs.nlp.annotators.btm

import com.johnsnowlabs.collections.StorageSearchTrie
import com.johnsnowlabs.nlp.AnnotatorType.{TOKEN, DOCUMENT, CHUNK}
import com.johnsnowlabs.nlp.annotators.TokenizerModel
import com.johnsnowlabs.nlp.serialization.StructFeature
import com.johnsnowlabs.nlp.util.io.{ExternalResource, ReadAs, ResourceHelper}
import com.johnsnowlabs.nlp.AnnotatorApproach
import com.johnsnowlabs.storage.Database.Name
import com.johnsnowlabs.storage.{Database, HasStorage, RocksDBConnection, StorageWriter}
import org.apache.spark.ml.PipelineModel
import org.apache.spark.ml.param.BooleanParam
import org.apache.spark.ml.util.{DefaultParamsReadable, Identifiable}
import org.apache.spark.sql.Dataset

class BigTextMatcher(override val uid: String) extends AnnotatorApproach[BigTextMatcherModel] with HasStorage {

  def this() = this(Identifiable.randomUID("ENTITY_EXTRACTOR"))

  override val inputAnnotatorTypes = Array(DOCUMENT, TOKEN)

  override val outputAnnotatorType: AnnotatorType = CHUNK

  override val description: String = "Extracts entities from target dataset given in a text file"

  val mergeOverlapping = new BooleanParam(this, "mergeOverlapping", "Whether to merge overlapping matched chunks. Defaults to false")
  val tokenizer = new StructFeature[TokenizerModel](this, "tokenizer")

  setDefault(inputCols, Array(TOKEN))
  setDefault(caseSensitive, true)
  setDefault(mergeOverlapping, false)

  def setTokenizer(tokenizer: TokenizerModel): this.type = set(this.tokenizer, tokenizer)

  def getTokenizer: TokenizerModel = $$(tokenizer)

  def setMergeOverlapping(v: Boolean): this.type = set(mergeOverlapping, v)

  def getMergeOverlapping: Boolean = $(mergeOverlapping)

  
  private def loadEntities(path: String, writers: Map[Database.Name, StorageWriter[_]]): Unit = {
    val inputFiles: Seq[Iterator[String]] =
      ResourceHelper.parseLinesIterator(ExternalResource(path, ReadAs.TEXT, Map()))
    inputFiles.foreach { inputFile => {
      StorageSearchTrie.load(inputFile, writers, get(tokenizer))
    }}
  }

  override def train(dataset: Dataset[_], recursivePipeline: Option[PipelineModel]): BigTextMatcherModel = {
    new BigTextMatcherModel()
      .setInputCols($(inputCols))
      .setOutputCol($(outputCol))
      .setCaseSensitive($(caseSensitive))
      .setStorageRef($(storageRef))
      .setMergeOverlapping($(mergeOverlapping))
  }

  override protected def createWriter(database: Name, connection: RocksDBConnection): StorageWriter[_] = {
    database match {
      case Database.TMVOCAB => new TMVocabReadWriter(connection, $(caseSensitive))
      case Database.TMEDGES => new TMEdgesReadWriter(connection, $(caseSensitive))
      case Database.TMNODES => new TMNodesWriter(connection)
    }
  }

  override protected def index(
                                fitDataset: Dataset[_],
                                storageSourcePath: Option[String],
                                readAs: Option[ReadAs.Value],
                                writers: Map[Database.Name, StorageWriter[_]],
                                readOptions: Option[Map[String, String]]
                              ): Unit = {
    require(readAs.get == ReadAs.TEXT, "BigTextMatcher only supports TEXT input formats at the moment.")
    loadEntities(storageSourcePath.get, writers)
  }

  override protected val databases: Array[Name] = BigTextMatcherModel.databases
}

object BigTextMatcher extends DefaultParamsReadable[BigTextMatcher] 
Example 15
Source File: HasEmbeddingsProperties.scala    From spark-nlp   with Apache License 2.0
package com.johnsnowlabs.nlp.embeddings

import com.johnsnowlabs.nlp.AnnotatorType
import org.apache.spark.ml.param.{BooleanParam, IntParam, Params}
import org.apache.spark.sql.Column
import org.apache.spark.sql.types.MetadataBuilder

trait HasEmbeddingsProperties extends Params {

  val dimension = new IntParam(this, "dimension", "Number of embedding dimensions")

  def setDimension(value: Int): this.type = set(this.dimension, value)
  def getDimension: Int = $(dimension)

  protected def wrapEmbeddingsMetadata(col: Column, embeddingsDim: Int, embeddingsRef: Option[String] = None): Column = {
    val metadataBuilder: MetadataBuilder = new MetadataBuilder()
    metadataBuilder.putString("annotatorType", AnnotatorType.WORD_EMBEDDINGS)
    metadataBuilder.putLong("dimension", embeddingsDim.toLong)
    embeddingsRef.foreach(ref => metadataBuilder.putString("ref", ref))
    col.as(col.toString, metadataBuilder.build)
  }

  protected def wrapSentenceEmbeddingsMetadata(col: Column, embeddingsDim: Int, embeddingsRef: Option[String] = None): Column = {
    val metadataBuilder: MetadataBuilder = new MetadataBuilder()
    metadataBuilder.putString("annotatorType", AnnotatorType.SENTENCE_EMBEDDINGS)
    metadataBuilder.putLong("dimension", embeddingsDim.toLong)
    embeddingsRef.foreach(ref => metadataBuilder.putString("ref", ref))
    col.as(col.toString, metadataBuilder.build)
  }

} 
Example 16
Source File: DCT.scala    From sparkoscope   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.feature

import edu.emory.mathcs.jtransforms.dct._

import org.apache.spark.annotation.Since
import org.apache.spark.ml.UnaryTransformer
import org.apache.spark.ml.linalg.{Vector, Vectors, VectorUDT}
import org.apache.spark.ml.param.BooleanParam
import org.apache.spark.ml.util._
import org.apache.spark.sql.types.DataType


/**
 * A feature transformer that takes the 1D discrete cosine transform of a real vector. No zero
 * padding is performed on the input vector.
 */
@Since("1.5.0")
class DCT @Since("1.5.0") (@Since("1.5.0") override val uid: String)
  extends UnaryTransformer[Vector, Vector, DCT] with DefaultParamsWritable {

  @Since("1.5.0")
  def this() = this(Identifiable.randomUID("dct"))

  // Whether to perform the inverse DCT (true) or forward DCT (false). Default: false.
  @Since("1.5.0")
  val inverse: BooleanParam = new BooleanParam(
    this, "inverse", "Set transformer to perform inverse DCT, default False.")

  @Since("1.5.0")
  def setInverse(value: Boolean): this.type = set(inverse, value)
  @Since("1.5.0")
  def getInverse: Boolean = $(inverse)

  setDefault(inverse -> false)

  override protected def createTransformFunc: Vector => Vector = { vec =>
    val result = vec.toArray
    val jTransformer = new DoubleDCT_1D(result.length)
    if ($(inverse)) jTransformer.inverse(result, true) else jTransformer.forward(result, true)
    Vectors.dense(result)
  }

  override protected def validateInputType(inputType: DataType): Unit = {
    require(inputType.isInstanceOf[VectorUDT], s"Input type must be VectorUDT but got $inputType.")
  }

  override protected def outputDataType: DataType = new VectorUDT
}

@Since("1.6.0")
object DCT extends DefaultParamsReadable[DCT] {

  @Since("1.6.0")
  override def load(path: String): DCT = super.load(path)
} 
Example 17
Source File: VParams.scala    From spark-vlbfgs   with Apache License 2.0
package org.apache.spark.ml

import org.apache.spark.ml.param.{BooleanParam, IntParam, ParamValidators, Params}

private trait VParams extends Params {
  // column number of each block in feature block matrix
  val colsPerBlock: IntParam = new IntParam(this, "colsPerBlock",
    "column number of each block in feature block matrix.", ParamValidators.gt(0))
  setDefault(colsPerBlock -> 10000)

  def getColsPerBlock: Int = $(colsPerBlock)

  // row number of each block in feature block matrix
  val rowsPerBlock: IntParam = new IntParam(this, "rowsPerBlock",
    "row number of each block in feature block matrix.", ParamValidators.gt(0))
  setDefault(rowsPerBlock -> 10000)

  def getRowsPerBlock: Int = $(rowsPerBlock)

  // row partition number of feature block matrix
  // equals to partition number of coefficient vector
  val rowPartitions: IntParam = new IntParam(this, "rowPartitions",
    "row partition number of feature block matrix.", ParamValidators.gt(0))
  setDefault(rowPartitions -> 10)

  def getRowPartitions: Int = $(rowPartitions)

  // column partition number of feature block matrix
  val colPartitions: IntParam = new IntParam(this, "colPartitions",
    "column partition number of feature block matrix.", ParamValidators.gt(0))
  setDefault(colPartitions -> 10)

  def getColPartitions: Int = $(colPartitions)

  // Whether to eagerly persist the distributed vector.
  val eagerPersist: BooleanParam = new BooleanParam(this, "eagerPersist",
    "Whether to eagerly persist the distributed vector.")
  setDefault(eagerPersist -> false)

  def getEagerPersist: Boolean = $(eagerPersist)

  // The number of corrections used in the LBFGS update.
  val numCorrections: IntParam = new IntParam(this, "numCorrections",
    "The number of corrections used in the LBFGS update.")
  setDefault(numCorrections -> 10)

  def getNumCorrections: Int = $(numCorrections)

  val generatingFeatureMatrixBuffer: IntParam = new IntParam(this, "generatingFeatureMatrixBuffer",
    "Buffer size when generating features block matrix.")
  setDefault(generatingFeatureMatrixBuffer -> 1000)

  def getGeneratingFeatureMatrixBuffer: Int = $(generatingFeatureMatrixBuffer)

  val rowPartitionSplitNumOnGeneratingFeatureMatrix: IntParam = new IntParam(this,
    "rowPartitionSplitsNumOnGeneratingFeatureMatrix",
    "Number of splits per row partition when generating the feature block matrix."
  )
  setDefault(rowPartitionSplitNumOnGeneratingFeatureMatrix -> 1)

  def getRowPartitionSplitNumOnGeneratingFeatureMatrix: Int =
    $(rowPartitionSplitNumOnGeneratingFeatureMatrix)

  val compressFeatureMatrix: BooleanParam = new BooleanParam(this,
    "compressFeatureMatrix",
    "Whether to compress the feature block matrix."
  )
  setDefault(compressFeatureMatrix -> false)

  def getCompressFeatureMatrix: Boolean = $(compressFeatureMatrix)
}