org.apache.spark.ml.param.IntParam Scala Examples

The following examples show how to use org.apache.spark.ml.param.IntParam. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example.
Example 1
Source File: MNISTBenchmark.scala    From spark-knn   with Apache License 2.0 6 votes vote down vote up
package com.github.saurfang.spark.ml.knn.examples

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.ml.classification.{KNNClassifier, NaiveKNNClassifier}
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
import org.apache.spark.ml.param.{IntParam, ParamMap}
import org.apache.spark.ml.tuning.{Benchmarker, ParamGridBuilder}
import org.apache.spark.ml.util.Identifiable
import org.apache.spark.ml.{Pipeline, Transformer}
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.{DataFrame, Dataset, SparkSession}
import org.apache.log4j

import scala.collection.mutable


object MNISTBenchmark {

  val logger = log4j.Logger.getLogger(getClass)

  def main(args: Array[String]) {
    val ns = if(args.isEmpty) (2500 to 10000 by 2500).toArray else args(0).split(',').map(_.toInt)
    val path = if(args.length >= 2) args(1) else "data/mnist/mnist.bz2"
    val numPartitions = if(args.length >= 3) args(2).toInt else 10
    val models = if(args.length >=4) args(3).split(',') else Array("tree","naive")

    val spark = SparkSession.builder().getOrCreate()
    val sc = spark.sparkContext
    import spark.implicits._

    //read in raw label and features
    val rawDataset = MLUtils.loadLibSVMFile(sc, path)
      .zipWithIndex()
      .filter(_._2 < ns.max)
      .sortBy(_._2, numPartitions = numPartitions)
      .keys
      .toDF()

    // convert "features" from mllib.linalg.Vector to ml.linalg.Vector
    val dataset =  MLUtils.convertVectorColumnsToML(rawDataset)
      .cache()
    dataset.count() //force persist

    val limiter = new Limiter()
    val knn = new KNNClassifier()
      .setTopTreeSize(numPartitions * 10)
      .setFeaturesCol("features")
      .setPredictionCol("prediction")
      .setK(1)
    val naiveKNN = new NaiveKNNClassifier()

    val pipeline = new Pipeline()
      .setStages(Array(limiter, knn))
    val naivePipeline = new Pipeline()
      .setStages(Array(limiter, naiveKNN))

    val paramGrid = new ParamGridBuilder()
      .addGrid(limiter.n, ns)
      .build()

    val bm = new Benchmarker()
      .setEvaluator(new MulticlassClassificationEvaluator)
      .setEstimatorParamMaps(paramGrid)
      .setNumTimes(3)

    val metrics = mutable.ArrayBuffer[String]()
    if(models.contains("tree")) {
      val bmModel = bm.setEstimator(pipeline).fit(dataset)
      metrics += s"knn: ${bmModel.avgTrainingRuntimes.toSeq} / ${bmModel.avgEvaluationRuntimes.toSeq}"
    }
    if(models.contains("naive")) {
      val naiveBMModel = bm.setEstimator(naivePipeline).fit(dataset)
      metrics += s"naive: ${naiveBMModel.avgTrainingRuntimes.toSeq} / ${naiveBMModel.avgEvaluationRuntimes.toSeq}"
    }
    logger.info(metrics.mkString("\n"))
  }
}

class Limiter(override val uid: String) extends Transformer {
  def this() = this(Identifiable.randomUID("limiter"))

  val n: IntParam = new IntParam(this, "n", "number of rows to limit")

  def setN(value: Int): this.type = set(n, value)

  // hack to maintain number of partitions (otherwise it collapses to 1 which is unfair for naiveKNN)
  override def transform(dataset: Dataset[_]): DataFrame = dataset.limit($(n)).repartition(dataset.rdd.partitions.length).toDF()

  override def copy(extra: ParamMap): Transformer = defaultCopy(extra)

  @DeveloperApi
  override def transformSchema(schema: StructType): StructType = schema
} 
Example 2
Source File: VParams.scala    From spark-vlbfgs   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml

import org.apache.spark.ml.param.{BooleanParam, IntParam, ParamValidators, Params}

private trait VParams extends Params{
  // column number of each block in feature block matrix
  val colsPerBlock: IntParam = new IntParam(this, "colsPerBlock",
    "column number of each block in feature block matrix.", ParamValidators.gt(0))
  setDefault(colsPerBlock -> 10000)

  def getColsPerBlock: Int = $(colsPerBlock)

  // row number of each block in feature block matrix
  val rowsPerBlock: IntParam = new IntParam(this, "rowsPerBlock",
    "row number of each block in feature block matrix.", ParamValidators.gt(0))
  setDefault(rowsPerBlock -> 10000)

  def getRowsPerBlock: Int = $(rowsPerBlock)

  // row partition number of feature block matrix
  // equals to partition number of coefficient vector
  val rowPartitions: IntParam = new IntParam(this, "rowPartitions",
    "row partition number of feature block matrix.", ParamValidators.gt(0))
  setDefault(rowPartitions -> 10)

  def getRowPartitions: Int = $(rowPartitions)

  // column partition number of feature block matrix
  val colPartitions: IntParam = new IntParam(this, "colPartitions",
    "column partition number of feature block matrix.", ParamValidators.gt(0))
  setDefault(colPartitions -> 10)

  def getColPartitions: Int = $(colPartitions)

  // Whether to eager persist distributed vector.
  val eagerPersist: BooleanParam = new BooleanParam(this, "eagerPersist",
    "Whether to eager persist distributed vector.")
  setDefault(eagerPersist -> false)

  def getEagerPersist: Boolean = $(eagerPersist)

  // The number of corrections used in the LBFGS update.
  val numCorrections: IntParam = new IntParam(this, "numCorrections",
    "The number of corrections used in the LBFGS update.")
  setDefault(numCorrections -> 10)

  def getNumCorrections: Int = $(numCorrections)

  val generatingFeatureMatrixBuffer: IntParam = new IntParam(this, "generatingFeatureMatrixBuffer",
    "Buffer size when generating features block matrix.")
  setDefault(generatingFeatureMatrixBuffer -> 1000)

  def getGeneratingFeatureMatrixBuffer: Int = $(generatingFeatureMatrixBuffer)

  val rowPartitionSplitNumOnGeneratingFeatureMatrix: IntParam = new IntParam(this,
    "rowPartitionSplitsNumOnGeneratingFeatureMatrix",
    "row partition splits number on generating features matrix."
  )
  setDefault(rowPartitionSplitNumOnGeneratingFeatureMatrix -> 1)

  def getRowPartitionSplitNumOnGeneratingFeatureMatrix: Int =
    $(rowPartitionSplitNumOnGeneratingFeatureMatrix)

  val compressFeatureMatrix: BooleanParam = new BooleanParam(this,
    "compressFeatureMatrix",
    "compress feature matrix."
  )
  setDefault(compressFeatureMatrix -> false)

  def getCompressFeatureMatrix: Boolean = $(compressFeatureMatrix)
} 
Example 3
Source File: WordLengthFilter.scala    From mleap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.mleap.feature

import ml.combust.mleap.core.feature.WordLengthFilterModel
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol}
import org.apache.spark.ml.param.{IntParam, ParamMap, ParamValidators, Params}
import org.apache.spark.ml.util._
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._
import org.apache.spark.sql.{DataFrame, Dataset}



  final def getWordLength: Int = $(wordLength)
}

class WordLengthFilter(override val uid: String) extends Transformer
  with WordLengthFilterParams
  with DefaultParamsWritable {

  val defaultLength = 3
  var model: WordLengthFilterModel = new WordLengthFilterModel(defaultLength) //Initialize with default filter length 3

  def this(model: WordLengthFilterModel) = this(uid = Identifiable.randomUID("filter_words"))
  def this() = this(new WordLengthFilterModel)

  def setInputCol(value: String): this.type = set(inputCol, value)
  def setOutputCol(value: String): this.type = set(outputCol, value)
  def setWordLength(value: Int = defaultLength): this.type = set(wordLength, value)

  override def transform(dataset: Dataset[_]): DataFrame = {
    if(defaultLength != getWordLength) model = new WordLengthFilterModel(getWordLength)
    val filterWordsUdf = udf {
      (words: Seq[String]) => model(words)
    }

    dataset.withColumn($(outputCol), filterWordsUdf(dataset($(inputCol))))
  }

  override def copy(extra: ParamMap): Transformer =  defaultCopy(extra)

  override def transformSchema(schema: StructType): StructType = {
    require(schema($(inputCol)).dataType.isInstanceOf[ArrayType],
      s"Input column must be of type ArrayType(StringType,true) but got ${schema($(inputCol)).dataType}")
    val inputFields = schema.fields

    require(!inputFields.exists(_.name == $(outputCol)),
      s"Output column ${$(outputCol)} already exists.")

    StructType(schema.fields :+ StructField($(outputCol), ArrayType(StringType, true)))

  }
}

object WordLengthFilter extends  DefaultParamsReadable[WordLengthFilter] {
  override def load(path: String): WordLengthFilter = super.load(path)
} 
Example 4
Source File: GaussianProcessParams.scala    From spark-gp   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.commons

import org.apache.spark.ml.PredictorParams
import org.apache.spark.ml.commons.kernel.{Kernel, RBFKernel}
import org.apache.spark.ml.param.shared.{HasAggregationDepth, HasMaxIter, HasSeed, HasTol}
import org.apache.spark.ml.param.{DoubleParam, IntParam, Param}

private[ml] trait GaussianProcessParams extends PredictorParams
  with HasMaxIter with HasTol with HasAggregationDepth with HasSeed {

  final val activeSetProvider = new Param[ActiveSetProvider](this, "activeSetProvider",
    "the class which provides the active set used by Projected Process Approximation")

  final val kernel = new Param[() => Kernel](this,
    "kernel", "function of no arguments which returns " +
      "the kernel of the prior Gaussian Process")

  final val datasetSizeForExpert = new IntParam(this,
    "datasetSizeForExpert", "The number of data points fed to each expert. " +
      "Time and space complexity of training quadratically grows with it.")

  final val sigma2 = new DoubleParam(this,
    "sigma2", "The variance of noise in the inputs. The value is added to the diagonal of the " +
      "kernel Matrix. Also prevents numerical issues associated with inversion " +
      "of a computationally singular matrix ")

  final val activeSetSize = new IntParam(this,
    "activeSetSize", "Number of latent functions to project the process onto. " +
      "The size of the produced model and prediction complexity " +
      "linearly depend on this value.")

  def setActiveSetProvider(value : ActiveSetProvider): this.type = set(activeSetProvider, value)
  setDefault(activeSetProvider -> RandomActiveSetProvider)

  def setDatasetSizeForExpert(value: Int): this.type = set(datasetSizeForExpert, value)
  setDefault(datasetSizeForExpert -> 100)

  def setMaxIter(value: Int): this.type = set(maxIter, value)
  setDefault(maxIter -> 100)

  def setSigma2(value: Double): this.type = set(sigma2, value)
  setDefault(sigma2 -> 1e-3)

  def setKernel(value: () => Kernel): this.type = set(kernel, value)
  setDefault(kernel -> (() => new RBFKernel()))

  def setTol(value: Double): this.type = set(tol, value)
  setDefault(tol -> 1E-6)

  def setActiveSetSize(value: Int): this.type = set(activeSetSize, value)
  setDefault(activeSetSize -> 100)

  def setSeed(value: Long): this.type = set(seed, value)
} 
Example 5
Source File: MovingAverage.scala    From uberdata   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml

import org.apache.spark.ml.param.{IntParam, ParamMap}
import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol}
import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable}
import org.apache.spark.ml.linalg.{VectorUDT, Vectors}
import org.apache.spark.sql.{DataFrame, Row}
import org.apache.spark.sql.Dataset
import org.apache.spark.sql.types._


  def setOutputCol(value: String): this.type = set(outputCol, value)

  setDefault(windowSize -> 3)

  override def transform(dataSet: Dataset[_]): DataFrame = {
    val outputSchema = transformSchema(dataSet.schema)
    val sparkContext = dataSet.sqlContext.sparkContext
    val inputType = outputSchema($(inputCol)).dataType
    val inputTypeBr = sparkContext.broadcast(inputType)
    val dataSetRdd = dataSet.rdd
    val inputColName = sparkContext.broadcast($(inputCol))
    val inputColIndex = dataSet.columns.indexOf($(inputCol))
    val inputColIndexBr = sparkContext.broadcast(inputColIndex)
    val windowSizeBr = sparkContext.broadcast($(windowSize))
    val maRdd = dataSetRdd.map { case (row: Row) =>
      val (array, rawValue) = if (inputTypeBr.value.isInstanceOf[VectorUDT]) {
        val vector =
          row.getAs[org.apache.spark.ml.linalg.Vector](inputColName.value)
        (vector.toArray, Vectors.dense(vector.toArray.drop(windowSizeBr.value - 1)))
      } else {
        val iterable = row.getAs[Iterable[Double]](inputColName.value)
        (iterable.toArray, Vectors.dense(iterable.toArray.drop(windowSizeBr.value - 1)))
      }
      val (before, after) = row.toSeq.splitAt(inputColIndexBr.value)
      Row(
        (before :+ rawValue) ++ after.tail :+ MovingAverageCalc
          .simpleMovingAverageArray(array, windowSizeBr.value): _*
      )
    }
    dataSet.sqlContext.createDataFrame(maRdd, outputSchema)
  }

  override def transformSchema(schema: StructType): StructType = {
    schema.add(StructField($(outputCol), ArrayType(DoubleType)))
  }

  override def copy(extra: ParamMap): MovingAverage[T] = defaultCopy(extra)
}

object MovingAverageCalc {
  private[ml] def simpleMovingAverageArray(values: Array[Double], period: Int): Array[Double] = {
    (for (i <- 1 to values.length)
      yield
      //TODO rollback this comment with the right size of features to make the meanaverage return
      // the features values for the first values of the calc
      if (i < period) 0d //values(i)
      else values.slice(i - period, i).sum / period).toArray.dropWhile(_ == 0d)
  }
}

object MovingAverage extends DefaultParamsReadable[MovingAverage[_]] {

  override def load(path: String): MovingAverage[_] = super.load(path)
} 
Example 6
Source File: Cleaner.scala    From CkoocNLP   with Apache License 2.0 5 votes vote down vote up
package functions.clean

import com.hankcs.hanlp.HanLP
import config.paramconf.{HasOutputCol, HasInputCol}
import functions.MySchemaUtils
import functions.clean.chinese.BCConvert
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.param.{IntParam, Param, ParamMap}
import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable}
import org.apache.spark.sql.functions.{col, udf}
import org.apache.spark.sql.types.{StringType, StructType}
import org.apache.spark.sql.{DataFrame, Dataset}



  setDefault(fanjan -> "f2j", quanban -> "q2b", minLineLen -> 1)

  override def transform(dataset: Dataset[_]): DataFrame = {
    val outputSchema = transformSchema(dataset.schema, logging = true)

    val cleanFunc = udf {line: String =>
      var cleaned = ""
      getFanJian match {
        case "f2j" => cleaned = HanLP.convertToSimplifiedChinese(line)
        case "j2f" => cleaned = HanLP.convertToTraditionalChinese(line)
        case _ => cleaned = line
      }

      getQuanBan match {
        case "q2b" => cleaned = BCConvert.qj2bj(cleaned)
        case "b2q" => cleaned = BCConvert.bj2qj(cleaned)
        case _ => cleaned = cleaned
      }

      cleaned
    }

    val metadata = outputSchema($(outputCol)).metadata
    dataset.select(col("*"), cleanFunc(col($(inputCol))).as($(outputCol), metadata)).filter{record =>
      val outputIndex = record.fieldIndex($(outputCol))
      record.getString(outputIndex).length >= getMinLineLen
    }
  }

  override def copy(extra: ParamMap): Transformer = defaultCopy(extra)

  override def transformSchema(schema: StructType): StructType = {
    val inputType = schema($(inputCol)).dataType
    require(inputType.typeName.equals(StringType.typeName),
      s"Input type must be StringType but got $inputType.")
    MySchemaUtils.appendColumn(schema, $(outputCol), inputType, schema($(inputCol)).nullable)
  }
}


object Cleaner extends DefaultParamsReadable[Cleaner] {
  override def load(path: String): Cleaner = super.load(path)
} 
Example 7
Source File: HasEmbeddingsProperties.scala    From spark-nlp   with Apache License 2.0 5 votes vote down vote up
package com.johnsnowlabs.nlp.embeddings

import com.johnsnowlabs.nlp.AnnotatorType
import org.apache.spark.ml.param.{BooleanParam, IntParam, Params}
import org.apache.spark.sql.Column
import org.apache.spark.sql.types.MetadataBuilder

trait HasEmbeddingsProperties extends Params {

  val dimension = new IntParam(this, "dimension", "Number of embedding dimensions")

  def setDimension(value: Int): this.type = set(this.dimension, value)
  def getDimension: Int = $(dimension)

  protected def wrapEmbeddingsMetadata(col: Column, embeddingsDim: Int, embeddingsRef: Option[String] = None): Column = {
    val metadataBuilder: MetadataBuilder = new MetadataBuilder()
    metadataBuilder.putString("annotatorType", AnnotatorType.WORD_EMBEDDINGS)
    metadataBuilder.putLong("dimension", embeddingsDim.toLong)
    embeddingsRef.foreach(ref => metadataBuilder.putString("ref", ref))
    col.as(col.toString, metadataBuilder.build)
  }

  protected def wrapSentenceEmbeddingsMetadata(col: Column, embeddingsDim: Int, embeddingsRef: Option[String] = None): Column = {
    val metadataBuilder: MetadataBuilder = new MetadataBuilder()
    metadataBuilder.putString("annotatorType", AnnotatorType.SENTENCE_EMBEDDINGS)
    metadataBuilder.putLong("dimension", embeddingsDim.toLong)
    embeddingsRef.foreach(ref => metadataBuilder.putString("ref", ref))
    col.as(col.toString, metadataBuilder.build)
  }

} 
Example 8
Source File: S2CellTransformer.scala    From spark-ext   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.feature

import com.google.common.geometry.{S2LatLng, S2CellId}
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.attribute.NominalAttribute
import org.apache.spark.ml.param.{IntParam, Param, ParamMap, ParamValidators}
import org.apache.spark.ml.util.Identifiable
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.{DoubleType, StructType}


class S2CellTransformer(override val uid: String) extends Transformer {

  def this() = this(Identifiable.randomUID("S2CellTransformer"))

  // Input/Output column names

  val latCol: Param[String] = new Param[String](this, "latCol", "latitude column")

  val lonCol: Param[String] = new Param[String](this, "lonCol", "longitude column")

  val cellCol: Param[String] = new Param[String](this, "cellCol", "S2 Cell Id column")

  val level: Param[Int] = new IntParam(this, "level", "S2 Level [0, 30]",
    (i: Int) => ParamValidators.gtEq(0)(i) && ParamValidators.ltEq(30)(i))

  // Default parameters

  setDefault(
    latCol  -> "lat",
    lonCol  -> "lon",
    cellCol -> "cell",
    level   -> 10
  )

  def getLatCol: String = $(latCol)

  def getLonCol: String = $(lonCol)

  def getCellCol: String = $(cellCol)

  def getLevel: Int = $(level)

  def setLatCol(value: String): this.type = set(latCol, value)

  def setLonCol(value: String): this.type = set(lonCol, value)

  def setCellCol(value: String): this.type = set(cellCol, value)

  def setLevel(value: Int): this.type = set(level, value)

  override def transform(dataset: DataFrame): DataFrame = {
    val outputSchema = transformSchema(dataset.schema)
    val currentLevel = $(level)
    val t = udf { (lat: Double, lon: Double) =>
      val cellId = S2CellId.fromLatLng(S2LatLng.fromDegrees(lat, lon))
      cellId.parent(currentLevel).toToken
    }
    val metadata = outputSchema($(cellCol)).metadata
    dataset.select(col("*"), t(col($(latCol)), col($(lonCol))).as($(cellCol), metadata))
  }

  override def transformSchema(schema: StructType): StructType = {
    val latColumnName = $(latCol)
    val latDataType = schema(latColumnName).dataType
    require(latDataType == DoubleType,
      s"The latitude column $latColumnName must be Double type, " +
        s"but got $latDataType.")

    val lonColumnName = $(lonCol)
    val lonDataType = schema(lonColumnName).dataType
    require(lonDataType == DoubleType,
      s"The longitude column $lonColumnName must be Double type, " +
        s"but got $lonDataType.")

    val inputFields = schema.fields
    val outputColName = $(cellCol)
    require(inputFields.forall(_.name != outputColName),
      s"Output column $outputColName already exists.")

    val attr = NominalAttribute.defaultAttr.withName($(cellCol))
    val outputFields = inputFields :+ attr.toStructField()
    StructType(outputFields)
  }

  override def copy(extra: ParamMap): S2CellTransformer = defaultCopy(extra)
} 
Example 9
Source File: HashingTF.scala    From iolap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.feature

import org.apache.spark.annotation.Experimental
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.attribute.AttributeGroup
import org.apache.spark.ml.param.{IntParam, ParamMap, ParamValidators}
import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol}
import org.apache.spark.ml.util.{Identifiable, SchemaUtils}
import org.apache.spark.mllib.feature
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions.{col, udf}
import org.apache.spark.sql.types.{ArrayType, StructType}


  def setNumFeatures(value: Int): this.type = set(numFeatures, value)

  override def transform(dataset: DataFrame): DataFrame = {
    val outputSchema = transformSchema(dataset.schema)
    val hashingTF = new feature.HashingTF($(numFeatures))
    val t = udf { terms: Seq[_] => hashingTF.transform(terms) }
    val metadata = outputSchema($(outputCol)).metadata
    dataset.select(col("*"), t(col($(inputCol))).as($(outputCol), metadata))
  }

  override def transformSchema(schema: StructType): StructType = {
    val inputType = schema($(inputCol)).dataType
    require(inputType.isInstanceOf[ArrayType],
      s"The input column must be ArrayType, but got $inputType.")
    val attrGroup = new AttributeGroup($(outputCol), $(numFeatures))
    SchemaUtils.appendColumn(schema, attrGroup.toStructField())
  }

  override def copy(extra: ParamMap): HashingTF = defaultCopy(extra)
} 
Example 10
Source File: PercentileCalibrator.scala    From TransmogrifAI   with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
package com.salesforce.op.stages.impl.feature

import com.salesforce.op.UID
import com.salesforce.op.features.types._
import com.salesforce.op.stages.base.unary.{UnaryEstimator, UnaryModel}
import com.salesforce.op.utils.spark.RichMetadata._
import org.apache.spark.ml.feature.QuantileDiscretizer
import org.apache.spark.ml.param.IntParam
import org.apache.spark.sql.Dataset
import org.apache.spark.sql.types.MetadataBuilder

import scala.collection.Searching._



class PercentileCalibrator(uid: String = UID[PercentileCalibrator])
  extends UnaryEstimator[RealNN, RealNN](operationName = "percentCalibrator", uid = uid) {

  final val expectedNumBuckets = new IntParam(
    this, "expectedNumBuckets", "number of buckets to divide input data into"
  )
  setDefault(expectedNumBuckets, 100)

  def setExpectedNumBuckets(buckets: Int): this.type = set(expectedNumBuckets, buckets)

  def fitFn(dataset: Dataset[Option[Double]]): UnaryModel[RealNN, RealNN] = {

    val estimator: QuantileDiscretizer = new QuantileDiscretizer()
      .setNumBuckets($(expectedNumBuckets))
      .setRelativeError(0)
      .setInputCol(dataset.columns(0))
      .setOutputCol(dataset.columns(0) + "-out")

    val bucketizerModel = estimator.fit(dataset)

    val model = new PercentileCalibratorModel(
      splits = bucketizerModel.getSplits,
      actualNumBuckets = bucketizerModel.getSplits.length,
      expectedNumBuckets = $(expectedNumBuckets),
      operationName = operationName,
      uid = uid
    )

    val scaledBuckets = bucketizerModel.getSplits.map(v => model.transformFn(v.toRealNN).v.get)

    val meta = new MetadataBuilder()
      .putStringArray(PercentileCalibrator.OrigSplitsKey, bucketizerModel.getSplits.map(_.toString))
      .putStringArray(PercentileCalibrator.ScaledSplitsKey, scaledBuckets.map(_.toString)).build()
    setMetadata(meta.toSummaryMetadata())

    model
  }

}


final class PercentileCalibratorModel private[op]
(
  val splits: Array[Double],
  val actualNumBuckets: Int,
  val expectedNumBuckets: Int,
  operationName: String,
  uid: String
) extends UnaryModel[RealNN, RealNN](operationName = operationName, uid = uid) {

  def transformFn: RealNN => RealNN = (inScalar: RealNN) => {
    val calibrated = splits.search(inScalar.v.get) match {
      case Found(idx) => idx
      case InsertionPoint(idx) => idx
    }
    scale(actualNumBuckets, expectedNumBuckets, calibrated).toRealNN
  }

  private def scale(actualNumBuckets: Int, expectedBuckets: Int, calibrated: Int): Long = {
    if (actualNumBuckets >= expectedBuckets) {
      calibrated - 1 // make it start at zero
    } else {
      val (oldMin, newMin) = (0, 0)
      val (oldMax, newMax) = (Math.max(actualNumBuckets - 2, 0), Math.max(expectedBuckets - 1, 0))
      val oldRange = oldMax - oldMin
      oldRange match {
        case 0 => newMin
        case _ =>
          val newRange = (newMax - newMin).toDouble
          val newValue = (((calibrated - oldMin) * newRange) / oldRange) + newMin
          Math.min(newValue.round, newMax)
      }
    }
  }

}


case object PercentileCalibrator {
  val OrigSplitsKey: String = "origSplits"
  val ScaledSplitsKey: String = "scaledSplits"
} 
Example 11
Source File: HashingTF.scala    From spark1.52   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.feature

import org.apache.spark.annotation.Experimental
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.attribute.AttributeGroup
import org.apache.spark.ml.param.{IntParam, ParamMap, ParamValidators}
import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol}
import org.apache.spark.ml.util.{Identifiable, SchemaUtils}
import org.apache.spark.mllib.feature
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions.{col, udf}
import org.apache.spark.sql.types.{ArrayType, StructType}


  def setNumFeatures(value: Int): this.type = set(numFeatures, value)

  override def transform(dataset: DataFrame): DataFrame = {
    val outputSchema = transformSchema(dataset.schema)
    val hashingTF = new feature.HashingTF($(numFeatures))
    val t = udf { terms: Seq[_] => hashingTF.transform(terms) }
    val metadata = outputSchema($(outputCol)).metadata
    dataset.select(col("*"), t(col($(inputCol))).as($(outputCol), metadata))
  }

  override def transformSchema(schema: StructType): StructType = {
    val inputType = schema($(inputCol)).dataType
    require(inputType.isInstanceOf[ArrayType],
      s"The input column must be ArrayType, but got $inputType.")
    val attrGroup = new AttributeGroup($(outputCol), $(numFeatures))
    SchemaUtils.appendColumn(schema, attrGroup.toStructField())
  }

  override def copy(extra: ParamMap): HashingTF = defaultCopy(extra)
} 
Example 12
Source File: NGramExtractor.scala    From pravda-ml   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.odkl.texts

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol}
import org.apache.spark.ml.param.{IntParam, ParamMap, ParamPair, ParamValidators, Params}
import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable}
import org.apache.spark.sql.{DataFrame, Dataset}
import org.apache.spark.sql.functions.udf
import org.apache.spark.sql.types.{ArrayType, StringType, StructType}


  def setOutputCol(value: String): this.type = set(outputCol, value)

  setDefault(new ParamPair[Int](upperN, 2), new ParamPair[Int](lowerN, 1))

  override def transform(dataset: Dataset[_]): DataFrame = {
    val lowerBound = $(lowerN)
    val upperBound = $(upperN)
    val nGramUDF = udf[Seq[String], Seq[String]](NGramUtils.nGramFun(_,lowerBound,upperBound))
    dataset.withColumn($(outputCol), nGramUDF(dataset.col($(inputCol))))
  }


  override def copy(extra: ParamMap): Transformer = defaultCopy(extra)

  @DeveloperApi
  override def transformSchema(schema: StructType): StructType = {
    if ($(inputCol) != $(outputCol)) {
      schema.add($(outputCol), new ArrayType(StringType, true))
    } else {
      schema
    }
  }
}
object NGramExtractor extends DefaultParamsReadable[NGramExtractor] {
  override def load(path: String): NGramExtractor = super.load(path)
} 
Example 13
Source File: HasParallelism.scala    From Spark-2.3.1   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.param.shared

import scala.concurrent.ExecutionContext

import org.apache.spark.ml.param.{IntParam, Params, ParamValidators}
import org.apache.spark.util.ThreadUtils


  private[ml] def getExecutionContext: ExecutionContext = {
    getParallelism match {
      case 1 =>
        ThreadUtils.sameThread
      case n =>
        ExecutionContext.fromExecutorService(ThreadUtils
          .newDaemonCachedThreadPool(s"${this.getClass.getSimpleName}-thread-pool", n))
    }
  }
} 
Example 14
Source File: SageMakerAlgorithmParams.scala    From sagemaker-spark   with Apache License 2.0 5 votes vote down vote up
package com.amazonaws.services.sagemaker.sparksdk.algorithms

import org.apache.spark.ml.param.{IntParam, Param, Params, ParamValidators}


  val featureDim : IntParam = new IntParam(this, "feature_dim",
    "The dimension of the input vectors. Must be > 0.", ParamValidators.gtEq(1))
  def getFeatureDim: Int = $(featureDim)

  protected def autoOrAboveParamValidator(lowerBound: Double,
                                          inclusive: Boolean): String => Boolean = {
    (value: String) =>
      try {
        value == "auto" || {
          if (inclusive) {
            value.toDouble >= lowerBound
          }
          else {
            value.toDouble > lowerBound
          }
        }
      } catch {
        case e: NumberFormatException => false
      }
  }

  protected def inArrayOrAboveParamValidator(validValues: Array[String],
                                             lowerBound: Double): String => Boolean = {
    (value: String) =>
      try {
        validValues.contains(value) || value.toDouble > lowerBound
      } catch {
        case e: NumberFormatException => false
      }
  }

  protected def parseTrueAndFalse(param: Param[String]): Boolean = {
    $(param) match {
      case "True" => true
      case "False" => false
      case _ => throw new IllegalArgumentException("Param is neither 'True' nor 'False'")
    }
  }
} 
Example 15
Source File: HashingTF.scala    From BigDatalog   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.feature

import org.apache.spark.annotation.{Since, Experimental}
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.attribute.AttributeGroup
import org.apache.spark.ml.param.{IntParam, ParamMap, ParamValidators}
import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol}
import org.apache.spark.ml.util._
import org.apache.spark.mllib.feature
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions.{col, udf}
import org.apache.spark.sql.types.{ArrayType, StructType}


  def setNumFeatures(value: Int): this.type = set(numFeatures, value)

  override def transform(dataset: DataFrame): DataFrame = {
    val outputSchema = transformSchema(dataset.schema)
    val hashingTF = new feature.HashingTF($(numFeatures))
    val t = udf { terms: Seq[_] => hashingTF.transform(terms) }
    val metadata = outputSchema($(outputCol)).metadata
    dataset.select(col("*"), t(col($(inputCol))).as($(outputCol), metadata))
  }

  override def transformSchema(schema: StructType): StructType = {
    val inputType = schema($(inputCol)).dataType
    require(inputType.isInstanceOf[ArrayType],
      s"The input column must be ArrayType, but got $inputType.")
    val attrGroup = new AttributeGroup($(outputCol), $(numFeatures))
    SchemaUtils.appendColumn(schema, attrGroup.toStructField())
  }

  override def copy(extra: ParamMap): HashingTF = defaultCopy(extra)
}

@Since("1.6.0")
object HashingTF extends DefaultParamsReadable[HashingTF] {

  @Since("1.6.0")
  override def load(path: String): HashingTF = super.load(path)
} 
Example 16
Source File: RankingMetricFormatter.scala    From albedo   with MIT License 5 votes vote down vote up
package ws.vinta.albedo.transformers

import org.apache.spark.ml.Transformer
import org.apache.spark.ml.param.{IntParam, Param, ParamMap}
import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable}
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._
import org.apache.spark.sql.{DataFrame, Dataset}
import ws.vinta.albedo.closures.UDFs._
import ws.vinta.albedo.evaluators.RankingEvaluator._

class RankingMetricFormatter(override val uid: String, val sourceType: String)
  extends Transformer with DefaultParamsWritable {

  def this(sourceType: String) = {
    this(Identifiable.randomUID("rankingMetricFormatter"), sourceType)
  }

  val userCol = new Param[String](this, "userCol", "User column name")

  def getUserCol: String = $(userCol)

  def setUserCol(value: String): this.type = set(userCol, value)
  setDefault(userCol -> "user")

  val itemCol = new Param[String](this, "itemCol", "Item column name")

  def getItemCol: String = $(itemCol)

  def setItemCol(value: String): this.type = set(itemCol, value)
  setDefault(itemCol -> "item")

  val predictionCol = new Param[String](this, "predictionCol", "Prediction column name")

  def getPredictionCol: String = $(predictionCol)

  def setPredictionCol(value: String): this.type = set(predictionCol, value)
  setDefault(predictionCol -> "prediction")

  val topK = new IntParam(this, "topK", "Recommend top-k items for every user")

  def getTopK: Int = $(topK)

  def setTopK(value: Int): this.type = set(topK, value)
  setDefault(topK -> 15)

  override def transformSchema(schema: StructType): StructType = {
    Map($(userCol) -> IntegerType, $(itemCol) -> IntegerType)
      .foreach{
        case(columnName: String, expectedDataType: DataType) => {
          val actualDataType = schema(columnName).dataType
          require(actualDataType.equals(expectedDataType), s"Column $columnName must be of type $expectedDataType but was actually $actualDataType.")
        }
      }

    schema
  }

  override def transform(dataset: Dataset[_]): DataFrame = {
    transformSchema(dataset.schema)

    sourceType match {
      case "als" =>
        dataset.transform(intoUserPredictedItems(col($(userCol)), col($(itemCol)), col($(predictionCol)).desc, $(topK)))
      case "lr" =>
        dataset.transform(intoUserPredictedItems(col($(userCol)), col($(itemCol)), toArrayUDF(col($(predictionCol))).getItem(1).desc, $(topK)))
    }
  }

  override def copy(extra: ParamMap): RankingMetricFormatter = {
    val copied = new RankingMetricFormatter(uid, sourceType)
    copyValues(copied, extra)
  }
}

object RankingMetricFormatter extends DefaultParamsReadable[RankingMetricFormatter]