org.apache.spark.sql.functions.udf Scala Examples

The following examples show how to use org.apache.spark.sql.functions.udf. They are drawn from a range of open-source projects; the source project and file are noted above each example.
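Before the project excerpts, here is a minimal, self-contained sketch of the pattern most of the examples below follow: wrap an ordinary Scala function with udf, apply the result to a column, and optionally register the same function by name for use from SQL. All names in this sketch (the DataFrame, columns, and the registered name) are illustrative and not taken from any of the projects.

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.{col, udf}

object UdfQuickStart {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("udf-quickstart").getOrCreate()
    import spark.implicits._

    val df = Seq((1, "alice"), (2, "bob")).toDF("id", "name")

    // Wrap a plain Scala function as a UserDefinedFunction
    val upperUDF = udf { s: String => s.toUpperCase }

    // Apply it to a column like any built-in function
    df.withColumn("name_upper", upperUDF(col("name"))).show()

    // Register the same function by name so it can be called from SQL
    spark.udf.register("upperName", (s: String) => s.toUpperCase)
    df.createOrReplaceTempView("people")
    spark.sql("SELECT id, upperName(name) AS name_upper FROM people").show()

    spark.stop()
  }
}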
Example 1
Source File: SparkPFASuiteBase.scala    From aardpfark   with Apache License 2.0
package com.ibm.aardpfark.pfa

import com.holdenkarau.spark.testing.DataFrameSuiteBase
import org.apache.spark.SparkConf
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.linalg.Vector
import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
import org.apache.spark.sql.functions.udf
import org.apache.spark.sql.{DataFrame, Row, SparkSession}
import org.scalactic.Equality
import org.scalatest.FunSuite

abstract class SparkPFASuiteBase extends FunSuite with DataFrameSuiteBase with PFATestUtils {

  val sparkTransformer: Transformer
  val input: Array[String]
  val expectedOutput: Array[String]

  val sparkConf = new SparkConf().
    setMaster("local[*]").
    setAppName("test").
    set("spark.ui.enabled", "false").
    set("spark.app.id", appID).
    set("spark.driver.host", "localhost")
  override lazy val spark = SparkSession.builder().config(sparkConf).getOrCreate()
  override val reuseContextIfPossible = true

  // Converts column containing a vector to an array
  def withColumnAsArray(df: DataFrame, colName: String) = {
    val vecToArray = udf { v: Vector => v.toArray }
    df.withColumn(colName, vecToArray(df(colName)))
  }

  def withColumnAsArray(df: DataFrame, first: String, others: String*) = {
    val vecToArray = udf { v: Vector => v.toArray }
    var result = df.withColumn(first, vecToArray(df(first)))
    others.foreach(c => result = result.withColumn(c, vecToArray(df(c))))
    result
  }

  // Converts column containing a vector to a sparse vector represented as a map
  def getColumnAsSparseVectorMap(df: DataFrame, colName: String) = {
    val vecToMap = udf { v: Vector => v.toSparse.indices.map(i => (i.toString, v(i))).toMap }
    df.withColumn(colName, vecToMap(df(colName)))
  }

}

abstract class Result

object ApproxEquality extends ApproxEquality

trait ApproxEquality {

  import org.scalactic.Tolerance._
  import org.scalactic.TripleEquals._

  implicit val seqApproxEq: Equality[Seq[Double]] = new Equality[Seq[Double]] {
    override def areEqual(a: Seq[Double], b: Any): Boolean = {
      b match {
        case d: Seq[Double] =>
          a.zip(d).forall { case (l, r) => l === r +- 0.001 }
        case _ =>
          false
      }
    }
  }

  implicit val vectorApproxEq: Equality[Vector] = new Equality[Vector] {
    override def areEqual(a: Vector, b: Any): Boolean = {
      b match {
        case v: Vector =>
          a.toArray.zip(v.toArray).forall { case (l, r) => l === r +- 0.001 }
        case _ =>
          false
      }
    }
  }
} 
Example 2
Source File: Entropy.scala    From deequ   with Apache License 2.0
package com.amazon.deequ.analyzers

import com.amazon.deequ.analyzers.Analyzers.COUNT_COL
import org.apache.spark.sql.Column
import org.apache.spark.sql.functions.{col, sum, udf}


case class Entropy(column: String, where: Option[String] = None)
  extends ScanShareableFrequencyBasedAnalyzer("Entropy", column :: Nil)
  with FilterableAnalyzer {

  override def aggregationFunctions(numRows: Long): Seq[Column] = {
    val summands = udf { (count: Double) =>
      if (count == 0.0) {
        0.0
      } else {
        -(count / numRows) * math.log(count / numRows)
      }
    }

    sum(summands(col(COUNT_COL))) :: Nil
  }

  override def filterCondition: Option[String] = where
} 
Example 3
Source File: SparkXGBoostClassifierSuite.scala    From sparkxgboost   with Apache License 2.0
package rotationsymmetry.sxgboost

import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
import org.apache.spark.ml.feature.VectorIndexer
import org.apache.spark.mllib.linalg.{Vectors, Vector}
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.sql.functions.udf
import org.scalatest.FunSuite
import rotationsymmetry.sxgboost.loss.LogisticLoss
import rotationsymmetry.sxgboost.utils.MLlibTestSparkContext

class SparkXGBoostClassifierSuite extends FunSuite with TestData with MLlibTestSparkContext {

  test("test with simple data") {
    val rawdata = Seq(
      LabeledPoint(0, Vectors.dense(0.0, 0.0)),
      LabeledPoint(0, Vectors.dense(0.0, 0.0)),
      LabeledPoint(1, Vectors.dense(0.0, 0.0)),

      LabeledPoint(1, Vectors.dense(1.0, 0.0)),
      LabeledPoint(1, Vectors.dense(1.0, 0.0)),
      LabeledPoint(0, Vectors.dense(1.0, 0.0)),

      LabeledPoint(1, Vectors.dense(0.0, 1.0)),
      LabeledPoint(1, Vectors.dense(0.0, 1.0)),
      LabeledPoint(0, Vectors.dense(0.0, 1.0)),

      LabeledPoint(0, Vectors.dense(1.0, 1.0)),
      LabeledPoint(0, Vectors.dense(1.0, 1.0)),
      LabeledPoint(1, Vectors.dense(1.0, 1.0))
    )

    val data = sqlContext.createDataFrame(sc.parallelize(rawdata, 2))

    val truthUDF = udf { feature: Vector =>
      if (feature(0) == feature(1))
        0.0
      else
        1.0
    }

    val dataWithTruth = data.withColumn("truth", truthUDF(data("features")))

    val featureIndexer = new VectorIndexer()
      .setInputCol("features")
      .setOutputCol("indexedFeatures")
      .setMaxCategories(2)
      .fit(data)

    val sparkXGBoostClassifier = new SparkXGBoostClassifier(new LogisticLoss)
      .setFeaturesCol("indexedFeatures")
      .setMaxDepth(2)
      .setNumTrees(1)
    val sparkXGBoostPipeline = new Pipeline()
      .setStages(Array(featureIndexer, sparkXGBoostClassifier))
    val sXGBoostModel = sparkXGBoostPipeline.fit(data)

    val evaluator = new MulticlassClassificationEvaluator()
      .setLabelCol("truth")
      .setPredictionCol("prediction")
      .setMetricName("precision")

    val precision = evaluator.evaluate(sXGBoostModel.transform(dataWithTruth))

    assert(precision === 1.0)
  }
} 
Example 4
Source File: ExtractTokensUDF.scala    From jgit-spark-connector   with Apache License 2.0
package tech.sourced.engine.udf

import gopkg.in.bblfsh.sdk.v1.uast.generated.Node
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.expressions.UserDefinedFunction
import org.apache.spark.sql.functions.udf


case object ExtractTokensUDF extends CustomUDF {

  override val name = "extractTokens"

  override def apply(session: SparkSession): UserDefinedFunction =
    udf[Seq[String], Seq[Array[Byte]]](extractTokens)

  private def extractTokens(nodes: Seq[Array[Byte]]): Seq[String] = {
    timer.time({
      if (nodes == null) {
        Seq()
      } else {
        nodes.map(Node.parseFrom).map(_.token)
      }
    })
  }

} 
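Examples 4, 5, and 6 use the explicitly typed overload udf[ReturnType, Arg1, ...](f), which pins down the Catalyst input and output types at the call site when the underlying function is defined elsewhere. The stand-alone sketch below shows the same overload with a made-up helper; the function name, types, and column are illustrative only and are not part of the jgit-spark-connector code.

import org.apache.spark.sql.expressions.UserDefinedFunction
import org.apache.spark.sql.functions.udf

object TokenCountUDF {
  // Ordinary method defined separately from the udf call
  private def countTokens(tokens: Seq[String]): Int =
    if (tokens == null) 0 else tokens.length

  // The return type and argument type are spelled out, as in Examples 4 to 6
  val asUdf: UserDefinedFunction = udf[Int, Seq[String]](countTokens)
}

The resulting value is applied like any other UserDefinedFunction, e.g. df.withColumn("tokenCount", TokenCountUDF.asUdf(col("tokens"))).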
Example 5
Source File: QueryXPathUDF.scala    From jgit-spark-connector   with Apache License 2.0
package tech.sourced.engine.udf

import gopkg.in.bblfsh.sdk.v1.uast.generated.Node
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.expressions.UserDefinedFunction
import org.apache.spark.sql.functions.udf
import tech.sourced.engine.util.Bblfsh



case object QueryXPathUDF extends CustomUDF {

  override val name = "queryXPath"

  override def apply(session: SparkSession): UserDefinedFunction = {
    val configB = session.sparkContext.broadcast(Bblfsh.getConfig(session))
    udf[Seq[Array[Byte]], Seq[Array[Byte]], String]((nodes, query) =>
      queryXPath(nodes, query, configB.value))
  }

  private def queryXPath(nodes: Seq[Array[Byte]],
                         query: String,
                         config: Bblfsh.Config): Seq[Array[Byte]] = {
    timer.time({
      if (nodes == null) {
        return null
      }

      nodes.map(Node.parseFrom).flatMap(n => {
        val result = Bblfsh.filter(n, query, config)
        if (result == null) {
          None
        } else {
          result.toIterator
        }
      }).map(_.toByteArray)
    })
  }

} 
Example 6
Source File: ExtractUASTsUDF.scala    From jgit-spark-connector   with Apache License 2.0
package tech.sourced.engine.udf

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.expressions.UserDefinedFunction
import org.apache.spark.sql.functions.udf
import tech.sourced.engine.util.Bblfsh

trait ExtractUASTsUDF {

  def extractUASTs(path: String,
                   content: Array[Byte],
                   lang: String = null,
                   config: Bblfsh.Config): Seq[Array[Byte]] = {
    if (content == null || content.isEmpty) {
      Seq()
    } else {
      Bblfsh.extractUAST(path, content, lang, config)
    }
  }

}


case object ExtractUASTsUDF extends CustomUDF with ExtractUASTsUDF {

  override val name = "extractUASTs"

  override def apply(session: SparkSession): UserDefinedFunction = {
    val configB = session.sparkContext.broadcast(Bblfsh.getConfig(session))
    udf[Seq[Array[Byte]], String, Array[Byte], String]((path, content, lang) =>
      extractUASTs(path, content, lang, configB.value))
  }

} 
Example 7
Source File: ClassifyLanguagesUDF.scala    From jgit-spark-connector   with Apache License 2.0
package tech.sourced.engine.udf

import org.apache.spark.internal.Logging
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.expressions.UserDefinedFunction
import org.apache.spark.sql.functions.udf
import tech.sourced.enry.Enry


  def getLanguage(isBinary: Boolean, path: String, content: Array[Byte]): Option[String] = {
    timer.time({
      if (isBinary) {
        None
      } else {
        val lang = try {
          Enry.getLanguage(path, content)
        } catch {
          case e@(_: RuntimeException | _: Exception) =>
            log.error(s"get language for file '$path' failed", e)
            null
        }
        if (null == lang || lang.isEmpty) None else Some(lang)
      }
    })
  }

} 
Example 8
Source File: HashingTF.scala    From spark1.52   with Apache License 2.0
package org.apache.spark.ml.feature

import org.apache.spark.annotation.Experimental
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.attribute.AttributeGroup
import org.apache.spark.ml.param.{IntParam, ParamMap, ParamValidators}
import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol}
import org.apache.spark.ml.util.{Identifiable, SchemaUtils}
import org.apache.spark.mllib.feature
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions.{col, udf}
import org.apache.spark.sql.types.{ArrayType, StructType}


  def setNumFeatures(value: Int): this.type = set(numFeatures, value)

  override def transform(dataset: DataFrame): DataFrame = {
    val outputSchema = transformSchema(dataset.schema)
    val hashingTF = new feature.HashingTF($(numFeatures))
    val t = udf { terms: Seq[_] => hashingTF.transform(terms) }
    val metadata = outputSchema($(outputCol)).metadata
    dataset.select(col("*"), t(col($(inputCol))).as($(outputCol), metadata))
  }

  override def transformSchema(schema: StructType): StructType = {
    val inputType = schema($(inputCol)).dataType
    require(inputType.isInstanceOf[ArrayType],
      s"The input column must be ArrayType, but got $inputType.")
    val attrGroup = new AttributeGroup($(outputCol), $(numFeatures))
    SchemaUtils.appendColumn(schema, attrGroup.toStructField())
  }

  override def copy(extra: ParamMap): HashingTF = defaultCopy(extra)
} 
Example 9
Source File: LanguageAwareAnalyzer.scala    From pravda-ml   with Apache License 2.0
package org.apache.spark.ml.odkl.texts

import org.apache.lucene.analysis.util.StopwordAnalyzerBase
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.param.shared.HasOutputCol
import org.apache.spark.ml.param.{Param, ParamMap, Params}
import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable, SchemaUtils}
import org.apache.spark.sql.{DataFrame, Dataset}
import org.apache.spark.sql.functions.udf
import org.apache.spark.sql.types.{ArrayType, StringType, StructType}


  def setOutputCol(value: String): this.type = set(outputCol, value)

  override def copy(extra: ParamMap): Transformer = {
    defaultCopy(extra)
  }

  def this() = this(Identifiable.randomUID("languageAnalyzer"))

  override def transform(dataset: Dataset[_]): DataFrame = {
    dataset.withColumn($(outputCol), stemmTextUDF(dataset.col($(inputColLang)), dataset.col($(inputColText)))).toDF
  }

  @DeveloperApi
  override def transformSchema(schema: StructType): StructType = {
    if ($(inputColText) equals $(outputCol)) {
      val schemaWithoutInput = new StructType(schema.fields.filterNot(_.name equals $(inputColText)))
      SchemaUtils.appendColumn(schemaWithoutInput, $(outputCol), ArrayType(StringType, true))
    } else {
      SchemaUtils.appendColumn(schema, $(outputCol), ArrayType(StringType, true))
    }
  }

}

object LanguageAwareAnalyzer extends DefaultParamsReadable[LanguageAwareAnalyzer] {
  override def load(path: String): LanguageAwareAnalyzer = super.load(path)
} 
Example 10
Source File: LanguageDetectorTransformer.scala    From pravda-ml   with Apache License 2.0
package org.apache.spark.ml.odkl.texts

import com.google.common.base.Optional
import com.optimaize.langdetect.LanguageDetector
import com.optimaize.langdetect.i18n.LdLocale
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol}
import org.apache.spark.ml.param.{DoubleParam, Param, ParamMap}
import org.apache.spark.ml.util.{Identifiable, SchemaUtils}
import org.apache.spark.sql.{DataFrame, Dataset}
import org.apache.spark.sql.functions.udf
import org.apache.spark.sql.types.{StringType, StructType}

import scala.collection.Map


  def setOutputCol(value: String): this.type = set(outputCol, value)

  def this() = this(Identifiable.randomUID("languageDetector"))

  override def transform(dataset: Dataset[_]): DataFrame = {
    dataset.withColumn($(outputCol), languageDetection(dataset.col($(inputCol))))
  }

  override def copy(extra: ParamMap): Transformer = {
    defaultCopy(extra)
  }

  @DeveloperApi
  override def transformSchema(schema: StructType): StructType = {
    SchemaUtils.appendColumn(schema, $(outputCol), StringType)
  }

  @transient object languageDetectorWrapped extends Serializable {
    val languageDetector: LanguageDetector =
      LanguageDetectorUtils.buildLanguageDetector(
        LanguageDetectorUtils.readListLangsBuiltIn(),
        $(minimalConfidence),
        $(languagePriors).toMap)
  }

} 
Example 11
Source File: NGramExtractor.scala    From pravda-ml   with Apache License 2.0
package org.apache.spark.ml.odkl.texts

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol}
import org.apache.spark.ml.param.{IntParam, ParamMap, ParamPair, ParamValidators, Params}
import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable}
import org.apache.spark.sql.{DataFrame, Dataset}
import org.apache.spark.sql.functions.udf
import org.apache.spark.sql.types.{ArrayType, StringType, StructType}


  def setOutputCol(value: String): this.type = set(outputCol, value)

  setDefault(new ParamPair[Int](upperN, 2), new ParamPair[Int](lowerN, 1))

  override def transform(dataset: Dataset[_]): DataFrame = {
    val lowerBound = $(lowerN)
    val upperBound = $(upperN)
    val nGramUDF = udf[Seq[String], Seq[String]](NGramUtils.nGramFun(_,lowerBound,upperBound))
    dataset.withColumn($(outputCol), nGramUDF(dataset.col($(inputCol))))
  }


  override def copy(extra: ParamMap): Transformer = defaultCopy(extra)

  @DeveloperApi
  override def transformSchema(schema: StructType): StructType = {
    if ($(inputCol) != $(outputCol)) {
      schema.add($(outputCol), new ArrayType(StringType, true))
    } else {
      schema
    }
  }
}
object NGramExtractor extends DefaultParamsReadable[NGramExtractor] {
  override def load(path: String): NGramExtractor = super.load(path)
} 
Example 12
Source File: RandomProjectionsHasher.scala    From pravda-ml   with Apache License 2.0
package org.apache.spark.ml.odkl.texts

import java.util.Random

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.attribute.AttributeGroup
import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol, HasSeed}
import org.apache.spark.ml.param._
import org.apache.spark.ml.util.{Identifiable, SchemaUtils}
import org.apache.spark.ml.linalg.{Matrices, SparseMatrix, Vector}
import org.apache.spark.sql.{DataFrame, Dataset}
import org.apache.spark.sql.functions.udf
import org.apache.spark.sql.types.{LongType, StructType}


  def setDim(value: Long): this.type = set(dim, value)


  def this() = this(Identifiable.randomUID("randomProjectionsHasher"))

  override def transform(dataset: Dataset[_]): DataFrame = {
    val dimensity = {
      if (!isSet(dim)) { // If dim is not set, read the dimensionality from the AttributeGroup metadata (as produced by OdklCountVectorizer)
        val vectorsIndex = dataset.schema.fieldIndex($(inputCol))
        AttributeGroup.fromStructField(dataset.schema.fields(vectorsIndex)).size
      } else {
        $(dim).toInt
      }
    }
    val projectionMatrix = dataset.sqlContext.sparkContext.broadcast(
      Matrices.sprandn($(basisSize).toInt, dimensity, $(sparsity), new Random($(seed))).asInstanceOf[SparseMatrix])
    // the matrix of random vectors used to construct the hash

    val binHashSparseVectorColumn = udf((vector: Vector) => {
      projectionMatrix.value.multiply(vector).values
        .map(f =>  if (f>0) 1L else 0L)
        .view.zipWithIndex
        .foldLeft(0L) {case  (acc,(v, i)) => acc | (v << i) }

    })
    dataset.withColumn($(outputCol), binHashSparseVectorColumn(dataset.col($(inputCol))))
  }

  override def copy(extra: ParamMap): Transformer = {
    defaultCopy(extra)
  }

  @DeveloperApi
  override def transformSchema(schema: StructType): StructType = {
    SchemaUtils.appendColumn(schema, $(outputCol), LongType)
  }

} 
Example 13
Source File: URLElimminator.scala    From pravda-ml   with Apache License 2.0
package org.apache.spark.ml.odkl.texts

import org.apache.lucene.analysis.standard.UAX29URLEmailTokenizer
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.param.{ParamMap, Params}
import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol}
import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable}
import org.apache.spark.sql.{DataFrame, Dataset}
import org.apache.spark.sql.functions.udf
import org.apache.spark.sql.types.{StringType, StructType}


  def setInputCol(value: String): this.type = set(inputCol, value)

  def this() = this(Identifiable.randomUID("URLEliminator"))

  override def transform(dataset: Dataset[_]): DataFrame = {
    dataset.withColumn($(outputCol), filterTextUDF(dataset.col($(inputCol))))
  }

  override def copy(extra: ParamMap): Transformer = defaultCopy(extra)

  @DeveloperApi
  override def transformSchema(schema: StructType): StructType = {
    if ($(inputCol) != $(outputCol)) {
      schema.add($(outputCol), StringType)
    } else {
      schema
    }
  }
}

object URLElimminator extends DefaultParamsReadable[URLElimminator] {
  override def load(path: String): URLElimminator = super.load(path)
} 
Example 14
Source File: MutualInformation.scala    From deequ   with Apache License 2.0
package com.amazon.deequ.analyzers

import com.amazon.deequ.analyzers.Analyzers._
import com.amazon.deequ.metrics.{DoubleMetric, Entity}
import org.apache.spark.sql.functions.{col, sum, udf}
import org.apache.spark.sql.types.StructType
import Analyzers.COUNT_COL
import com.amazon.deequ.analyzers.runners.MetricCalculationException


  override def preconditions: Seq[StructType => Unit] = {
    Preconditions.exactlyNColumns(columns, 2) +: super.preconditions
  }

  override def toFailureMetric(exception: Exception): DoubleMetric = {
    metricFromFailure(exception, "MutualInformation", columns.mkString(","), Entity.Mutlicolumn)
  }

  override def filterCondition: Option[String] = where
}

object MutualInformation {
  def apply(columnA: String, columnB: String): MutualInformation = {
    new MutualInformation(columnA :: columnB :: Nil)
  }
} 
Example 15
Source File: HashingTF.scala    From iolap   with Apache License 2.0
package org.apache.spark.ml.feature

import org.apache.spark.annotation.Experimental
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.attribute.AttributeGroup
import org.apache.spark.ml.param.{IntParam, ParamMap, ParamValidators}
import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol}
import org.apache.spark.ml.util.{Identifiable, SchemaUtils}
import org.apache.spark.mllib.feature
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions.{col, udf}
import org.apache.spark.sql.types.{ArrayType, StructType}


  def setNumFeatures(value: Int): this.type = set(numFeatures, value)

  override def transform(dataset: DataFrame): DataFrame = {
    val outputSchema = transformSchema(dataset.schema)
    val hashingTF = new feature.HashingTF($(numFeatures))
    val t = udf { terms: Seq[_] => hashingTF.transform(terms) }
    val metadata = outputSchema($(outputCol)).metadata
    dataset.select(col("*"), t(col($(inputCol))).as($(outputCol), metadata))
  }

  override def transformSchema(schema: StructType): StructType = {
    val inputType = schema($(inputCol)).dataType
    require(inputType.isInstanceOf[ArrayType],
      s"The input column must be ArrayType, but got $inputType.")
    val attrGroup = new AttributeGroup($(outputCol), $(numFeatures))
    SchemaUtils.appendColumn(schema, attrGroup.toStructField())
  }

  override def copy(extra: ParamMap): HashingTF = defaultCopy(extra)
} 
Example 16
Source File: HashingTF.scala    From Spark-2.3.1   with Apache License 2.0
package org.apache.spark.ml.feature

import org.apache.spark.annotation.Since
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.attribute.AttributeGroup
import org.apache.spark.ml.param._
import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol}
import org.apache.spark.ml.util._
import org.apache.spark.mllib.feature
import org.apache.spark.sql.{DataFrame, Dataset}
import org.apache.spark.sql.functions.{col, udf}
import org.apache.spark.sql.types.{ArrayType, StructType}


  @Since("2.0.0")
  def setBinary(value: Boolean): this.type = set(binary, value)

  @Since("2.0.0")
  override def transform(dataset: Dataset[_]): DataFrame = {
    val outputSchema = transformSchema(dataset.schema)
    val hashingTF = new feature.HashingTF($(numFeatures)).setBinary($(binary))
    // TODO: Make the hashingTF.transform natively in ml framework to avoid extra conversion.
    val t = udf { terms: Seq[_] => hashingTF.transform(terms).asML }
    val metadata = outputSchema($(outputCol)).metadata
    dataset.select(col("*"), t(col($(inputCol))).as($(outputCol), metadata))
  }

  @Since("1.4.0")
  override def transformSchema(schema: StructType): StructType = {
    val inputType = schema($(inputCol)).dataType
    require(inputType.isInstanceOf[ArrayType],
      s"The input column must be ArrayType, but got $inputType.")
    val attrGroup = new AttributeGroup($(outputCol), $(numFeatures))
    SchemaUtils.appendColumn(schema, attrGroup.toStructField())
  }

  @Since("1.4.1")
  override def copy(extra: ParamMap): HashingTF = defaultCopy(extra)
}

@Since("1.6.0")
object HashingTF extends DefaultParamsReadable[HashingTF] {

  @Since("1.6.0")
  override def load(path: String): HashingTF = super.load(path)
} 
Example 17
Source File: GroupedDatasetSuite.scala    From Spark-2.3.1   with Apache License 2.0
package org.apache.spark.sql

import org.apache.spark.api.python.PythonEvalType
import org.apache.spark.sql.catalyst.plans.logical.AnalysisBarrier
import org.apache.spark.sql.execution.python.PythonUDF
import org.apache.spark.sql.functions.udf
import org.apache.spark.sql.test.SharedSQLContext
import org.apache.spark.sql.types.{LongType, StructField, StructType}

class GroupedDatasetSuite extends QueryTest with SharedSQLContext {
  import testImplicits._

  private val scalaUDF = udf((x: Long) => { x + 1 })
  private lazy val datasetWithUDF = spark.range(1).toDF("s").select($"s", scalaUDF($"s"))

  private def assertContainsAnalysisBarrier(ds: Dataset[_], atLevel: Int = 1): Unit = {
    assert(atLevel >= 0)
    var children = Seq(ds.queryExecution.logical)
    (1 to atLevel).foreach { _ =>
      children = children.flatMap(_.children)
    }
    val barriers = children.collect {
      case ab: AnalysisBarrier => ab
    }
    assert(barriers.nonEmpty, s"Plan does not contain AnalysisBarrier at level $atLevel:\n" +
      ds.queryExecution.logical)
  }

  test("SPARK-24373: avoid running Analyzer rules twice on RelationalGroupedDataset") {
    val groupByDataset = datasetWithUDF.groupBy()
    val rollupDataset = datasetWithUDF.rollup("s")
    val cubeDataset = datasetWithUDF.cube("s")
    val pivotDataset = datasetWithUDF.groupBy().pivot("s", Seq(1, 2))
    datasetWithUDF.cache()
    Seq(groupByDataset, rollupDataset, cubeDataset, pivotDataset).foreach { rgDS =>
      val df = rgDS.count()
      assertContainsAnalysisBarrier(df)
      assertCached(df)
    }

    val flatMapGroupsInRDF = datasetWithUDF.groupBy().flatMapGroupsInR(
      Array.emptyByteArray,
      Array.emptyByteArray,
      Array.empty,
      StructType(Seq(StructField("s", LongType))))
    val flatMapGroupsInPandasDF = datasetWithUDF.groupBy().flatMapGroupsInPandas(PythonUDF(
      "pyUDF",
      null,
      StructType(Seq(StructField("s", LongType))),
      Seq.empty,
      PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF,
      true))
    Seq(flatMapGroupsInRDF, flatMapGroupsInPandasDF).foreach { df =>
      assertContainsAnalysisBarrier(df, 2)
      assertCached(df)
    }
    datasetWithUDF.unpersist(true)
  }

  test("SPARK-24373: avoid running Analyzer rules twice on KeyValueGroupedDataset") {
    val kvDasaset = datasetWithUDF.groupByKey(_.getLong(0))
    datasetWithUDF.cache()
    val mapValuesKVDataset = kvDasaset.mapValues(_.getLong(0)).reduceGroups(_ + _)
    val keysKVDataset = kvDasaset.keys
    val flatMapGroupsKVDataset = kvDasaset.flatMapGroups((k, _) => Seq(k))
    val aggKVDataset = kvDasaset.count()
    val otherKVDataset = spark.range(1).groupByKey(_ + 1)
    val cogroupKVDataset = kvDasaset.cogroup(otherKVDataset)((k, _, _) => Seq(k))
    Seq((mapValuesKVDataset, 1),
        (keysKVDataset, 2),
        (flatMapGroupsKVDataset, 2),
        (aggKVDataset, 1),
        (cogroupKVDataset, 2)).foreach { case (df, analysisBarrierDepth) =>
      assertContainsAnalysisBarrier(df, analysisBarrierDepth)
      assertCached(df)
    }
    datasetWithUDF.unpersist(true)
  }
} 
Example 18
Source File: SampleRowsUniformly.scala    From mimir   with Apache License 2.0
package mimir.algebra.sampling

import org.apache.spark.sql.functions.{ rand, udf }
import org.apache.spark.sql.catalyst.plans.logical.{ LogicalPlan, Filter }
import play.api.libs.json._
import mimir.algebra._

case class SampleRowsUniformly(probability:Double) extends SamplingMode
{
  override def toString = s"WITH PROBABILITY $probability"

  def apply(plan: LogicalPlan, seed: Long): LogicalPlan = 
  {
    // Adapted from Spark's df.stat.sampleBy method
    val r = rand(seed)
    val f = udf { (x: Double) => x < probability }
    Filter(
      f(r).expr,
      plan
    )
  }
  def expressions: Seq[Expression] = Seq()
  def rebuildExpressions(x: Seq[Expression]): SamplingMode = this


  def toJson: JsValue = JsObject(Map[String,JsValue](
    "mode" -> JsString(SampleRowsUniformly.MODE),
    "probability" -> JsNumber(probability)
  ))
}

object SampleRowsUniformly
{
  val MODE = "uniform_probability"

  def parseJson(json:Map[String, JsValue]): Option[SampleRowsUniformly] =
  {
    if(json("mode").as[String].equals(MODE)){
      Some(SampleRowsUniformly(json("probability").as[Double]))
    } else {
      None
    }
  }
} 
Example 19
Source File: SampleStratifiedOn.scala    From mimir   with Apache License 2.0
package mimir.algebra.sampling

import org.apache.spark.sql.functions.{rand, udf, col}
import org.apache.spark.sql.catalyst.plans.logical.{ LogicalPlan, Filter }
import play.api.libs.json._
import mimir.algebra._
import mimir.exec.spark.RAToSpark
import mimir.serialization.{ Json => MimirJson }


case class SampleStratifiedOn(column:ID, t:Type, strata:Map[PrimitiveValue,Double]) extends SamplingMode
{
  val sparkStrata = 
    strata.map { case (v, p) => RAToSpark.getNative(v, t) -> p }
          .toMap

  override def toString = s"ON $column WITH STRATA ${strata.map { case (v,p) => s"$v -> $p"}.mkString(" | ")}"


  def apply(plan: LogicalPlan, seed: Long): LogicalPlan =
  {
    // Adapted from Spark's df.stat.sampleBy method
    val c = col(column.id)
    val r = rand(seed)
    val f = udf { (stratum: Any, x: Double) =>
              x < sparkStrata.getOrElse(stratum, 0.0)
            }
    Filter(
      f(c, r).expr,
      plan
    )
  }
  def expressions: Seq[Expression] = Seq(Var(column))
  def rebuildExpressions(x: Seq[Expression]): SamplingMode =
  {
    x(0) match { 
      case Var(newColumn) => SampleStratifiedOn(newColumn, t, strata) 
      case _ => throw new RAException("Internal Error: Rewriting stratification variable with arbitrary expression")
    }
  }

  def toJson: JsValue = JsObject(Map[String,JsValue](
    "mode" -> JsString(SampleStratifiedOn.MODE),
    "column" -> JsString(column.id),
    "type" -> MimirJson.ofType(t),
    "strata" -> JsArray(
      strata
        .toSeq
        .map { case (v, p) => JsObject(Map[String,JsValue](
            "value" -> MimirJson.ofPrimitive(v),
            "probability" -> JsNumber(p)
          ))
        }
    )
  ))
}

object SampleStratifiedOn
{
  val MODE = "stratified_on"

  def parseJson(json:Map[String, JsValue]): Option[SampleStratifiedOn] =
  {
    if(json("mode").as[String].equals(MODE)){
      val t = MimirJson.toType(json("type"))

      Some(SampleStratifiedOn(
        ID(json("column").as[String]),
        t,
        json("strata")
          .as[Seq[Map[String,JsValue]]]
          .map { stratum => 
            MimirJson.toPrimitive(t, stratum("value")) -> 
              stratum("probability").as[Double]
          }
          .toMap
      ))
    } else {
      None
    }
  }
} 
Example 20
Source File: HashingTF.scala    From BigDatalog   with Apache License 2.0
package org.apache.spark.ml.feature

import org.apache.spark.annotation.{Since, Experimental}
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.attribute.AttributeGroup
import org.apache.spark.ml.param.{IntParam, ParamMap, ParamValidators}
import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol}
import org.apache.spark.ml.util._
import org.apache.spark.mllib.feature
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions.{col, udf}
import org.apache.spark.sql.types.{ArrayType, StructType}


  def setNumFeatures(value: Int): this.type = set(numFeatures, value)

  override def transform(dataset: DataFrame): DataFrame = {
    val outputSchema = transformSchema(dataset.schema)
    val hashingTF = new feature.HashingTF($(numFeatures))
    val t = udf { terms: Seq[_] => hashingTF.transform(terms) }
    val metadata = outputSchema($(outputCol)).metadata
    dataset.select(col("*"), t(col($(inputCol))).as($(outputCol), metadata))
  }

  override def transformSchema(schema: StructType): StructType = {
    val inputType = schema($(inputCol)).dataType
    require(inputType.isInstanceOf[ArrayType],
      s"The input column must be ArrayType, but got $inputType.")
    val attrGroup = new AttributeGroup($(outputCol), $(numFeatures))
    SchemaUtils.appendColumn(schema, attrGroup.toStructField())
  }

  override def copy(extra: ParamMap): HashingTF = defaultCopy(extra)
}

@Since("1.6.0")
object HashingTF extends DefaultParamsReadable[HashingTF] {

  @Since("1.6.0")
  override def load(path: String): HashingTF = super.load(path)
} 
Example 21
Source File: functions.scala    From spark-fuzzy-matching   with MIT License
package com.pb.fuzzy.matching

import org.apache.spark.sql.functions.udf

import com.rockymadden.stringmetric.phonetic.MetaphoneAlgorithm
import com.rockymadden.stringmetric.phonetic.MetaphoneMetric
import com.rockymadden.stringmetric.phonetic.NysiisAlgorithm
import com.rockymadden.stringmetric.phonetic.NysiisMetric
import com.rockymadden.stringmetric.phonetic.RefinedNysiisAlgorithm
import com.rockymadden.stringmetric.phonetic.RefinedNysiisMetric
import com.rockymadden.stringmetric.phonetic.RefinedSoundexAlgorithm
import com.rockymadden.stringmetric.phonetic.RefinedSoundexMetric
import com.rockymadden.stringmetric.phonetic.SoundexAlgorithm
import com.rockymadden.stringmetric.phonetic.SoundexMetric
import com.rockymadden.stringmetric.similarity.DiceSorensenMetric
import com.rockymadden.stringmetric.similarity.HammingMetric
import com.rockymadden.stringmetric.similarity.JaccardMetric
import com.rockymadden.stringmetric.similarity.JaroMetric
import com.rockymadden.stringmetric.similarity.JaroWinklerMetric
import com.rockymadden.stringmetric.similarity.LevenshteinMetric
import com.rockymadden.stringmetric.similarity.NGramMetric
import com.rockymadden.stringmetric.similarity.OverlapMetric
import com.rockymadden.stringmetric.similarity.RatcliffObershelpMetric
import com.rockymadden.stringmetric.similarity.WeightedLevenshteinMetric



  def metaphoneFn = udf { (document: String, document1: String) =>
    MetaphoneMetric.compare(document, document1)
  }

  def computeMetaphoneFn = udf { (document: String) =>
    MetaphoneAlgorithm.compute(document)
  }

  def nysiisFn = udf { (document: String, document1: String) =>
    NysiisMetric.compare(document, document1)
  }

  def computeNysiisFn = udf { (document: String) =>
    NysiisAlgorithm.compute(document)
  }

  def refinedNysiisFn = udf { (document: String, document1: String) =>
    RefinedNysiisMetric.compare(document, document1)
  }

  def computeRefinedNysiisFn = udf { (document: String) =>
    RefinedNysiisAlgorithm.compute(document)
  }

  def refinedSoundexFn = udf { (document: String, document1: String) =>
    RefinedSoundexMetric.compare(document, document1)
  }

  def computeRefinedSoundexFn = udf { (document: String) =>
    RefinedSoundexAlgorithm.compute(document)
  }

  def soundexFn = udf { (document: String, document1: String) =>
    SoundexMetric.compare(document, document1)
  }

  def computeSoundexFn = udf { (document: String) =>
    SoundexAlgorithm.compute(document)
  }

   
} 
Example 22
Source File: OilPriceFunc.scala    From Mastering-Spark-for-Data-Science   with MIT License
package io.gzet.geomesa

import java.text.SimpleDateFormat
import java.util.Calendar

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions.{udf, window, last, col, lag}

object OilPriceFunc {

    // use this if the window function misbehaves due to timezone e.g. BST
    // ./spark-shell --driver-java-options "-Duser.timezone=UTC"
    // ./spark-submit --conf 'spark.driver.extraJavaOptions=-Duser.timezone=UTC'

    // define a function to reformat the date field
    def convert(date:String) : String = {
      val df1 = new SimpleDateFormat("dd/MM/yyyy")
      val dt = df1.parse(date)
      val df2 = new SimpleDateFormat("yyyy-MM-dd")
      df2.format(dt)
    }

    // create and save oil price changes
    def createOilPriceDF(inputfile: String, outputfile: String, spark: SparkSession) = {

      val oilPriceDF = spark.
        read.
        option("header", "true").
        option("inferSchema", "true").
        csv(inputfile)

      val convertDateUDF = udf { (Date: String) => convert(Date) }

      val oilPriceDatedDF = oilPriceDF.withColumn("DATE", convertDateUDF(oilPriceDF("DATE")))

      // offset to start at beginning of week
      val windowDF = oilPriceDatedDF.groupBy(window(oilPriceDatedDF.col("DATE"), "7 days", "7 days", "4 days"))

      val windowLastDF = windowDF.agg(last("PRICE") as "last(PRICE)").sort("window")

//      windowLastDF.show(20, false)

      val sortedWindow = Window.orderBy("window.start")

      val lagLastCol = lag(col("last(PRICE)"), 1).over(sortedWindow)
      val lagLastColDF = windowLastDF.withColumn("lastPrev(PRICE)", lagLastCol)

//      lagLastColDF.show(20, false)

      val simplePriceChangeFunc = udf { (last: Double, prevLast: Double) =>
        var change = ((last - prevLast) compare 0).signum
        if (change == -1)
          change = 0
        change.toDouble
      }

      val findDateTwoDaysAgoUDF = udf { (date: String) =>
        val dateFormat = new SimpleDateFormat("yyyy-MM-dd")
        val cal = Calendar.getInstance
        cal.setTime(dateFormat.parse(date))
        cal.add(Calendar.DATE, -3)
        dateFormat.format(cal.getTime)
      }

      val oilPriceChangeDF = lagLastColDF.withColumn("label", simplePriceChangeFunc(
        lagLastColDF("last(PRICE)"),
        lagLastColDF("lastPrev(PRICE)")
      )).withColumn("commonFriday", findDateTwoDaysAgoUDF(lagLastColDF("window.end")))

//      oilPriceChangeDF.show(20, false)

      oilPriceChangeDF.select("label", "commonFriday").
        write.
        format("com.databricks.spark.csv").
        option("header", "true").
        //.option("codec", "org.apache.hadoop.io.compress.GzipCodec")
        save(outputfile)
    }
} 
Example 23
Source File: functions.scala    From Hands-On-Deep-Learning-with-Apache-Spark   with MIT License
package com.databricks.spark.corenlp

import java.util.Properties

import scala.collection.JavaConverters._

import edu.stanford.nlp.ling.CoreAnnotations
import edu.stanford.nlp.neural.rnn.RNNCoreAnnotations
import edu.stanford.nlp.pipeline.{Annotation, CleanXmlAnnotator, StanfordCoreNLP, TokenizerAnnotator}
import edu.stanford.nlp.pipeline.CoreNLPProtos.Sentiment
import edu.stanford.nlp.sentiment.SentimentCoreAnnotations
import edu.stanford.nlp.simple.{Document, Sentence}
import edu.stanford.nlp.util.Quadruple

import org.apache.spark.sql.functions.udf

object functions {
  @transient private var sentimentPipeline: StanfordCoreNLP = _

  private def getOrCreateSentimentPipeline(): StanfordCoreNLP = {
    if (sentimentPipeline == null) {
      val props = new Properties()
      props.setProperty("annotators", "tokenize, ssplit, parse, sentiment")
      sentimentPipeline = new StanfordCoreNLP(props)
    }
    sentimentPipeline
  }

  private case class OpenIE(subject: String, relation: String, target: String, confidence: Double) {
    def this(quadruple: Quadruple[String, String, String, java.lang.Double]) =
      this(quadruple.first, quadruple.second, quadruple.third, quadruple.fourth)
  }

  private case class CorefMention(sentNum: Int, startIndex: Int, mention: String)

  private case class CorefChain(representative: String, mentions: Seq[CorefMention])

  private case class SemanticGraphEdge(
    source: String,
    sourceIndex: Int,
    relation: String,
    target: String,
    targetIndex: Int,
    weight: Double)

  
  def sentiment = udf { sentence: String =>
    val pipeline = getOrCreateSentimentPipeline()
    val annotation = pipeline.process(sentence)
    val tree = annotation.get(classOf[CoreAnnotations.SentencesAnnotation])
      .asScala
      .head
      .get(classOf[SentimentCoreAnnotations.SentimentAnnotatedTree])
    RNNCoreAnnotations.getPredictedClass(tree)
  }
} 
Example 24
Source File: DatasetUtil.scala    From sona   with Apache License 2.0
package org.apache.spark.util

import org.apache.spark.linalg.{VectorUDT, Vectors}
import org.apache.spark.sql.functions.{col, udf}
import org.apache.spark.sql.types.{ArrayType, DoubleType, FloatType, Metadata}
import org.apache.spark.sql.{Column, DataFrame, Dataset}


object DatasetUtil {
  def withColumns[T](ds: Dataset[T],
                     colNames: Seq[String],
                     cols: Seq[Column],
                     metadata: Seq[Metadata]): DataFrame = {
    require(colNames.size == cols.size,
      s"The size of column names: ${colNames.size} isn't equal to " +
        s"the size of columns: ${cols.size}")
    require(colNames.size == metadata.size,
      s"The size of column names: ${colNames.size} isn't equal to " +
        s"the size of metadata elements: ${metadata.size}")

    val sparkSession = ds.sparkSession
    val queryExecution = ds.queryExecution
    val resolver = sparkSession.sessionState.analyzer.resolver
    val output = queryExecution.analyzed.output

    checkColumnNameDuplication(colNames,
      "in given column names",
      sparkSession.sessionState.conf.caseSensitiveAnalysis)

    val columnMap = colNames.zip(cols).zip(metadata).map { case ((colName: String, col: Column), metadata: Metadata) =>
      colName -> col.as(colName, metadata)
    }.toMap

    val replacedAndExistingColumns = output.map { field =>
      columnMap.find { case (colName, _) =>
        resolver(field.name, colName)
      } match {
        case Some((colName: String, col: Column)) => col.as(colName)
        case _ => new Column(field)
      }
    }

    val newColumns = columnMap.filter { case (colName, col) =>
      !output.exists(f => resolver(f.name, colName))
    }.map { case (colName, col) => col.as(colName) }

    ds.select(replacedAndExistingColumns ++ newColumns: _*)
  }

  def withColumn[T](ds: Dataset[T], colName: String, col: Column, metadata: Metadata): DataFrame = {
    withColumns(ds, Seq(colName), Seq(col), Seq(metadata))
  }

  private def checkColumnNameDuplication(columnNames: Seq[String], colType: String,
                                         caseSensitiveAnalysis: Boolean): Unit = {
    val names = if (caseSensitiveAnalysis) columnNames else columnNames.map(_.toLowerCase)
    if (names.distinct.length != names.length) {
      val duplicateColumns = names.groupBy(identity).collect {
        case (x, ys) if ys.length > 1 => s"`$x`"
      }
      throw new Exception(s"Found duplicate column(s) $colType: ${duplicateColumns.mkString(", ")}")
    }
  }

  /**
    * Cast a column in a Dataset to Vector type.
    *
    * The supported data types of the input column are
    * - Vector
    * - float/double type Array.
    *
    * Note: The returned column does not have Metadata.
    *
    * @param dataset input DataFrame
    * @param colName column name.
    * @return Vector column
    */
  def columnToVector(dataset: Dataset[_], colName: String): Column = {
    val columnDataType = dataset.schema(colName).dataType
    columnDataType match {
      case _: VectorUDT => col(colName)
      case fdt: ArrayType =>
        val transferUDF = fdt.elementType match {
          case _: FloatType => udf(f = (vector: Seq[Float]) => {
            val inputArray = Array.fill[Double](vector.size)(0.0)
            vector.indices.foreach(idx => inputArray(idx) = vector(idx).toDouble)
            Vectors.dense(inputArray)
          })
          case _: DoubleType => udf((vector: Seq[Double]) => {
            Vectors.dense(vector.toArray)
          })
          case other =>
            throw new IllegalArgumentException(s"Array[$other] column cannot be cast to Vector")
        }
        transferUDF(col(colName))
      case other =>
        throw new IllegalArgumentException(s"$other column cannot be cast to Vector")
    }
  }

} 
Example 25
Source File: UDFTest.scala    From apache-spark-test   with Apache License 2.0
package com.github.dnvriend.spark.udf

import com.github.dnvriend.TestSpec

class UDFTest extends TestSpec {
  it should "uppercase using a user defined function or UDF" in withSparkSession { spark =>
    import spark.implicits._
    // lets create a DataFrame
    val df = Seq((0, "hello"), (1, "world")).toDF("id", "text")

    df.as[(Int, String)].collect() shouldBe Seq(
      (0, "hello"),
      (1, "world")
    )

    // define a plain old Scala Function
    val upper: String => String = _.toUpperCase + "- foo"

    // create a User Defined Function
    import org.apache.spark.sql.functions.udf
    val upperUDF = udf(upper)

    // apply the user defined function
    df.withColumn("upper", upperUDF('text))
      .as[(Int, String, String)].collect shouldBe Seq(
        (0, "hello", "HELLO- foo"),
        (1, "world", "WORLD- foo")
      )

    // the UDF can be used in a query
    // first register a temp view so that
    // we can reference the DataFrame
    df.createOrReplaceTempView("df")

    // register the UDF by name 'upperUDF'
    spark.udf.register("upperUDF", upper)

    // use the UDF in a SQL-Query
    spark.sql("SELECT *, upperUDF(text) FROM df")
      .as[(Int, String, String)].collect shouldBe Seq(
        (0, "hello", "HELLO- foo"),
        (1, "world", "WORLD- foo")
      )
  }
} 
Example 26
Source File: DataFrameTfrConverter.scala    From ecosystem   with Apache License 2.0
package org.tensorflow.spark.datasources.tfrecords.udf

import org.apache.spark.sql.Row
import org.apache.spark.sql.expressions.UserDefinedFunction
import org.apache.spark.sql.functions.udf
import org.tensorflow.spark.datasources.tfrecords.serde.DefaultTfRecordRowEncoder

object DataFrameTfrConverter {
  def getRowToTFRecordExampleUdf: UserDefinedFunction = udf(rowToTFRecordExampleUdf _ )

  private def rowToTFRecordExampleUdf(row: Row): Array[Byte] = {
    DefaultTfRecordRowEncoder.encodeExample(row).toByteArray
  }

  def getRowToTFRecordSequenceExampleUdf: UserDefinedFunction = udf(rowToTFRecordSequenceExampleUdf _ )

  private def rowToTFRecordSequenceExampleUdf(row: Row): Array[Byte] = {
    DefaultTfRecordRowEncoder.encodeSequenceExample(row).toByteArray
  }
} 
Example 27
Source File: functions.scala    From spark-nlp   with Apache License 2.0
package com.johnsnowlabs.nlp

import org.apache.spark.sql.expressions.UserDefinedFunction
import org.apache.spark.sql.{DataFrame, Row}
import org.apache.spark.sql.functions.{array, col, explode, udf}
import org.apache.spark.sql.types.DataType

import scala.reflect.runtime.universe._

object functions {

  implicit class FilterAnnotations(dataset: DataFrame) {
    def filterByAnnotationsCol(column: String, function: Seq[Annotation] => Boolean): DataFrame = {
      val meta = dataset.schema(column).metadata
      val func = udf {
        annotatorProperties: Seq[Row] =>
          function(annotatorProperties.map(Annotation(_)))
      }
      dataset.filter(func(col(column)).as(column, meta))
    }
  }

  def mapAnnotations[T](function: Seq[Annotation] => T, outputType: DataType): UserDefinedFunction = udf ( {
    annotatorProperties: Seq[Row] =>
      function(annotatorProperties.map(Annotation(_)))
  }, outputType)

  def mapAnnotationsStrict(function: Seq[Annotation] => Seq[Annotation]): UserDefinedFunction = udf {
    annotatorProperties: Seq[Row] =>
      function(annotatorProperties.map(Annotation(_)))
  }

  implicit class MapAnnotations(dataset: DataFrame) {
    def mapAnnotationsCol[T: TypeTag](column: String, outputCol: String, function: Seq[Annotation] => T): DataFrame = {
      val meta = dataset.schema(column).metadata
      val func = udf {
        annotatorProperties: Seq[Row] =>
          function(annotatorProperties.map(Annotation(_)))
      }
      dataset.withColumn(outputCol, func(col(column)).as(outputCol, meta))
    }
  }

  implicit class EachAnnotations(dataset: DataFrame) {

    import dataset.sparkSession.implicits._

    def eachAnnotationsCol[T: TypeTag](column: String, function: Seq[Annotation] => Unit): Unit = {
      dataset.select(column).as[Array[Annotation]].foreach(function(_))
    }
  }

  implicit class ExplodeAnnotations(dataset: DataFrame) {
    def explodeAnnotationsCol[T: TypeTag](column: String, outputCol: String): DataFrame = {
      val meta = dataset.schema(column).metadata
      dataset.
        withColumn(outputCol, explode(col(column))).
        withColumn(outputCol, array(col(outputCol)).as(outputCol, meta))
    }
  }

} 
Example 28
Source File: HashingTF.scala    From drizzle-spark   with Apache License 2.0
package org.apache.spark.ml.feature

import org.apache.spark.annotation.Since
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.attribute.AttributeGroup
import org.apache.spark.ml.param._
import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol}
import org.apache.spark.ml.util._
import org.apache.spark.mllib.feature
import org.apache.spark.sql.{DataFrame, Dataset}
import org.apache.spark.sql.functions.{col, udf}
import org.apache.spark.sql.types.{ArrayType, StructType}


  @Since("2.0.0")
  def setBinary(value: Boolean): this.type = set(binary, value)

  @Since("2.0.0")
  override def transform(dataset: Dataset[_]): DataFrame = {
    val outputSchema = transformSchema(dataset.schema)
    val hashingTF = new feature.HashingTF($(numFeatures)).setBinary($(binary))
    // TODO: Make the hashingTF.transform natively in ml framework to avoid extra conversion.
    val t = udf { terms: Seq[_] => hashingTF.transform(terms).asML }
    val metadata = outputSchema($(outputCol)).metadata
    dataset.select(col("*"), t(col($(inputCol))).as($(outputCol), metadata))
  }

  @Since("1.4.0")
  override def transformSchema(schema: StructType): StructType = {
    val inputType = schema($(inputCol)).dataType
    require(inputType.isInstanceOf[ArrayType],
      s"The input column must be ArrayType, but got $inputType.")
    val attrGroup = new AttributeGroup($(outputCol), $(numFeatures))
    SchemaUtils.appendColumn(schema, attrGroup.toStructField())
  }

  @Since("1.4.1")
  override def copy(extra: ParamMap): HashingTF = defaultCopy(extra)
}

@Since("1.6.0")
object HashingTF extends DefaultParamsReadable[HashingTF] {

  @Since("1.6.0")
  override def load(path: String): HashingTF = super.load(path)
} 
Example 29
Source File: FilterTopFeaturesProcess.scala    From incubator-s2graph   with Apache License 2.0
package org.apache.s2graph.s2jobs.wal.process

import org.apache.s2graph.s2jobs.task.TaskConf
import org.apache.s2graph.s2jobs.wal.WalLogAgg
import org.apache.s2graph.s2jobs.wal.transformer.{DefaultTransformer, Transformer}
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.sql.functions.{col, udf}
import org.apache.spark.sql.{DataFrame, Dataset, SparkSession}
import play.api.libs.json.{JsObject, Json}

object FilterTopFeaturesProcess {
  private var validFeatureHashKeys: Set[Long] = null
  def getValidFeatureHashKeys(validFeatureHashKeysBCast: Broadcast[Array[Long]]): Set[Long] = {
    if (validFeatureHashKeys == null) {
      validFeatureHashKeys = validFeatureHashKeysBCast.value.toSet
    }

    validFeatureHashKeys
  }

  def collectDistinctFeatureHashes(ss: SparkSession,
                                   filteredDict: DataFrame): Array[Long] = {
    import ss.implicits._

    val featureHashUDF = udf((dim: String, value: String) => WalLogAgg.toFeatureHash(dim, value))

    filteredDict.withColumn("featureHash", featureHashUDF(col("dim"), col("value")))
      .select("featureHash")
      .distinct().as[Long].collect()
  }

  def filterTopKsPerDim(dict: DataFrame,
                        maxRankPerDim: Broadcast[Map[String, Int]],
                        defaultMaxRank: Int): DataFrame = {
    val filterUDF = udf((dim: String, rank: Long) => {
      rank < maxRankPerDim.value.getOrElse(dim, defaultMaxRank)
    })

    dict.filter(filterUDF(col("dim"), col("rank")))
  }

  def filterWalLogAgg(ss: SparkSession,
                      walLogAgg: Dataset[WalLogAgg],
                      transformers: Seq[Transformer],
                      validFeatureHashKeysBCast: Broadcast[Array[Long]]) = {
    import ss.implicits._
    walLogAgg.mapPartitions { iter =>
      val validFeatureHashKeys = getValidFeatureHashKeys(validFeatureHashKeysBCast)

      iter.map { walLogAgg =>
        WalLogAgg.filterProps(walLogAgg, transformers, validFeatureHashKeys)
      }
    }
  }
}

class FilterTopFeaturesProcess(taskConf: TaskConf) extends org.apache.s2graph.s2jobs.task.Process(taskConf) {

  import FilterTopFeaturesProcess._

  
  override def execute(ss: SparkSession, inputMap: Map[String, DataFrame]): DataFrame = {
    import ss.implicits._

    val maxRankPerDim = taskConf.options.get("maxRankPerDim").map { s =>
      Json.parse(s).as[JsObject].fields.map { case (k, jsValue) =>
        k -> jsValue.as[Int]
      }.toMap
    }
    val maxRankPerDimBCast = ss.sparkContext.broadcast(maxRankPerDim.getOrElse(Map.empty))

    val defaultMaxRank = taskConf.options.get("defaultMaxRank").map(_.toInt)

    val featureDict = inputMap(taskConf.options("featureDict"))
    val walLogAgg = inputMap(taskConf.options("walLogAgg")).as[WalLogAgg]

    val transformers = TaskConf.parseTransformers(taskConf)

    val filteredDict = filterTopKsPerDim(featureDict, maxRankPerDimBCast, defaultMaxRank.getOrElse(Int.MaxValue))
    val validFeatureHashKeys = collectDistinctFeatureHashes(ss, filteredDict)
    val validFeatureHashKeysBCast = ss.sparkContext.broadcast(validFeatureHashKeys)

    filterWalLogAgg(ss, walLogAgg, transformers, validFeatureHashKeysBCast).toDF()
  }

  override def mandatoryOptions: Set[String] = Set("featureDict", "walLogAgg")
} 
Example 30
Source File: Grok.scala    From incubator-s2graph   with Apache License 2.0
package org.apache.s2graph.s2jobs.udfs

import org.apache.s2graph.s2jobs.utils.GrokHelper
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.types.{DataType, StructType}
import play.api.libs.json.{JsValue, Json}

class Grok extends Udf {
  import org.apache.spark.sql.functions.udf

  def register(ss: SparkSession, name:String, options:Map[String, String]) = {
    // grok
    val patternDir = options.getOrElse("patternDir", "/tmp")
    val patternFiles = options.getOrElse("patternFiles", "").split(",").toSeq
    val patterns = Json.parse(options.getOrElse("patterns", "{}")).asOpt[Map[String, String]].getOrElse(Map.empty)
    val compilePattern = options("compilePattern")
    val schemaOpt = options.get("schema")

    patternFiles.foreach { patternFile =>
      ss.sparkContext.addFile(s"${patternDir}/${patternFile}")
    }

    implicit val grok = GrokHelper.getGrok(name, patternFiles, patterns, compilePattern)

    val f = if(schemaOpt.isDefined) {
      val schema = DataType.fromJson(schemaOpt.get)
      implicit val keys:Array[String] = schema.asInstanceOf[StructType].fieldNames
      udf(GrokHelper.grokMatchWithSchema _, schema)
    } else {
      udf(GrokHelper.grokMatch _)
    }


    ss.udf.register(name, f)
  }
} 
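Example 30 relies on the untyped overload udf(f: AnyRef, dataType: DataType), where the return schema is passed as a runtime value instead of being derived from Scala type tags; that is what lets Grok take its output schema from the task options. Below is a small, self-contained sketch of the same overload with a hard-coded schema; the schema, function, and names are invented for illustration and are not part of the s2graph code.

import org.apache.spark.sql.Row
import org.apache.spark.sql.functions.udf
import org.apache.spark.sql.types.{StringType, StructField, StructType}

object RuntimeSchemaUdfSketch {
  // A schema decided at runtime rather than derived from Scala types
  val resultSchema: StructType = StructType(Seq(
    StructField("key", StringType, nullable = true),
    StructField("value", StringType, nullable = true)
  ))

  // The function must return a value compatible with resultSchema (a Row here)
  val splitKeyValue = udf((s: String) => {
    val parts = s.split("=", 2)
    Row(parts(0), if (parts.length > 1) parts(1) else null)
  }, resultSchema)
}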
Example 31
Source File: FlintTestData.scala    From flint   with Apache License 2.0
package org.apache.spark.sql
import org.apache.spark.sql.functions.{ udf, sum }
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions.percent_rank

trait FlintTestData {
  protected def sqlContext: SQLContext

  private object internalImplicits extends SQLImplicits {
    override protected def _sqlContext: SQLContext = sqlContext
  }

  import internalImplicits._
  import FlintTestData._

  protected lazy val testData: DataFrame = {
    val df = sqlContext.sparkContext.parallelize(
      (0 to 97).map(i => TestData(i.toLong, i.toDouble))
    ).toDF()
    df
  }

  protected lazy val testData2: DataFrame = {
    val df = sqlContext.sparkContext.parallelize(
      (0 to 101).map(i => TestData2(i.toLong, i.toDouble, -i.toDouble))
    ).toDF()
    df
  }

  protected lazy val testDataCached: DataFrame = {
    val df = DFConverter.newDataFrame(testData)
    df.cache
    df.count
    df
  }

  protected val withTime2Column = { df: DataFrame => df.withColumn("time2", df("time") * 2) }
  protected val withTime3ColumnUdf = { df: DataFrame =>
    val testUdf = udf({ time: Long => time * 2 })
    df.withColumn("time3", testUdf(df("time")))
  }
  protected val selectV = { df: DataFrame => df.select("v") }
  protected val selectExprVPlusOne = { df: DataFrame => df.selectExpr("v + 1 as v") }
  protected val filterV = { df: DataFrame => df.filter(df("v") > 0) }

  protected val orderByTime = { df: DataFrame => df.orderBy("time") }
  protected val orderByV = { df: DataFrame => df.orderBy("v") }
  protected val addRankColumn = { df: DataFrame =>
    df.withColumn("rank", percent_rank().over(Window.partitionBy("time").orderBy("v")))
  }
  protected val selectSumV = { df: DataFrame => df.select(sum("v")) }
  protected val selectExprSumV = { df: DataFrame => df.selectExpr("sum(v)") }
  protected val groupByTimeSumV = { df: DataFrame => df.groupBy("time").agg(sum("v").alias("v")) }
  protected val repartition = { df: DataFrame => df.repartition(10) }
  protected val coalesce = { df: DataFrame => df.coalesce(5) }

  protected val cache = { df: DataFrame => df.cache(); df.count(); df }
  protected val unpersist = { df: DataFrame => df.unpersist() }
}

object FlintTestData {
  case class TestData(time: Long, v: Double)
  case class TestData2(time: Long, v: Double, v2: Double)
} 
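Each helper above is just a `DataFrame => DataFrame` value, so they compose with ordinary function combinators such as `andThen`. A small standalone sketch in the same style, assuming only a SparkSession named `spark`:

import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions.udf
import spark.implicits._

val df = spark.range(0, 10).toDF("time").withColumn("v", $"time" * 0.5)

// Plain column arithmetic versus the same doubling done through a udf.
val withTime2: DataFrame => DataFrame = d => d.withColumn("time2", d("time") * 2)
val withTime3Udf: DataFrame => DataFrame = { d =>
  val doubleTime = udf { time: Long => time * 2 }
  d.withColumn("time3", doubleTime(d("time")))
}

(withTime2 andThen withTime3Udf)(df).show()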
Example 32
Source File: ReebDiagramTest.scala    From spark-tda   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.feature

import org.apache.spark.ml.linalg.{Vectors, EuclideanDistance, Vector}
import org.apache.spark.sql.functions.{col, explode, udf}
import org.scalatest.{PropSpec, Matchers, GivenWhenThen}
import org.scalatest.prop.GeneratorDrivenPropertyChecks


class ReebDiagramTest
    extends FeaturePropSpec
    with GivenWhenThen
    with GeneratorDrivenPropertyChecks
    with Matchers {
  val assembler = new VectorAssembler()
    .setInputCols(Array("double", "integer"))
    .setOutputCol("vector")
  val cover = new Cover()
    .setExploding(true)
    .setInputCols("double", "integer")
    .setOutputCol("cover_id")

  property("argument topTreeSize must be positive") {
    intercept[IllegalArgumentException] {
      val reeb = new ReebDiagram()
//        .setIdCol("id")
//        .setCoverCol("cover_id")
//        .setFeaturesCol("vector")
//        .setOutputCol("cluster_id")
        .setTopTreeSize(0)
    }
  }

  property("placeholder") {
    val reeb = new ReebDiagram()
      .setK(15)
      .setIdCol("id")
      .setCoverCol("cover_id")
      .setFeaturesCol("vector")
      .setOutputCol("cluster_id")
    forAll(dataframeGen.arbitrary) { df =>
      val assembled = assembler.transform(df)
      whenever(
        assembled.count() > 0 && hasDistinctValues(assembled,
                                                   "double",
                                                   "integer")) {
        val transformed = cover
          .fit(assembled)
          .transform(assembled)
        val result = reeb
          .setTopTreeSize(1)
          .fit(transformed)
          .transform(transformed)
//        result.show()
      }
    }
  }
} 
Example 33
Source File: CoverTest.scala    From spark-tda   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.feature

import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.sql.functions.{col, explode, udf}
import org.scalatest.{PropSpec, Matchers, GivenWhenThen}
import org.scalatest.prop.GeneratorDrivenPropertyChecks


class CoverTest
    extends FeaturePropSpec
    with GivenWhenThen
    with GeneratorDrivenPropertyChecks
    with Matchers {
  val assembler = new VectorAssembler()
    .setInputCols(Array("double", "integer"))
    .setOutputCol("vector")

  property("argument numSplits must be positive") {
    intercept[IllegalArgumentException] {
      val cover = new Cover()
        .setInputCols("double")
        .setOutputCol("cover_ids")
        .setNumSplits(0)
    }
  }

  property("argument overlapRatio must be positive") {
    intercept[IllegalArgumentException] {
      val cover = new Cover()
        .setInputCols("double")
        .setOutputCol("cover_ids")
        .setOverlapRatio(0.0)
    }
  }

  property("cover estimator changes nothing with the original dataframe") {
    val cover = new Cover()
      .setInputCols("double", "integer", "vector")
      .setOutputCol("cover_ids")

    forAll(dataframeGen.arbitrary) { df =>
      val transformed = assembler.transform(df)
      whenever(
        transformed.count() > 0 && hasDistinctValues(transformed,
                                                     "double",
                                                     "integer",
                                                     "vector")) {
        val covered = cover
          .fit(transformed)
          .transform(transformed)
          .drop("cover_ids")
          .except(transformed)
          .count() should be(0)
      }
    }
  }

  property("generated cover covers all range of specified columns") {
    val cover = new Cover()
      .setInputCols("double", "integer", "vector")
      .setOutputCol("cover_ids")
    val uncovered = udf { xs: Seq[Long] =>
      xs.length == 0
    }

    forAll(dataframeGen.arbitrary) { df =>
      val transformed = assembler.transform(df)
      whenever(
        transformed.count() > 0 && hasDistinctValues(transformed,
                                                     "double",
                                                     "integer",
                                                     "vector")) {
        cover
          .fit(transformed)
          .transform(transformed)
          .where(uncovered(col("cover_ids")))
          .count() should be(0)
      }
    }
  }

  property("Cover is readable/writable") {
    val cover = new Cover()
      .setInputCols("double", "integer")
      .setOutputCol("cover_ids")
    testDefaultReadWrite(cover)
  }

  property("CoverModel is readable/writable") {
    val model = new CoverModel("myCoverModel",
                               Vectors.dense(-1.0, 0.0),
                               Vectors.dense(1.0, 10.0))
      .setInputCols("double", "integer")
      .setOutputCol("cover_ids")
    val newModel = testDefaultReadWrite(model)
    assert(newModel.min === model.min)
    assert(newModel.max === model.max)
  }
} 
Example 34
Source File: MathUnary.scala    From mleap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.mleap.feature

import ml.combust.mleap.core.feature.{MathUnaryModel, UnaryOperation}
import org.apache.hadoop.fs.Path
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol}
import org.apache.spark.ml.util.{DefaultParamsReader, DefaultParamsWriter, Identifiable, MLReadable, MLReader, MLWritable, MLWriter}
import org.apache.spark.sql.{DataFrame, Dataset}
import org.apache.spark.sql.types.{DoubleType, NumericType, StructField, StructType}
import org.apache.spark.sql.functions.udf


    private val className = classOf[MathUnary].getName

    override def load(path: String): MathUnary = {
      val metadata = DefaultParamsReader.loadMetadata(path, sc, className)

      val dataPath = new Path(path, "data").toString

      val data = sparkSession.read.parquet(dataPath).select("operation").head()
      val operation = data.getAs[String](0)

      val model = MathUnaryModel(UnaryOperation.forName(operation))
      val transformer = new MathUnary(metadata.uid, model)

      metadata.getAndSetParams(transformer)
      transformer
    }
  }

} 
Example 35
Source File: MultinomialLabeler.scala    From mleap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.mleap.feature

import ml.combust.mleap.core.feature.MultinomialLabelerModel
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.linalg.{Vector, VectorUDT}
import org.apache.spark.ml.mleap.param.{HasLabelsCol, HasProbabilitiesCol}
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.param.shared.HasFeaturesCol
import org.apache.spark.ml.util.Identifiable
import org.apache.spark.sql.{DataFrame, Dataset}
import org.apache.spark.sql.types._
import org.apache.spark.sql.functions.{udf, col}
import ml.combust.mleap.core.util.VectorConverters._


class MultinomialLabeler(override val uid: String = Identifiable.randomUID("math_unary"),
                         val model: MultinomialLabelerModel) extends Transformer
  with HasFeaturesCol
  with HasProbabilitiesCol
  with HasLabelsCol {

  def setFeaturesCol(value: String): this.type = set(featuresCol, value)
  def setProbabilitiesCol(value: String): this.type = set(probabilitiesCol, value)
  def setLabelsCol(value: String): this.type = set(labelsCol, value)

  @org.apache.spark.annotation.Since("2.0.0")
  override def transform(dataset: Dataset[_]): DataFrame = {
    val probabilitiesUdf = udf {
      (vector: Vector) => model.top(vector).map(_._1).toArray
    }

    val labelsUdf = udf {
      (vector: Vector) => model.topLabels(vector).toArray
    }

    dataset.withColumn($(probabilitiesCol), probabilitiesUdf(col($(featuresCol)))).
      withColumn($(labelsCol), labelsUdf(col($(featuresCol))))
  }

  override def copy(extra: ParamMap): Transformer =
    copyValues(new MultinomialLabeler(uid, model), extra)

  @DeveloperApi
  override def transformSchema(schema: StructType): StructType = {
    require(schema($(featuresCol)).dataType.isInstanceOf[VectorUDT],
      s"Features column must be of type NumericType but got ${schema($(featuresCol)).dataType}")
    val inputFields = schema.fields
    require(!inputFields.exists(_.name == $(probabilitiesCol)),
      s"Output column ${$(probabilitiesCol)} already exists.")
    require(!inputFields.exists(_.name == $(labelsCol)),
      s"Output column ${$(labelsCol)} already exists.")

    StructType(schema.fields ++ Seq(StructField($(probabilitiesCol), ArrayType(DoubleType)),
      StructField($(labelsCol), ArrayType(StringType))))
  }
} 
Example 36
Source File: PointSuite.scala    From magellan   with Apache License 2.0 5 votes vote down vote up
package magellan

import com.fasterxml.jackson.databind.ObjectMapper
import org.apache.spark.sql.types._
import org.scalatest.FunSuite

class PointSuite extends FunSuite with TestSparkContext {

  test("bounding box") {
    val point = Point(1.0, 1.0)
    val BoundingBox(xmin, ymin, xmax, ymax) = point.boundingBox
    assert(xmin === 1.0)
    assert(ymin === 1.0)
    assert(xmax === 1.0)
    assert(ymax === 1.0)
  }

  test("serialization") {
    val point = Point(1.0, 1.0)
    val pointUDT = new PointUDT
    val BoundingBox(xmin, ymin, xmax, ymax) = point.boundingBox
    val row = pointUDT.serialize(point)
    assert(row.getInt(0) === point.getType())
    assert(row.getDouble(1) === xmin)
    assert(row.getDouble(2) === ymin)
    assert(row.getDouble(3) === xmax)
    assert(row.getDouble(4) === ymax)
    val serializedPoint = pointUDT.deserialize(row)
    assert(point.equals(serializedPoint))
  }

  test("point udf") {
    val sqlContext = this.sqlContext
    import sqlContext.implicits._
    val points = sc.parallelize(Seq((-1.0, -1.0), (-1.0, 1.0), (1.0, -1.0))).toDF("x", "y")
    import org.apache.spark.sql.functions.udf
    val toPointUDF = udf{(x:Double,y:Double) => Point(x,y) }
    val point = points.withColumn("point", toPointUDF('x, 'y))
      .select('point)
      .first()(0)
      .asInstanceOf[Point]

    assert(point.getX() === -1.0)
    assert(point.getY() === -1.0)
  }

  test("jackson serialization") {
    val s = new ObjectMapper().writeValueAsString(Point(1.0, 1.0))
    assert(s.contains("boundingBox"))
    assert(s.contains("x"))
    assert(s.contains("y"))
  }

  test("within circle") {
    assert(Point(0.0, 0.0) withinCircle (Point(0.5, 0.5), 0.75))
    assert(!(Point(0.0, 0.0) withinCircle (Point(0.5, 0.5), 0.5)))
  }

  test("buffer point") {
    val polygon = Point(0.0, 1.0).buffer(0.5)
    assert(polygon.getNumRings() === 1)
    // check that [0.0, 0.75] is within this polygon
    assert(polygon.contains(Point(0.0, 0.75)))
    // check that [0.4, 1.0] is within this polygon
    assert(polygon.contains(Point(0.4, 1.0)))
    // check that [0.6, 1.0] is outside this polygon
    assert(!polygon.contains(Point(0.6, 1.0)))
  }
} 
Example 37
Source File: ShortestPaths.scala    From graphframes   with Apache License 2.0 5 votes vote down vote up
package org.graphframes.lib

import java.util

import scala.collection.JavaConverters._

import org.apache.spark.graphx.{lib => graphxlib}
import org.apache.spark.sql.{Column, DataFrame, Row}
import org.apache.spark.sql.api.java.UDF1
import org.apache.spark.sql.functions.{col, udf}
import org.apache.spark.sql.types.{IntegerType, MapType}

import org.graphframes.GraphFrame


  def landmarks(value: util.ArrayList[Any]): this.type = {
    landmarks(value.asScala)
  }

  def run(): DataFrame = {
    ShortestPaths.run(graph, check(lmarks, "landmarks"))
  }
}

private object ShortestPaths {

  private def run(graph: GraphFrame, landmarks: Seq[Any]): DataFrame = {
    val idType = graph.vertices.schema(GraphFrame.ID).dataType
    val longIdToLandmark = landmarks.map(l => GraphXConversions.integralId(graph, l) -> l).toMap
    val gx = graphxlib.ShortestPaths.run(
      graph.cachedTopologyGraphX,
      longIdToLandmark.keys.toSeq.sorted).mapVertices { case (_, m) => m.toSeq }
    val g = GraphXConversions.fromGraphX(graph, gx, vertexNames = Seq(DISTANCE_ID))
    val distanceCol: Column = if (graph.hasIntegralIdType) {
      // It seems there is no easy way to convert a sequence of pairs into a map
      val mapToLandmark = udf { distances: Seq[Row] =>
        distances.map { case Row(k: Long, v: Int) =>
          k -> v
        }.toMap
      }
      mapToLandmark(g.vertices(DISTANCE_ID))
    } else {
      val func = new UDF1[Seq[Row], Map[Any, Int]] {
        override def call(t1: Seq[Row]): Map[Any, Int] = {
          t1.map { case Row(k: Long, v: Int) =>
              longIdToLandmark(k) -> v
          }.toMap
        }
      }
      val mapToLandmark = udf(func, MapType(idType, IntegerType, false))
      mapToLandmark(col(DISTANCE_ID))
    }
    val cols = graph.vertices.columns.map(col) :+ distanceCol.as(DISTANCE_ID)
    g.vertices.select(cols: _*)
  }

  private val DISTANCE_ID = "distances"

} 
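The `mapToLandmark` udf above relies on the fact that an array-of-struct column arrives inside a Scala udf as a `Seq[Row]`. A standalone sketch of that conversion trick, assuming a SparkSession named `spark` and a made-up `pairs` column:

import org.apache.spark.sql.Row
import org.apache.spark.sql.functions.udf
import spark.implicits._

// Turn an array<struct<_1: bigint, _2: int>> column into a map<bigint, int> column.
val pairsToMap = udf { pairs: Seq[Row] =>
  pairs.map { case Row(k: Long, v: Int) => k -> v }.toMap
}

val df = Seq(Tuple1(Seq((1L, 3), (2L, 5)))).toDF("pairs")
df.select(pairsToMap($"pairs").as("asMap")).show(false)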
Example 38
Source File: HashingTF.scala    From sparkoscope   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.feature

import org.apache.spark.annotation.Since
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.attribute.AttributeGroup
import org.apache.spark.ml.param._
import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol}
import org.apache.spark.ml.util._
import org.apache.spark.mllib.feature
import org.apache.spark.sql.{DataFrame, Dataset}
import org.apache.spark.sql.functions.{col, udf}
import org.apache.spark.sql.types.{ArrayType, StructType}


  @Since("2.0.0")
  def setBinary(value: Boolean): this.type = set(binary, value)

  @Since("2.0.0")
  override def transform(dataset: Dataset[_]): DataFrame = {
    val outputSchema = transformSchema(dataset.schema)
    val hashingTF = new feature.HashingTF($(numFeatures)).setBinary($(binary))
    // TODO: Make the hashingTF.transform natively in ml framework to avoid extra conversion.
    val t = udf { terms: Seq[_] => hashingTF.transform(terms).asML }
    val metadata = outputSchema($(outputCol)).metadata
    dataset.select(col("*"), t(col($(inputCol))).as($(outputCol), metadata))
  }

  @Since("1.4.0")
  override def transformSchema(schema: StructType): StructType = {
    val inputType = schema($(inputCol)).dataType
    require(inputType.isInstanceOf[ArrayType],
      s"The input column must be ArrayType, but got $inputType.")
    val attrGroup = new AttributeGroup($(outputCol), $(numFeatures))
    SchemaUtils.appendColumn(schema, attrGroup.toStructField())
  }

  @Since("1.4.1")
  override def copy(extra: ParamMap): HashingTF = defaultCopy(extra)
}

@Since("1.6.0")
object HashingTF extends DefaultParamsReadable[HashingTF] {

  @Since("1.6.0")
  override def load(path: String): HashingTF = super.load(path)
} 
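Since HashingTF ships with Spark ML, exercising the udf-backed `transform` above only takes a few lines. A quick usage sketch, assuming a SparkSession named `spark`:

import org.apache.spark.ml.feature.HashingTF
import spark.implicits._

val df = Seq(Tuple1(Seq("spark", "udf", "example"))).toDF("words")

new HashingTF()
  .setInputCol("words")
  .setOutputCol("tf")
  .setNumFeatures(32)
  .transform(df)
  .show(false)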
Example 39
Source File: Cleaner.scala    From CkoocNLP   with Apache License 2.0 5 votes vote down vote up
package functions.clean

import com.hankcs.hanlp.HanLP
import config.paramconf.{HasOutputCol, HasInputCol}
import functions.MySchemaUtils
import functions.clean.chinese.BCConvert
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.param.{IntParam, Param, ParamMap}
import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable}
import org.apache.spark.sql.functions.{col, udf}
import org.apache.spark.sql.types.{StringType, StructType}
import org.apache.spark.sql.{DataFrame, Dataset}



  setDefault(fanjan -> "f2j", quanban -> "q2b", minLineLen -> 1)

  override def transform(dataset: Dataset[_]): DataFrame = {
    val outputSchema = transformSchema(dataset.schema, logging = true)

    val cleanFunc = udf {line: String =>
      var cleaned = ""
      getFanJian match {
        case "f2j" => cleaned = HanLP.convertToSimplifiedChinese(line)
        case "j2f" => cleaned = HanLP.convertToTraditionalChinese(line)
        case _ => cleaned = line
      }

      getQuanBan match {
        case "q2b" => cleaned = BCConvert.qj2bj(cleaned)
        case "b2q" => cleaned = BCConvert.bj2qj(cleaned)
        case _ => cleaned = cleaned
      }

      cleaned
    }

    val metadata = outputSchema($(outputCol)).metadata
    dataset.select(col("*"), cleanFunc(col($(inputCol))).as($(outputCol), metadata)).filter{record =>
      val outputIndex = record.fieldIndex($(outputCol))
      record.getString(outputIndex).length >= getMinLineLen
    }
  }

  override def copy(extra: ParamMap): Transformer = defaultCopy(extra)

  override def transformSchema(schema: StructType): StructType = {
    val inputType = schema($(inputCol)).dataType
    require(inputType.typeName.equals(StringType.typeName),
      s"Input type must be StringType but got $inputType.")
    MySchemaUtils.appendColumn(schema, $(outputCol), inputType, schema($(inputCol)).nullable)
  }
}


object Cleaner extends DefaultParamsReadable[Cleaner] {
  override def load(path: String): Cleaner = super.load(path)
} 
Example 40
Source File: HigherOrderKind.scala    From cleanframes   with Apache License 2.0 5 votes vote down vote up
package cleanframes.instances

import cleanframes.Cleaner
import org.apache.spark.sql.functions.udf

import scala.reflect.runtime.universe.TypeTag

trait HigherOrderKind {

  implicit def stringHigherOrder[A, B[_]](implicit
                                          aTag: TypeTag[A],
                                          bTag: TypeTag[B[A]],
                                          func: String => B[A]): Cleaner[B[A]] =
    Cleaner.materialize { (frame, name, alias) =>
      List(
        udf(func).apply(frame.col(name.get)) as alias.get
      )
    }
} 
Example 41
Source File: ServingUDFs.scala    From mmlspark   with MIT License 5 votes vote down vote up
// Copyright (C) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See LICENSE in project root for information.

package org.apache.spark.sql.execution.streaming

import com.microsoft.ml.spark.io.http.HTTPResponseData
import com.microsoft.ml.spark.io.http.HTTPSchema.{binary_to_response, empty_response, string_to_response}
import org.apache.spark.sql.execution.streaming.continuous.HTTPSourceStateHolder
import org.apache.spark.sql.expressions.UserDefinedFunction
import org.apache.spark.sql.functions.{lit, struct, to_json, udf}
import org.apache.spark.sql.types._
import org.apache.spark.sql.{Column, Row}

import scala.util.Try

object ServingUDFs {

  private def jsonReply(c: Column) = string_to_response(to_json(c))

  def makeReplyUDF(data: Column, dt: DataType, code: Column = lit(200), reason: Column = lit("Success")): Column = {
    dt match {
      case NullType => empty_response(code, reason)
      case StringType => string_to_response(data, code, reason)
      case BinaryType => binary_to_response(data)
      case _: StructType => jsonReply(data)
      case _: MapType => jsonReply(data)
      case at: ArrayType => at.elementType match {
        case _: StructType => jsonReply(data)
        case _: MapType => jsonReply(data)
        case _ => jsonReply(struct(data))
      }
      case _ => jsonReply(struct(data))
    }
  }

  private def sendReplyHelper(mapper: Row => HTTPResponseData)(serviceName: String, reply: Row, id: Row): Boolean = {
    if (Option(reply).isEmpty || Option(id).isEmpty) {
      null.asInstanceOf[Boolean] //scalastyle:ignore null
    } else {
      Try(HTTPSourceStateHolder.getServer(serviceName).replyTo(id.getString(0), id.getString(1), mapper(reply)))
        .toOption.isDefined
    }
  }

  def sendReplyUDF: UserDefinedFunction = {
    val toData = HTTPResponseData.makeFromRowConverter
    udf(sendReplyHelper(toData) _, BooleanType)
  }

} 
Example 42
Source File: PageSplitter.scala    From mmlspark   with MIT License 5 votes vote down vote up
// Copyright (C) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See LICENSE in project root for information.

package com.microsoft.ml.spark.featurize.text

import com.microsoft.ml.spark.core.contracts.{HasInputCol, HasOutputCol, Wrappable}
import org.apache.spark.ml._
import org.apache.spark.ml.param._
import org.apache.spark.ml.util._
import org.apache.spark.sql.functions.{col, udf}
import org.apache.spark.sql.types._
import org.apache.spark.sql.{DataFrame, Dataset}

object PageSplitter extends DefaultParamsReadable[PageSplitter]


class PageSplitter(override val uid: String)
  extends Transformer with HasInputCol with HasOutputCol with Wrappable with DefaultParamsWritable {

  def this() = this(Identifiable.randomUID("PageSplitter"))

  setDefault(outputCol, uid + "_output")

  val maximumPageLength =
    new IntParam(this, "maximumPageLength",
      "the maximum number of characters to be in a page")

  def setMaximumPageLength(v: Int): this.type = set(maximumPageLength, v)

  def getMaximumPageLength: Int = $(maximumPageLength)

  val minimumPageLength =
    new IntParam(this, "minimumPageLength",
      "the the minimum number of characters " +
        "to have on a page in order to preserve work boundaries")

  def setMinimumPageLength(v: Int): this.type = set(minimumPageLength, v)

  def getMinimumPageLength: Int = $(minimumPageLength)

  val boundaryRegex = new Param[String](this, "boundaryRegex", "how to split into words")

  def setBoundaryRegex(v: String): this.type = set(boundaryRegex, v)

  def getBoundaryRegex: String = $(boundaryRegex)

  setDefault(maximumPageLength -> 5000, minimumPageLength -> 4500, boundaryRegex -> "\\s")

  def split(textOpt: String): Seq[String] = {
    Option(textOpt).map { text =>
      if (text.length < getMaximumPageLength) {
        Seq(text)
      } else {
        val lengths = text
          .split(getBoundaryRegex)
          .map(_.length)
          .flatMap(l => List(l, 1))
          .dropRight(1)

        val indices = lengths.scanLeft((0, 0, Nil: List[Int])) { case ((total, count, _), l) =>
          if (count + l < getMaximumPageLength) {
            (total + l, count + l, Nil)
          } else if (count > getMinimumPageLength) {
            (total + l, l, List(total))
          } else {
            val firstPageChars = getMaximumPageLength - count
            val firstPage = firstPageChars + total
            val remainingChars = l - firstPageChars

            val numPages = remainingChars / getMaximumPageLength
            val remainder = remainingChars - getMaximumPageLength * numPages
            val pages = List(firstPage) ::: (1 to numPages).map(i =>
              total + firstPageChars + getMaximumPageLength * i).toList
            (total + l, remainder, pages)
          }
        }.flatMap(_._3)

        val words = (List(0) ::: indices.toList ::: List(text.length))
          .sliding(2)
          .map { case List(start, end) => text.substring(start, end) }
          .toSeq
        words
      }
    }.orNull
  }

  override def transform(dataset: Dataset[_]): DataFrame = {
    dataset.toDF().withColumn(getOutputCol, udf(split _, ArrayType(StringType))(col(getInputCol)))
  }

  override def copy(extra: ParamMap): PageSplitter =
    defaultCopy(extra)

  def transformSchema(schema: StructType): StructType = {
    assert(schema(getInputCol).dataType == StringType)
    schema.add(getOutputCol, ArrayType(StringType))
  }

} 
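A hedged usage sketch for the splitter above, assuming a SparkSession named `spark` and that the HasInputCol/HasOutputCol traits it mixes in provide the usual setInputCol/setOutputCol setters:

import spark.implicits._

val splitter = new PageSplitter()
  .setInputCol("text")
  .setOutputCol("pages")
  .setMinimumPageLength(80)
  .setMaximumPageLength(100)

// One long document gets broken into pages of at most 100 characters,
// split on whitespace wherever the minimum page length allows.
val docs = Seq(Tuple1(("lorem ipsum dolor sit amet " * 20).trim)).toDF("text")
splitter.transform(docs).show(false)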
Example 43
Source File: Lambda.scala    From mmlspark   with MIT License 5 votes vote down vote up
// Copyright (C) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See LICENSE in project root for information.

package com.microsoft.ml.spark.stages

import com.microsoft.ml.spark.core.contracts.Wrappable
import org.apache.spark.SparkContext
import org.apache.spark.ml.{ComplexParamsReadable, ComplexParamsWritable, Transformer}
import org.apache.spark.ml.param.{ParamMap, UDFParam}
import org.apache.spark.ml.util.Identifiable
import org.apache.spark.sql.functions.udf
import org.apache.spark.sql.types.{StringType, StructType}
import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}

object Lambda extends ComplexParamsReadable[Lambda] {
  def apply(f: Dataset[_] => DataFrame): Lambda = {
    new Lambda().setTransform(f)
  }
}

class Lambda(val uid: String) extends Transformer with Wrappable with ComplexParamsWritable {
  def this() = this(Identifiable.randomUID("Lambda"))

  val transformFunc = new UDFParam(this, "transformFunc", "holder for dataframe function")

  def setTransform(f: Dataset[_] => DataFrame): this.type = {
    set(transformFunc, udf(f, StringType))
  }

  def getTransform: Dataset[_] => DataFrame = {
    $(transformFunc).f.asInstanceOf[Dataset[_] => DataFrame]
  }

  val transformSchemaFunc = new UDFParam(this, "transformSchemaFunc", "the output schema after the transformation")

  def setTransformSchema(f: StructType => StructType): this.type = {
    set(transformSchemaFunc, udf(f, StringType))
  }

  def getTransformSchema: StructType => StructType = {
    $(transformSchemaFunc).f.asInstanceOf[StructType => StructType]
  }

  override def transform(dataset: Dataset[_]): DataFrame = {
    getTransform(dataset)
  }

  def transformSchema(schema: StructType): StructType = {
    if (get(transformSchemaFunc).isEmpty) {
      val sc = SparkContext.getOrCreate()
      val df = SparkSession.builder().getOrCreate().createDataFrame(sc.emptyRDD[Row], schema)
      transform(df).schema
    } else {
      getTransformSchema(schema)
    }
  }

  def copy(extra: ParamMap): Lambda = defaultCopy(extra)

} 
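A minimal usage sketch for `Lambda`, assuming a SparkSession named `spark` and the class above on the classpath; the function handed to `Lambda.apply` is an arbitrary `Dataset[_] => DataFrame`:

import org.apache.spark.sql.functions.col

// Wrap an ad-hoc filtering step as a Transformer so it can sit inside a Pipeline.
val keepNonNegative = Lambda(df => df.toDF().filter(col("value") >= 0))
keepNonNegative.transform(spark.range(-3, 3).toDF("value")).show()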
Example 44
Source File: udfs.scala    From mmlspark   with MIT License 5 votes vote down vote up
// Copyright (C) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See LICENSE in project root for information.

package com.microsoft.ml.spark.stages

import org.apache.spark.ml.linalg.SQLDataTypes.VectorType
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.sql.Column
import org.apache.spark.sql.expressions.UserDefinedFunction
import org.apache.spark.sql.functions.{col, udf}
import org.apache.spark.sql.types.DoubleType

import scala.collection.mutable

//scalastyle:off
object udfs {

  def get_value_at(colName: String, i: Int): Column = {
    udf({
      vec: org.apache.spark.ml.linalg.Vector => vec(i)
    }, DoubleType)(col(colName))
  }

  val to_vector: UserDefinedFunction = udf({
    arr: Seq[Double] => Vectors.dense(arr.toArray)
  }, VectorType)

  def to_vector(colName: String): Column = to_vector(col(colName))

} 
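Both helpers above return ordinary `Column`s, so they drop straight into `withColumn`/`select`. A quick sketch, assuming a SparkSession named `spark` and the object above imported:

import com.microsoft.ml.spark.stages.udfs.{get_value_at, to_vector}
import spark.implicits._

val df = Seq(Tuple1(Seq(1.0, 2.0, 3.0))).toDF("arr")

df.withColumn("vec", to_vector("arr"))         // array<double> -> ml Vector
  .select(get_value_at("vec", 1).as("second")) // pick out element 1, i.e. 2.0
  .show()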
Example 45
Source File: VowpalWabbitClassifier.scala    From mmlspark   with MIT License 5 votes vote down vote up
// Copyright (C) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See LICENSE in project root for information.

package com.microsoft.ml.spark.vw

import com.microsoft.ml.spark.core.env.InternalWrapper
import com.microsoft.ml.spark.core.schema.DatasetExtensions
import org.apache.spark.ml.ComplexParamsReadable
import org.apache.spark.ml.param._
import org.apache.spark.ml.util._
import org.apache.spark.ml.classification.{ProbabilisticClassificationModel, ProbabilisticClassifier}
import org.apache.spark.ml.linalg.{Vector, Vectors}
import org.apache.spark.sql._
import org.apache.spark.sql.functions.{col, udf}
import org.vowpalwabbit.spark.VowpalWabbitExample
import com.microsoft.ml.spark.core.schema.DatasetExtensions._

import scala.math.exp

object VowpalWabbitClassifier extends DefaultParamsReadable[VowpalWabbitClassifier]

@InternalWrapper
class VowpalWabbitClassifier(override val uid: String)
  extends ProbabilisticClassifier[Row, VowpalWabbitClassifier, VowpalWabbitClassificationModel]
  with VowpalWabbitBase
{
  def this() = this(Identifiable.randomUID("VowpalWabbitClassifier"))

  // to support Grid search we need to replicate the parameters here...
  val labelConversion = new BooleanParam(this, "labelConversion",
    "Convert 0/1 Spark ML style labels to -1/1 VW style labels. Defaults to true.")
  setDefault(labelConversion -> true)
  def getLabelConversion: Boolean = $(labelConversion)
  def setLabelConversion(value: Boolean): this.type = set(labelConversion, value)

  override protected def train(dataset: Dataset[_]): VowpalWabbitClassificationModel = {
    val model = new VowpalWabbitClassificationModel(uid)
      .setFeaturesCol(getFeaturesCol)
      .setAdditionalFeatures(getAdditionalFeatures)
      .setPredictionCol(getPredictionCol)
      .setProbabilityCol(getProbabilityCol)
      .setRawPredictionCol(getRawPredictionCol)

    val finalDataset = if (!getLabelConversion)
      dataset
    else {
      val inputLabelCol = dataset.withDerivativeCol("label")
      dataset
        .withColumnRenamed(getLabelCol, inputLabelCol)
        .withColumn(getLabelCol, col(inputLabelCol) * 2 - 1)
    }

    trainInternal(finalDataset, model)
  }

  override def copy(extra: ParamMap): VowpalWabbitClassifier = defaultCopy(extra)
}

// Preparation for multi-class learning, though it's no fun as numClasses is spread across multiple reductions
@InternalWrapper
class VowpalWabbitClassificationModel(override val uid: String)
  extends ProbabilisticClassificationModel[Row, VowpalWabbitClassificationModel]
    with VowpalWabbitBaseModel {

  def numClasses: Int = 2

  override def transform(dataset: Dataset[_]): DataFrame = {
    val df = transformImplInternal(dataset)

    // which mode to use depends a bit on how this should be deployed
    // 1. if you stay in Spark, w/o --link logistic is probably more convenient as it also returns the raw prediction
    // 2. if you want to export the model *and* get probabilities at scoring time, w/ --link logistic is preferable

    // convert raw prediction to probability (if needed)
    val probabilityUdf = if (vwArgs.getArgs.contains("--link logistic"))
      udf { (pred: Double) => Vectors.dense(Array(1 - pred, pred)) }
    else
      udf { (pred: Double) => {
        val prob = 1.0 / (1.0 + exp(-pred))
        Vectors.dense(Array(1 - prob, prob))
      } }

    val df2 = df.withColumn($(probabilityCol), probabilityUdf(col($(rawPredictionCol))))

    // convert probability to prediction
    val probability2predictionUdf = udf(probability2prediction _)
    df2.withColumn($(predictionCol), probability2predictionUdf(col($(probabilityCol))))
  }

  override def copy(extra: ParamMap): this.type = defaultCopy(extra)

  protected override def predictRaw(features: Row): Vector = {
    throw new NotImplementedError("Not implemented")
  }

  protected override def raw2probabilityInPlace(rawPrediction: Vector): Vector= {
    throw new NotImplementedError("Not implemented")
  }
}

object VowpalWabbitClassificationModel extends ComplexParamsReadable[VowpalWabbitClassificationModel] 
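The raw-prediction-to-probability step above is just a logistic transform wrapped in a udf. A standalone sketch of the same conversion, assuming only a SparkSession named `spark`:

import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.sql.functions.{col, udf}
import spark.implicits._
import scala.math.exp

// Map a raw margin to a two-class probability vector via the sigmoid.
val toProbability = udf { margin: Double =>
  val p = 1.0 / (1.0 + exp(-margin))
  Vectors.dense(1 - p, p)
}

val scored = Seq(-2.0, 0.0, 1.5).toDF("rawPrediction")
scored.withColumn("probability", toProbability(col("rawPrediction"))).show(false)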
Example 46
Source File: VowpalWabbitInteractions.scala    From mmlspark   with MIT License 5 votes vote down vote up
// Copyright (C) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See LICENSE in project root for information.

package com.microsoft.ml.spark.vw

import com.microsoft.ml.spark.core.contracts.{HasInputCols, HasOutputCol, Wrappable}
import org.apache.spark.ml.{ComplexParamsReadable, ComplexParamsWritable, Transformer}
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.sql.{DataFrame, Dataset, Row}
import org.apache.spark.sql.functions.{col, struct, udf}
import org.apache.spark.ml.linalg.{Vector, Vectors}
import org.apache.spark.ml.util.Identifiable
import org.apache.spark.sql.types.{StructField, StructType}
import org.apache.spark.ml.linalg.SQLDataTypes.VectorType

object VowpalWabbitInteractions extends ComplexParamsReadable[VowpalWabbitInteractions]


class VowpalWabbitInteractions(override val uid: String) extends Transformer
  with HasInputCols with HasOutputCol with HasNumBits with HasSumCollisions with Wrappable with ComplexParamsWritable
{
  def this() = this(Identifiable.randomUID("VowpalWabbitInteractions"))

  override def transform(dataset: Dataset[_]): DataFrame = {
    val fieldSubset = dataset.schema.fields
      .filter(f => getInputCols.contains(f.name))

    val mask = getMask

    val mode = udf((r: Row) => {

      // compute the final number of features
      val numElems = (0 until r.length)
        .map(r.getAs[Vector](_).numNonzeros).product

      val newIndices = new Array[Int](numElems)
      val newValues = new Array[Double](numElems)

      // build interaction features using FNV-1
      val fnvPrime = 16777619
      var i = 0

      def interact(idx: Int, value: Double, ns: Int): Unit = {
        if (ns == r.size) {
          newIndices(i) += mask & idx
          newValues(i) += value

          i += 1
        }
        else {
          val idx1 = idx * fnvPrime

          r.getAs[Vector](ns).foreachActive { case (idx2, value2) =>
            interact(idx1 ^ idx2, value * value2, ns + 1)
          }
        }
      }

      // start the recursion
      interact(0, 1, 0)

      val (indicesSorted, valuesSorted) = VectorUtils.sortAndDistinct(newIndices, newValues, getSumCollisions)

      Vectors.sparse(1 << getNumBits, indicesSorted, valuesSorted)
    })

    dataset.toDF.withColumn(getOutputCol, mode.apply(struct(fieldSubset.map(f => col(f.name)): _*)))
  }

  override def transformSchema(schema: StructType): StructType = {
    val fieldNames = schema.fields.map(_.name)
    for (f <- getInputCols)
      if (!fieldNames.contains(f))
        throw new IllegalArgumentException("missing input column " + f)
      else {
        val fieldType = schema.fields(schema.fieldIndex(f)).dataType

        if (fieldType != VectorType)
          throw new IllegalArgumentException("column " + f + " must be of type Vector but is " + fieldType.typeName)
      }

    schema.add(StructField(getOutputCol, VectorType, true))
  }

  override def copy(extra: ParamMap): VowpalWabbitInteractions = defaultCopy(extra)
} 
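The transformer above uses a trick worth calling out: a udf that takes a `Row` can be applied to `struct(col1, col2, ...)` to consume several columns at once. A tiny standalone sketch of that pattern, assuming a SparkSession named `spark`:

import org.apache.spark.sql.Row
import org.apache.spark.sql.functions.{struct, udf}
import spark.implicits._

// Combine two columns inside a single udf by packing them into a struct first.
val scale = udf { r: Row => r.getDouble(0) * r.getInt(1) }

val df = Seq((2.0, 3), (1.5, 4)).toDF("x", "n")
df.withColumn("scaled", scale(struct($"x", $"n"))).show()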
Example 47
Source File: HTTPTransformer.scala    From mmlspark   with MIT License 5 votes vote down vote up
// Copyright (C) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See LICENSE in project root for information.

package com.microsoft.ml.spark.io.http

import com.microsoft.ml.spark.core.contracts.{HasInputCol, HasOutputCol, Wrappable}
import com.microsoft.ml.spark.io.http.HandlingUtils.HandlerFunc
import org.apache.spark.ml.{ComplexParamsReadable, ComplexParamsWritable, Transformer}
import org.apache.spark.ml.param._
import org.apache.spark.ml.util.Identifiable
import org.apache.spark.sql.catalyst.encoders.RowEncoder
import org.apache.spark.sql.functions.udf
import org.apache.spark.sql.types._
import org.apache.spark.sql.{DataFrame, Dataset, Row}

import scala.concurrent.ExecutionContext
import scala.concurrent.duration.Duration

trait HasHandler extends Params {
  val handler: UDFParam = new UDFParam(
    this, "handler", "Which strategy to use when handling requests")

  
  override def transform(dataset: Dataset[_]): DataFrame = {
    val df = dataset.toDF()
    val enc = RowEncoder(transformSchema(df.schema))
    val colIndex = df.schema.fieldNames.indexOf(getInputCol)
    val fromRow = HTTPRequestData.makeFromRowConverter
    val toRow = HTTPResponseData.makeToRowConverter
    df.mapPartitions { it =>
      if (!it.hasNext) {
        Iterator()
      }else{
        val c = clientHolder.get
        val responsesWithContext = c.sendRequestsWithContext(it.map{row =>
          c.RequestWithContext(Option(row.getStruct(colIndex)).map(fromRow), Some(row))
        })
        responsesWithContext.map { rwc =>
          Row.merge(rwc.context.get.asInstanceOf[Row], Row(rwc.response.flatMap(Option(_)).map(toRow).orNull))
        }
      }
    }(enc)
  }

  def copy(extra: ParamMap): HTTPTransformer = defaultCopy(extra)

  def transformSchema(schema: StructType): StructType = {
    assert(schema(getInputCol).dataType == HTTPSchema.Request)
    schema.add(getOutputCol, HTTPSchema.Response, nullable=true)
  }

} 
Example 48
Source File: SparkBindingsTest.scala    From mmlspark   with MIT License 5 votes vote down vote up
// Copyright (C) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See LICENSE in project root for information.

package com.microsoft.ml.spark.core.schema

import com.microsoft.ml.spark.core.test.base.TestBase
import org.apache.spark.sql.Row
import org.apache.spark.sql.functions.{col, udf}

case class Foo(a: Int, b: String, c: Seq[Bar])

object Foo extends SparkBindings[Foo]

case class Bar(a: Int, c: Seq[Byte])

object Bar extends SparkBindings[Bar]

class SparkBindingsTest2 extends TestBase {

  import session.implicits._

  test("Test to make sure there are no strange memory leaks") {
    (1 to 40).foreach { i =>
      val foos = (0 to 40).map(i => Tuple1(Foo(i, i.toString, Seq(Bar(i, "foo".getBytes)))))
      val converter = Foo.makeFromRowConverter
      val df = foos.toDF("foos")
        .repartition(2)
        .withColumn("mapped2",
          udf({ r: Row => converter(r) }, Foo.schema)(col("foos")))
      val results = df.collect().toList
      println(results.head)
    }
  }

} 
Example 49
Source File: ParserSuite.scala    From mmlspark   with MIT License 5 votes vote down vote up
// Copyright (C) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See LICENSE in project root for information.

package com.microsoft.ml.spark.io.split1

import com.microsoft.ml.spark.core.test.fuzzing.{TestObject, TransformerFuzzing}
import com.microsoft.ml.spark.io.http._
import org.apache.http.client.methods.HttpPost
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.util.MLReadable
import org.apache.spark.sql.functions.{col, udf}
import org.apache.spark.sql.types.{StringType, StructType}
import org.apache.spark.sql.{DataFrame, SparkSession}

trait ParserUtils extends WithServer {

  def sampleDf(spark: SparkSession): DataFrame = {
    val df = spark.createDataFrame((1 to 10).map(Tuple1(_)))
      .toDF("data")
    val df2 = new JSONInputParser().setInputCol("data")
      .setOutputCol("parsedInput").setUrl(url)
      .transform(df)
      .withColumn("unparsedOutput", udf({ x: Int =>
        HTTPResponseData(
          Array(),
          Some(EntityData(
            "{\"foo\": \"here\"}".getBytes, None, None, None, false, false, false)),
          StatusLineData(ProtocolVersionData("foo", 1, 1), 200, "bar"),
          "en")
      }).apply(col("data"))
      )

    new JSONOutputParser()
      .setDataType(new StructType().add("foo", StringType))
      .setInputCol("unparsedOutput")
      .setOutputCol("parsedOutput")
      .transform(df2)
  }

  def makeTestObject[T <: Transformer](t: T, session: SparkSession): Seq[TestObject[T]] = {
    Seq(new TestObject(t, sampleDf(session)))
  }

}

class JsonInputParserSuite extends TransformerFuzzing[JSONInputParser] with ParserUtils {
  override def testObjects(): Seq[TestObject[JSONInputParser]] = makeTestObject(
    new JSONInputParser().setInputCol("data").setOutputCol("out")
      .setUrl(url), session)

  override def reader: MLReadable[_] = JSONInputParser
}

class JsonOutputParserSuite extends TransformerFuzzing[JSONOutputParser] with ParserUtils {
  override def testObjects(): Seq[TestObject[JSONOutputParser]] = makeTestObject(
    new JSONOutputParser().setInputCol("unparsedOutput").setOutputCol("out")
      .setDataType(new StructType().add("foo", StringType)), session)

  override def reader: MLReadable[_] = JSONOutputParser
}

class StringOutputParserSuite extends TransformerFuzzing[StringOutputParser] with ParserUtils {
  override def testObjects(): Seq[TestObject[StringOutputParser]] = makeTestObject(
    new StringOutputParser().setInputCol("unparsedOutput").setOutputCol("out"), session)

  override def reader: MLReadable[_] = StringOutputParser
}

class CustomInputParserSuite extends TransformerFuzzing[CustomInputParser] with ParserUtils {
  override def testObjects(): Seq[TestObject[CustomInputParser]] = makeTestObject(
    new CustomInputParser().setInputCol("data").setOutputCol("out")
      .setUDF({ x: Int => new HttpPost(s"http://$x") }), session)

  override def reader: MLReadable[_] = CustomInputParser
}

class CustomOutputParserSuite extends TransformerFuzzing[CustomOutputParser] with ParserUtils {
  override def testObjects(): Seq[TestObject[CustomOutputParser]] = makeTestObject(
    new CustomOutputParser().setInputCol("unparsedOutput").setOutputCol("out")
      .setUDF({ x: HTTPResponseData => x.locale }), session)

  override def reader: MLReadable[_] = CustomOutputParser
} 
Example 50
Source File: HashingTF.scala    From multi-tenancy-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.feature

import org.apache.spark.annotation.Since
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.attribute.AttributeGroup
import org.apache.spark.ml.param._
import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol}
import org.apache.spark.ml.util._
import org.apache.spark.mllib.feature
import org.apache.spark.sql.{DataFrame, Dataset}
import org.apache.spark.sql.functions.{col, udf}
import org.apache.spark.sql.types.{ArrayType, StructType}


  @Since("2.0.0")
  def setBinary(value: Boolean): this.type = set(binary, value)

  @Since("2.0.0")
  override def transform(dataset: Dataset[_]): DataFrame = {
    val outputSchema = transformSchema(dataset.schema)
    val hashingTF = new feature.HashingTF($(numFeatures)).setBinary($(binary))
    // TODO: Make the hashingTF.transform natively in ml framework to avoid extra conversion.
    val t = udf { terms: Seq[_] => hashingTF.transform(terms).asML }
    val metadata = outputSchema($(outputCol)).metadata
    dataset.select(col("*"), t(col($(inputCol))).as($(outputCol), metadata))
  }

  @Since("1.4.0")
  override def transformSchema(schema: StructType): StructType = {
    val inputType = schema($(inputCol)).dataType
    require(inputType.isInstanceOf[ArrayType],
      s"The input column must be ArrayType, but got $inputType.")
    val attrGroup = new AttributeGroup($(outputCol), $(numFeatures))
    SchemaUtils.appendColumn(schema, attrGroup.toStructField())
  }

  @Since("1.4.1")
  override def copy(extra: ParamMap): HashingTF = defaultCopy(extra)
}

@Since("1.6.0")
object HashingTF extends DefaultParamsReadable[HashingTF] {

  @Since("1.6.0")
  override def load(path: String): HashingTF = super.load(path)
} 
Example 51
Source File: SchemaColumnSelection.scala    From data-faker   with MIT License 5 votes vote down vote up
package com.dunnhumby.datafaker.schema.table.columns

import scala.reflect.runtime.universe.TypeTag
import java.sql.{Date, Timestamp}
import com.dunnhumby.datafaker.YamlParser.YamlParserProtocol
import org.apache.spark.sql.Column
import org.apache.spark.sql.functions.{rand, udf}

case class SchemaColumnSelection[T](override val name: String, values: List[T])(implicit tag: TypeTag[T]) extends SchemaColumn {
  override def column(rowID: Option[Column] = None): Column = {
    val intToSelectionUDF = udf((index: Int) => {
      values(index)
    })

    intToSelectionUDF(rand() * values.length % values.length)
  }
}

object SchemaColumnSelectionProtocol extends SchemaColumnSelectionProtocol
trait SchemaColumnSelectionProtocol extends YamlParserProtocol {

  import net.jcazevedo.moultingyaml._

  implicit object SchemaColumnSelectionFormat extends YamlFormat[SchemaColumnSelection[_]] {

    override def read(yaml: YamlValue): SchemaColumnSelection[_] = {
      val fields = yaml.asYamlObject.fields
      val YamlString(dataType) = fields.getOrElse(YamlString("data_type"), deserializationError("data_type not set"))
      val YamlString(name) = fields.getOrElse(YamlString("name"), deserializationError("name not set"))
      val values = fields.getOrElse(YamlString("values"), deserializationError("selection values not set"))

      dataType match {
        case SchemaColumnDataType.Int => SchemaColumnSelection(name, values.convertTo[List[Int]])
        case SchemaColumnDataType.Long => SchemaColumnSelection(name, values.convertTo[List[Long]])
        case SchemaColumnDataType.Float => SchemaColumnSelection(name, values.convertTo[List[Float]])
        case SchemaColumnDataType.Double => SchemaColumnSelection(name, values.convertTo[List[Double]])
        case SchemaColumnDataType.Date => SchemaColumnSelection(name, values.convertTo[List[Date]])
        case SchemaColumnDataType.Timestamp => SchemaColumnSelection(name, values.convertTo[List[Timestamp]])
        case SchemaColumnDataType.String => SchemaColumnSelection(name, values.convertTo[List[String]])
        case _ => deserializationError(s"unsupported data_type: $dataType for ${SchemaColumnType.Selection}")
      }

    }

    override def write(obj: SchemaColumnSelection[_]): YamlValue = ???

  }

}
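Underneath the YAML plumbing, the column above just indexes into a fixed list with a random integer. A standalone sketch of that trick, assuming a SparkSession named `spark`:

import org.apache.spark.sql.functions.{rand, udf}

val values = Seq("red", "green", "blue")

// Pick a pseudo-random element of `values` for every row.
val pick = udf { i: Int => values(i) }
spark.range(5)
  .withColumn("colour", pick((rand() * values.length).cast("int") % values.length))
  .show()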