org.apache.spark.sql.functions.when Scala Examples

The following examples show how to use org.apache.spark.sql.functions.when. Each example lists its source file and the open-source project it was taken from.
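For orientation, here is a minimal, self-contained sketch of the function's basic shape before the project examples. The object name, column names, and values are illustrative only: chained when clauses are evaluated top to bottom, and otherwise supplies the fallback; without otherwise, rows that match no condition become null.

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.{col, lit, when}

object WhenBasics {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("when-basics").getOrCreate()
    import spark.implicits._

    // Illustrative data: (name, score)
    val df = Seq(("a", 85), ("b", 60), ("c", 35)).toDF("name", "score")

    // Conditions are checked in order; `otherwise` is the fallback branch.
    val graded = df.withColumn(
      "grade",
      when(col("score") >= 80, lit("high"))
        .when(col("score") >= 50, lit("medium"))
        .otherwise(lit("low")))

    graded.show()
    spark.stop()
  }
}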
Example 1
Source File: AnyValInstances.scala    From cleanframes   with Apache License 2.0
package cleanframes.instances

import cleanframes.Cleaner
import org.apache.spark.sql.functions.{lower, trim, when, lit}
import org.apache.spark.sql.types._

trait AnyValInstances
  extends IntInstances
    with ByteInstances
    with CharInstances
    with ShortInstances
    with LongInstances
    with FloatInstances
    with DoubleInstances
    with BooleanInstances
    with NumericAnyValInstance

trait IntInstances {
  implicit lazy val integerType: SparkDataType[Int] = new SparkDataType[Int] {
    override def getDataType: DataType = IntegerType
  }
}

trait ByteInstances {
  implicit lazy val byteType: SparkDataType[Byte] = new SparkDataType[Byte] {
    override def getDataType: DataType = ByteType
  }
}

trait CharInstances {
  implicit val stdStringToChar: String => Char = _.charAt(0)
}

trait ShortInstances {
  implicit lazy val shortType: SparkDataType[Short] = new SparkDataType[Short] {
    override def getDataType: DataType = ShortType
  }
}

trait LongInstances {
  implicit lazy val longType: SparkDataType[Long] = new SparkDataType[Long] {
    override def getDataType: DataType = LongType
  }
}

trait FloatInstances {
  implicit lazy val floatType: SparkDataType[Float] = new SparkDataType[Float] {
    override def getDataType: DataType = FloatType
  }
}

trait DoubleInstances {
  implicit lazy val doubleType: SparkDataType[Double] = new SparkDataType[Double] {
    override def getDataType: DataType = DoubleType
  }
}

trait BooleanInstances {
  implicit lazy val booleanCleaner: Cleaner[Option[Boolean]] = {
    Cleaner.materialize { (frame, name, alias) =>
      List(
        when(
          trim(lower(frame.col(name.get))) === "true",
          lit(true) cast BooleanType
        ).otherwise(false) as alias.get
      )
    }
  }
} 
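Stripped of the cleanframes typeclass machinery, the when expression above is a case-insensitive string-to-Boolean normalization. A minimal spark-shell sketch of just that part, with an illustrative column name and sample values, assuming a running SparkSession bound to spark:

// spark-shell fragment; assumes a running SparkSession bound to `spark`
import org.apache.spark.sql.functions.{col, lit, lower, trim, when}
import org.apache.spark.sql.types.BooleanType
import spark.implicits._

val raw = Seq(" TRUE ", "false", "yes", null).toDF("flag")

// "true" (any casing, surrounding whitespace trimmed) becomes true; anything else, including null, becomes false.
val cleaned = raw.select(
  when(trim(lower(col("flag"))) === "true", lit(true) cast BooleanType)
    .otherwise(false)
    .as("flag_clean"))

cleaned.show()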
Example 2
Source File: package.scala    From amadou   with Apache License 2.0
package com.mediative

import org.apache.spark.sql._
import org.apache.spark.sql.functions.when

package object amadou {
  type Config  = com.typesafe.config.Config
  type Gauge   = io.prometheus.client.Gauge
  type Counter = io.prometheus.client.Counter

  implicit class SparkHdfsUrlReaderOps(val self: DataFrameReader) extends AnyVal {
    def csv(url: HdfsUrl*)      = self.csv(url.map(_.toString): _*)
    def json(url: HdfsUrl*)     = self.json(url.map(_.toString): _*)
    def load(url: HdfsUrl*)     = self.load(url.map(_.toString): _*)
    def orc(url: HdfsUrl*)      = self.orc(url.map(_.toString): _*)
    def parquet(url: HdfsUrl*)  = self.parquet(url.map(_.toString): _*)
    def text(url: HdfsUrl*)     = self.text(url.map(_.toString): _*)
    def textFile(url: HdfsUrl*) = self.textFile(url.map(_.toString): _*)
  }

  implicit class SparkHdfsUrlWriteOps[T](val self: DataFrameWriter[T]) extends AnyVal {
    def csv(url: HdfsUrl)     = self.csv(url.toString)
    def json(url: HdfsUrl)    = self.json(url.toString)
    def save(url: HdfsUrl)    = self.save(url.toString)
    def orc(url: HdfsUrl)     = self.orc(url.toString)
    def parquet(url: HdfsUrl) = self.parquet(url.toString)
    def text(url: HdfsUrl)    = self.text(url.toString)
  }

  implicit class SymbolToStage(val self: Symbol) extends AnyVal {
    def stage[I, T](f: Stage.Context[I] => T)                      = Stage(self.name)(f)
    def source[T](read: Stage.Context[SparkSession] => Dataset[T]) = Stage.source(self.name)(read)
    def transform[S, T](transform: Stage.Context[Dataset[S]] => Dataset[T]) =
      Stage.transform(self.name)(transform)
    def sink[T](write: Stage.Context[Dataset[T]] => Unit) = Stage.sink(self.name)(write)
  }

  
  // The enclosing declaration was cut from this snippet; a plausible Column
  // extension wrapper (class name assumed) is restored so the code compiles.
  implicit class SparkColumnOps(val self: Column) extends AnyVal {
    def nullify: Column =
      when(self === "null", null).otherwise(self)
  }
} 
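The nullify helper above turns the literal string "null" into a real SQL NULL. The same expression without the extension method, as a spark-shell sketch with an illustrative column name:

// spark-shell fragment; assumes a running SparkSession bound to `spark`
import org.apache.spark.sql.functions.{col, when}
import spark.implicits._

val df = Seq("42", "null", "7").toDF("value")

// Rows holding the string "null" become SQL NULL; all other rows pass through unchanged.
df.select(when(col("value") === "null", null).otherwise(col("value")).as("value")).show()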
Example 3
Source File: TriangleCount.scala    From graphframes   with Apache License 2.0
package org.graphframes.lib

import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions.{array, col, explode, when}

import org.graphframes.GraphFrame
import org.graphframes.GraphFrame.{DST, ID, LONG_DST, LONG_SRC, SRC}


class TriangleCount private[graphframes] (private val graph: GraphFrame) extends Arguments {

  def run(): DataFrame = {
    TriangleCount.run(graph)
  }
}

private object TriangleCount {

  private def run(graph: GraphFrame): DataFrame = {
    // Dedup edges by flipping them to have LONG_SRC < LONG_DST
    // TODO (when we drop support for Spark 1.4): Use functions greatest, least instead of UDFs
    val dedupedE = graph.indexedEdges
      .filter(s"$LONG_SRC != $LONG_DST")
      .selectExpr(
        s"if($LONG_SRC < $LONG_DST, $SRC, $DST) as $SRC",
        s"if($LONG_SRC < $LONG_DST, $DST, $SRC) as $DST")
      .dropDuplicates(Seq(SRC, DST))
    val g2 = GraphFrame(graph.vertices, dedupedE)

    // Because SRC < DST, there exists only one type of triangles:
    // - Non-cycle with one edge flipped.  These are counted 1 time each by motif finding.
    val triangles = g2.find("(a)-[]->(b); (b)-[]->(c); (a)-[]->(c)")

    val triangleCounts = triangles
      .select(explode(array(col("a.id"), col("b.id"), col("c.id"))).as(ID))
      .groupBy(ID)
      .count()

    val v = graph.vertices
    val countsCol = when(col("count").isNull, 0L).otherwise(col("count"))
    val newV = v.join(triangleCounts, v(ID) === triangleCounts(ID), "left_outer")
      .select(countsCol.as(COUNT_ID) +: v.columns.map(v.apply): _*)
    newV
  }

  private val COUNT_ID = "count"
} 
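In TriangleCount, when fills in the nulls produced by the left outer join so that vertices with no triangles get a count of 0. A spark-shell sketch of that pattern in isolation, with illustrative data:

// spark-shell fragment; assumes a running SparkSession bound to `spark`
import org.apache.spark.sql.functions.{col, when}
import spark.implicits._

val vertices = Seq("a", "b", "c").toDF("id")
val counts   = Seq(("a", 2L)).toDF("id", "count")

// Vertices missing from `counts` join to null; map those nulls to 0.
val filled = vertices.join(counts, Seq("id"), "left_outer")
  .select(col("id"), when(col("count").isNull, 0L).otherwise(col("count")).as("count"))

filled.show()

The built-in coalesce(col("count"), lit(0L)) expresses the same null fallback in a single call.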
Example 4
Source File: PythonColumnTransformationExample.scala    From seahorse   with Apache License 2.0
package ai.deepsense.deeplang.doperations.examples

import org.apache.spark.sql.functions.when

import ai.deepsense.deeplang.{DOperable, ExecutionContext}
import ai.deepsense.deeplang.doperables.{PythonColumnTransformer, TargetTypeChoices}
import ai.deepsense.deeplang.doperables.dataframe.DataFrame
import ai.deepsense.deeplang.doperables.multicolumn.MultiColumnParams.SingleOrMultiColumnChoices.SingleColumnChoice
import ai.deepsense.deeplang.doperables.multicolumn.SingleColumnParams.SingleTransformInPlaceChoices.NoInPlaceChoice
import ai.deepsense.deeplang.doperations.PythonColumnTransformation
import ai.deepsense.deeplang.params.selections.NameSingleColumnSelection

class PythonColumnTransformationExample
  extends AbstractOperationExample[PythonColumnTransformation] {

  val inputColumnName = "Weight"
  val outputColumnName = "WeightCutoff"

  // This is mocked because Python executor is not available in tests.
  class PythonColumnTransformationMock extends PythonColumnTransformation {
    override def execute(arg: DataFrame)(context: ExecutionContext): (DataFrame, PythonColumnTransformer) = {
      val sdf = arg.sparkDataFrame
      val resultSparkDataFrame = sdf.select(
        sdf("*"),
        when(sdf(inputColumnName) > 2.0, 2.0).otherwise(sdf(inputColumnName))
          .alias(outputColumnName))
      (DataFrame.fromSparkDataFrame(resultSparkDataFrame), mock[PythonColumnTransformer])
    }
  }

  override def dOperation: PythonColumnTransformation = {
    val op = new PythonColumnTransformationMock()

    val inPlace = NoInPlaceChoice()
      .setOutputColumn(s"$outputColumnName")
    val single = SingleColumnChoice()
      .setInputColumn(NameSingleColumnSelection(inputColumnName))
      .setInPlace(inPlace)
    op.transformer
      .setTargetType(TargetTypeChoices.DoubleTargetTypeChoice())
      .setSingleOrMultiChoice(single)
      .setCodeParameter(
        "def transform_value(value, column_name):\n" +
          "    return min(value, 2.0)")
    op.set(op.transformer.extractParamMap())
  }

  override def fileNames: Seq[String] = Seq("example_animals")
} 
Example 5
Source File: PythonColumnTransformationExample.scala    From seahorse-workflow-executor   with Apache License 2.0
package io.deepsense.deeplang.doperations.examples

import org.apache.spark.sql.functions.when

import io.deepsense.deeplang.{DOperable, ExecutionContext}
import io.deepsense.deeplang.doperables.{PythonColumnTransformer, TargetTypeChoices}
import io.deepsense.deeplang.doperables.dataframe.DataFrame
import io.deepsense.deeplang.doperables.multicolumn.MultiColumnParams.SingleOrMultiColumnChoices.SingleColumnChoice
import io.deepsense.deeplang.doperables.multicolumn.SingleColumnParams.SingleTransformInPlaceChoices.NoInPlaceChoice
import io.deepsense.deeplang.doperations.PythonColumnTransformation
import io.deepsense.deeplang.params.selections.NameSingleColumnSelection

class PythonColumnTransformationExample
  extends AbstractOperationExample[PythonColumnTransformation] {

  val inputColumnName = "Weight"
  val outputColumnName = "WeightCutoff"

  // This is mocked because Python executor is not available in tests.
  class PythonColumnTransformationMock extends PythonColumnTransformation {
    override def execute(arg: DataFrame)(context: ExecutionContext): (DataFrame, PythonColumnTransformer) = {
      val sdf = arg.sparkDataFrame
      val resultSparkDataFrame = sdf.select(
        sdf("*"),
        when(sdf(inputColumnName) > 2.0, 2.0).otherwise(sdf(inputColumnName))
          .alias(outputColumnName))
      (DataFrame.fromSparkDataFrame(resultSparkDataFrame), mock[PythonColumnTransformer])
    }
  }

  override def dOperation: PythonColumnTransformation = {
    val op = new PythonColumnTransformationMock()

    val inPlace = NoInPlaceChoice()
      .setOutputColumn(s"$outputColumnName")
    val single = SingleColumnChoice()
      .setInputColumn(NameSingleColumnSelection(inputColumnName))
      .setInPlace(inPlace)
    op.transformer
      .setTargetType(TargetTypeChoices.DoubleTargetTypeChoice())
      .setSingleOrMultiChoice(single)
      .setCodeParameter(
        "def transform_value(value, column_name):\n" +
          "    return min(value, 2.0)")
    op.set(op.transformer.extractParamMap())
  }

  override def fileNames: Seq[String] = Seq("example_animals")
}
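Example 5 is the same transformation as Example 4, taken from the io.deepsense packages rather than the ai.deepsense ones. In both, the when expression caps a numeric column at an upper bound. A spark-shell sketch of just that cap, with illustrative data:

// spark-shell fragment; assumes a running SparkSession bound to `spark`
import org.apache.spark.sql.functions.{col, when}
import spark.implicits._

val animals = Seq(("cat", 1.5), ("horse", 3.2)).toDF("name", "Weight")

// Weights above 2.0 are replaced by 2.0; smaller weights pass through.
val capped = animals.select(
  col("*"),
  when(col("Weight") > 2.0, 2.0).otherwise(col("Weight")).as("WeightCutoff"))

capped.show()

The built-in least function would express the same cap (least(col("Weight"), lit(2.0))), mirroring the min(value, 2.0) used in the Python code parameter.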