org.apache.spark.sql.expressions.Window Scala Examples

The following examples show how to use org.apache.spark.sql.expressions.Window. Each example is taken verbatim from an open-source project; the source file and project are listed above the code.
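Before the project examples, here is a minimal, self-contained usage sketch (the SparkSession setup, column names and toy data below are illustrative and not taken from any of the projects):

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions.{row_number, sum}

object WindowSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("window-sketch").getOrCreate()
    import spark.implicits._

    // Toy data: a group column and a value column.
    val df = Seq(("a", 1), ("a", 2), ("a", 3), ("b", 4)).toDF("grp", "val")

    // A window partitioned by "grp" and ordered by "val".
    val w = Window.partitionBy($"grp").orderBy($"val")

    df.select(
      $"grp",
      $"val",
      row_number().over(w).as("rn"),                       // position within each group
      sum($"val").over(w.rowsBetween(-1, 1)).as("movSum")  // sum over previous, current and next row
    ).show()

    spark.stop()
  }
}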
Example 1
Source File: HivePlanTest.scala    From drizzle-spark   with Apache License 2.0
package org.apache.spark.sql.hive.execution

import org.apache.spark.sql.functions._
import org.apache.spark.sql.QueryTest
import org.apache.spark.sql.catalyst.plans.logical
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.hive.test.TestHiveSingleton

class HivePlanTest extends QueryTest with TestHiveSingleton {
  import spark.sql
  import spark.implicits._

  test("udf constant folding") {
    Seq.empty[Tuple1[Int]].toDF("a").createOrReplaceTempView("t")
    val optimized = sql("SELECT cos(null) AS c FROM t").queryExecution.optimizedPlan
    val correctAnswer = sql("SELECT cast(null as double) AS c FROM t").queryExecution.optimizedPlan

    comparePlans(optimized, correctAnswer)
  }

  test("window expressions sharing the same partition by and order by clause") {
    val df = Seq.empty[(Int, String, Int, Int)].toDF("id", "grp", "seq", "val")
    val window = Window.
      partitionBy($"grp").
      orderBy($"val")
    val query = df.select(
      $"id",
      sum($"val").over(window.rowsBetween(-1, 1)),
      sum($"val").over(window.rangeBetween(-1, 1))
    )
    val plan = query.queryExecution.analyzed
    assert(plan.collect{ case w: logical.Window => w }.size === 1,
      "Should have only 1 Window operator.")
  }
} 
Example 2
Source File: FlintTestData.scala    From flint   with Apache License 2.0
package org.apache.spark.sql
import org.apache.spark.sql.functions.{ udf, sum }
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions.percent_rank

trait FlintTestData {
  protected def sqlContext: SQLContext

  private object internalImplicits extends SQLImplicits {
    override protected def _sqlContext: SQLContext = sqlContext
  }

  import internalImplicits._
  import FlintTestData._

  protected lazy val testData: DataFrame = {
    val df = sqlContext.sparkContext.parallelize(
      (0 to 97).map(i => TestData(i.toLong, i.toDouble))
    ).toDF()
    df
  }

  protected lazy val testData2: DataFrame = {
    val df = sqlContext.sparkContext.parallelize(
      (0 to 101).map(i => TestData2(i.toLong, i.toDouble, -i.toDouble))
    ).toDF()
    df
  }

  protected lazy val testDataCached: DataFrame = {
    val df = DFConverter.newDataFrame(testData)
    df.cache
    df.count
    df
  }

  protected val withTime2Column = { df: DataFrame => df.withColumn("time2", df("time") * 2) }
  protected val withTime3ColumnUdf = { df: DataFrame =>
    val testUdf = udf({ time: Long => time * 2 })
    df.withColumn("time3", testUdf(df("time")))
  }
  protected val selectV = { df: DataFrame => df.select("v") }
  protected val selectExprVPlusOne = { df: DataFrame => df.selectExpr("v + 1 as v") }
  protected val filterV = { df: DataFrame => df.filter(df("v") > 0) }

  protected val orderByTime = { df: DataFrame => df.orderBy("time") }
  protected val orderByV = { df: DataFrame => df.orderBy("v") }
  protected val addRankColumn = { df: DataFrame =>
    df.withColumn("rank", percent_rank().over(Window.partitionBy("time").orderBy("v")))
  }
  protected val selectSumV = { df: DataFrame => df.select(sum("v")) }
  protected val selectExprSumV = { df: DataFrame => df.selectExpr("sum(v)") }
  protected val groupByTimeSumV = { df: DataFrame => df.groupBy("time").agg(sum("v").alias("v")) }
  protected val repartition = { df: DataFrame => df.repartition(10) }
  protected val coalesce = { df: DataFrame => df.coalesce(5) }

  protected val cache = { df: DataFrame => df.cache(); df.count(); df }
  protected val unpersist = { df: DataFrame => df.unpersist() }
}

object FlintTestData {
  case class TestData(time: Long, v: Double)
  case class TestData2(time: Long, v: Double, v2: Double)
} 
Example 3
Source File: HivePlanTest.scala    From XSQL   with Apache License 2.0
package org.apache.spark.sql.hive.execution

import org.apache.spark.sql.QueryTest
import org.apache.spark.sql.catalyst.plans.logical
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions._
import org.apache.spark.sql.hive.test.TestHiveSingleton

class HivePlanTest extends QueryTest with TestHiveSingleton {
  import spark.sql
  import spark.implicits._

  test("udf constant folding") {
    Seq.empty[Tuple1[Int]].toDF("a").createOrReplaceTempView("t")
    val optimized = sql("SELECT cos(null) AS c FROM t").queryExecution.optimizedPlan
    val correctAnswer = sql("SELECT cast(null as double) AS c FROM t").queryExecution.optimizedPlan

    comparePlans(optimized, correctAnswer)
  }

  test("window expressions sharing the same partition by and order by clause") {
    val df = Seq.empty[(Int, String, Int, Int)].toDF("id", "grp", "seq", "val")
    val window = Window.
      partitionBy($"grp").
      orderBy($"val")
    val query = df.select(
      $"id",
      sum($"val").over(window.rowsBetween(-1, 1)),
      sum($"val").over(window.rangeBetween(-1, 1))
    )
    val plan = query.queryExecution.analyzed
    assert(plan.collect{ case w: logical.Window => w }.size === 1,
      "Should have only 1 Window operator.")
  }
} 
Example 4
Source File: HivePlanTest.scala    From sparkoscope   with Apache License 2.0
package org.apache.spark.sql.hive.execution

import org.apache.spark.sql.functions._
import org.apache.spark.sql.QueryTest
import org.apache.spark.sql.catalyst.plans.logical
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.hive.test.TestHiveSingleton

class HivePlanTest extends QueryTest with TestHiveSingleton {
  import spark.sql
  import spark.implicits._

  test("udf constant folding") {
    Seq.empty[Tuple1[Int]].toDF("a").createOrReplaceTempView("t")
    val optimized = sql("SELECT cos(null) AS c FROM t").queryExecution.optimizedPlan
    val correctAnswer = sql("SELECT cast(null as double) AS c FROM t").queryExecution.optimizedPlan

    comparePlans(optimized, correctAnswer)
  }

  test("window expressions sharing the same partition by and order by clause") {
    val df = Seq.empty[(Int, String, Int, Int)].toDF("id", "grp", "seq", "val")
    val window = Window.
      partitionBy($"grp").
      orderBy($"val")
    val query = df.select(
      $"id",
      sum($"val").over(window.rowsBetween(-1, 1)),
      sum($"val").over(window.rangeBetween(-1, 1))
    )
    val plan = query.queryExecution.analyzed
    assert(plan.collect{ case w: logical.Window => w }.size === 1,
      "Should have only 1 Window operator.")
  }
} 
Example 5
Source File: DeltaLoad.scala    From m3d-engine   with Apache License 2.0
package com.adidas.analytics.algo

import com.adidas.analytics.algo.DeltaLoad._
import com.adidas.analytics.algo.core.Algorithm
import com.adidas.analytics.algo.shared.DateComponentDerivation
import com.adidas.analytics.config.DeltaLoadConfiguration.PartitionedDeltaLoadConfiguration
import com.adidas.analytics.util.DataFrameUtils._
import com.adidas.analytics.util._
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions._
import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}
import org.apache.spark.storage.StorageLevel
import org.slf4j.{Logger, LoggerFactory}


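// NOTE: the enclosing class declaration for DeltaLoad is omitted from this excerpt (an instance
// is constructed via the companion apply below); identifiers such as businessKey, technicalKey,
// rankingColumnName and targetPartitions are presumably provided by the imported configuration
// and derivation traits.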
  private def getUpsertRecords(deltaRecords: Dataset[Row], resultColumns: Seq[String]): Dataset[Row] = {
    // Create partition window - Partitioning by delta records logical key (i.e. technical key of active records)
    val partitionWindow = Window
      .partitionBy(businessKey.map(col): _*)
      .orderBy(technicalKey.map(component => col(component).desc): _*)

    // Ranking & projection
    val rankedDeltaRecords = deltaRecords
      .withColumn(rankingColumnName, row_number().over(partitionWindow))
      .filter(upsertRecordsModesFilterFunction)

    rankedDeltaRecords
      .filter(rankedDeltaRecords(rankingColumnName) === 1)
      .selectExpr(resultColumns: _*)
  }

  protected def withDatePartitions(spark: SparkSession, dfs: DFSWrapper, dataFrames: Vector[DataFrame]): Vector[DataFrame] = {
    logger.info("Adding partitioning information if needed")
    try {
      dataFrames.map { df =>
        if (df.columns.toSeq.intersect(targetPartitions) != targetPartitions){
          df.transform(withDateComponents(partitionSourceColumn, partitionSourceColumnFormat, targetPartitions))
        }
        else df
      }
    } catch {
      case e: Throwable =>
        logger.error("Cannot add partitioning information for data frames.", e)
        //TODO: Handle failure case properly
        throw new RuntimeException("Unable to transform data frames.", e)
    }
  }
}


object DeltaLoad {

  private val logger: Logger = LoggerFactory.getLogger(getClass)

  def apply(spark: SparkSession, dfs: DFSWrapper, configLocation: String): DeltaLoad = {
    new DeltaLoad(spark, dfs, configLocation)
  }
} 
Example 6
Source File: HivePlanTest.scala    From multi-tenancy-spark   with Apache License 2.0
package org.apache.spark.sql.hive.execution

import org.apache.spark.sql.functions._
import org.apache.spark.sql.QueryTest
import org.apache.spark.sql.catalyst.plans.logical
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.hive.test.TestHiveSingleton

class HivePlanTest extends QueryTest with TestHiveSingleton {
  import spark.sql
  import spark.implicits._

  test("udf constant folding") {
    Seq.empty[Tuple1[Int]].toDF("a").createOrReplaceTempView("t")
    val optimized = sql("SELECT cos(null) AS c FROM t").queryExecution.optimizedPlan
    val correctAnswer = sql("SELECT cast(null as double) AS c FROM t").queryExecution.optimizedPlan

    comparePlans(optimized, correctAnswer)
  }

  test("window expressions sharing the same partition by and order by clause") {
    val df = Seq.empty[(Int, String, Int, Int)].toDF("id", "grp", "seq", "val")
    val window = Window.
      partitionBy($"grp").
      orderBy($"val")
    val query = df.select(
      $"id",
      sum($"val").over(window.rowsBetween(-1, 1)),
      sum($"val").over(window.rangeBetween(-1, 1))
    )
    val plan = query.queryExecution.analyzed
    assert(plan.collect{ case w: logical.Window => w }.size === 1,
      "Should have only 1 Window operator.")
  }
} 
Example 7
Source File: HivePlanTest.scala    From spark1.52   with Apache License 2.0
package org.apache.spark.sql.hive.execution

import org.apache.spark.sql.functions._
import org.apache.spark.sql.QueryTest
import org.apache.spark.sql.catalyst.plans.logical
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.hive.test.TestHive

class HivePlanTest extends QueryTest {
  import TestHive._
  import TestHive.implicits._
  // UDF constant folding
  test("udf constant folding") {
    Seq.empty[Tuple1[Int]].toDF("a").registerTempTable("t")
    val optimized = sql("SELECT cos(null) FROM t").queryExecution.optimizedPlan
    val correctAnswer = sql("SELECT cast(null as double) FROM t").queryExecution.optimizedPlan

    comparePlans(optimized, correctAnswer)
  }
  // Window expressions sharing the same partition by and order by clause
  test("window expressions sharing the same partition by and order by clause") {
    val df = Seq.empty[(Int, String, Int, Int)].toDF("id", "grp", "seq", "val")
    val window = Window.
      partitionBy($"grp").
      orderBy($"val")
    val query = df.select(
      $"id",
      sum($"val").over(window.rowsBetween(-1, 1)),
      sum($"val").over(window.rangeBetween(-1, 1))
    )
    val plan = query.queryExecution.analyzed
    assert(plan.collect{ case w: logical.Window => w }.size === 1,
      "Should have only 1 Window operator.")
  }
} 
Example 8
Source File: HivePlanTest.scala    From Spark-2.3.1   with Apache License 2.0
package org.apache.spark.sql.hive.execution

import org.apache.spark.sql.QueryTest
import org.apache.spark.sql.catalyst.plans.logical
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions._
import org.apache.spark.sql.hive.test.TestHiveSingleton

class HivePlanTest extends QueryTest with TestHiveSingleton {
  import spark.sql
  import spark.implicits._

  test("udf constant folding") {
    Seq.empty[Tuple1[Int]].toDF("a").createOrReplaceTempView("t")
    val optimized = sql("SELECT cos(null) AS c FROM t").queryExecution.optimizedPlan
    val correctAnswer = sql("SELECT cast(null as double) AS c FROM t").queryExecution.optimizedPlan

    comparePlans(optimized, correctAnswer)
  }

  test("window expressions sharing the same partition by and order by clause") {
    val df = Seq.empty[(Int, String, Int, Int)].toDF("id", "grp", "seq", "val")
    val window = Window.
      partitionBy($"grp").
      orderBy($"val")
    val query = df.select(
      $"id",
      sum($"val").over(window.rowsBetween(-1, 1)),
      sum($"val").over(window.rangeBetween(-1, 1))
    )
    val plan = query.queryExecution.analyzed
    assert(plan.collect{ case w: logical.Window => w }.size === 1,
      "Should have only 1 Window operator.")
  }
} 
Example 9
Source File: MimirSparkRuntimeUtils.scala    From mimir   with Apache License 2.0
package mimir.exec.spark

import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.types.{ DataType, LongType }
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions.{
  spark_partition_id,
  monotonically_increasing_id,
  count,
  sum,
  first,
  lit,
  col
}

object MimirSparkRuntimeUtils
{
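  // Adds a sequential index column (indexName, starting at offset) to a DataFrame by combining
  // monotonically_increasing_id with per-partition row counts; only one aggregate row per
  // partition is collected to the driver to compute the partition offsets.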
  def zipWithIndex(df: DataFrame, offset: Long = 1, indexName: String = "ROWIDX", indexType:DataType = LongType): DataFrame = {
    val dfWithPartitionId = df.withColumn("partition_id", spark_partition_id()).withColumn("inc_id", monotonically_increasing_id())

    val partitionOffsets = dfWithPartitionId
        .groupBy("partition_id")
        .agg(count(lit(1)) as "cnt", first("inc_id") as "inc_id")
        .orderBy("partition_id")
        .select(col("partition_id"), sum("cnt").over(Window.orderBy("partition_id")) - col("cnt") - col("inc_id") + lit(offset) as "cnt" )
        .collect()
        .map(row => (row.getInt(0), row.getLong(1)))
        .toMap

     val theUdf = org.apache.spark.sql.functions.udf(
       (partitionId: Int) => partitionOffsets(partitionId), 
       LongType
     )
     
     dfWithPartitionId
        .withColumn("partition_offset", theUdf(col("partition_id")))
        .withColumn(indexName, (col("partition_offset") + col("inc_id")).cast(indexType))
        .drop("partition_id", "partition_offset", "inc_id")
  }

  def writeDataSink(dataframe:DataFrame, format:String, options:Map[String, String], save:Option[String]) = {
    val dsFormat = dataframe.write.format(format) 
    val dsOptions = options.toSeq.foldLeft(dsFormat)( (ds, opt) => opt._1 match { 
      case "mode" => ds.mode(opt._2) 
      case _ => ds.option(opt._1, opt._2)
      })
    save match {
      case None => dsOptions.save
      case Some(outputFile) => {
        if(format.equals("com.github.potix2.spark.google.spreadsheets")){
          val gsldfparts = outputFile.split("\\/") 
          val gsldf = s"${gsldfparts(gsldfparts.length-2)}/${gsldfparts(gsldfparts.length-1)}"
          dsOptions.save(gsldf)
        }
        else{
          dsOptions.save(outputFile)
        }
      }
    }
  }
} 
Example 10
Source File: HivePlanTest.scala    From BigDatalog   with Apache License 2.0
package org.apache.spark.sql.hive.execution

import org.apache.spark.sql.functions._
import org.apache.spark.sql.QueryTest
import org.apache.spark.sql.catalyst.plans.logical
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.hive.test.TestHiveSingleton

class HivePlanTest extends QueryTest with TestHiveSingleton {
  import hiveContext.sql
  import hiveContext.implicits._

  test("udf constant folding") {
    Seq.empty[Tuple1[Int]].toDF("a").registerTempTable("t")
    val optimized = sql("SELECT cos(null) FROM t").queryExecution.optimizedPlan
    val correctAnswer = sql("SELECT cast(null as double) FROM t").queryExecution.optimizedPlan

    comparePlans(optimized, correctAnswer)
  }

  test("window expressions sharing the same partition by and order by clause") {
    val df = Seq.empty[(Int, String, Int, Int)].toDF("id", "grp", "seq", "val")
    val window = Window.
      partitionBy($"grp").
      orderBy($"val")
    val query = df.select(
      $"id",
      sum($"val").over(window.rowsBetween(-1, 1)),
      sum($"val").over(window.rangeBetween(-1, 1))
    )
    val plan = query.queryExecution.analyzed
    assert(plan.collect{ case w: logical.Window => w }.size === 1,
      "Should have only 1 Window operator.")
  }
} 
Example 11
Source File: OilPriceFunc.scala    From Mastering-Spark-for-Data-Science   with MIT License
package io.gzet.geomesa

import java.text.SimpleDateFormat
import java.util.Calendar

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions.{udf, window, last, col, lag}

object OilPriceFunc {

    // use this if the window function misbehaves due to timezone e.g. BST
    // ./spark-shell --driver-java-options "-Duser.timezone=UTC"
    // ./spark-submit --conf 'spark.driver.extraJavaOptions=-Duser.timezone=UTC'

    // define a function to reformat the date field
    def convert(date:String) : String = {
      val df1 = new SimpleDateFormat("dd/MM/yyyy")
      val dt = df1.parse(date)
      val df2 = new SimpleDateFormat("yyyy-MM-dd")
      df2.format(dt)
    }

    // create and save oil price changes
    def createOilPriceDF(inputfile: String, outputfile: String, spark: SparkSession) = {

      val oilPriceDF = spark.
        read.
        option("header", "true").
        option("inferSchema", "true").
        csv(inputfile)

      val convertDateUDF = udf { (date: String) => convert(date) }

      val oilPriceDatedDF = oilPriceDF.withColumn("DATE", convertDateUDF(oilPriceDF("DATE")))

      // offset to start at beginning of week
      val windowDF = oilPriceDatedDF.groupBy(window(oilPriceDatedDF.col("DATE"), "7 days", "7 days", "4 days"))

      val windowLastDF = windowDF.agg(last("PRICE") as "last(PRICE)").sort("window")

//      windowLastDF.show(20, false)

      val sortedWindow = Window.orderBy("window.start")

      val lagLastCol = lag(col("last(PRICE)"), 1).over(sortedWindow)
      val lagLastColDF = windowLastDF.withColumn("lastPrev(PRICE)", lagLastCol)

//      lagLastColDF.show(20, false)

      val simplePriceChangeFunc = udf { (last: Double, prevLast: Double) =>
        var change = ((last - prevLast) compare 0).signum
        if (change == -1)
          change = 0
        change.toDouble
      }

      val findDateTwoDaysAgoUDF = udf { (date: String) =>
        val dateFormat = new SimpleDateFormat("yyyy-MM-dd")
        val cal = Calendar.getInstance
        cal.setTime(dateFormat.parse(date))
        cal.add(Calendar.DATE, -3)
        dateFormat.format(cal.getTime)
      }

      val oilPriceChangeDF = lagLastColDF.withColumn("label", simplePriceChangeFunc(
        lagLastColDF("last(PRICE)"),
        lagLastColDF("lastPrev(PRICE)")
      )).withColumn("commonFriday", findDateTwoDaysAgoUDF(lagLastColDF("window.end")))

//      oilPriceChangeDF.show(20, false)

      oilPriceChangeDF.select("label", "commonFriday").
        write.
        format("com.databricks.spark.csv").
        option("header", "true").
        //.option("codec", "org.apache.hadoop.io.compress.GzipCodec")
        save(outputfile)
    }
}