The following examples show how to use org.apache.spark.sql.functions.lit.
Example 1
Source File: TestUtils.scala — from m3d-engine, Apache License 2.0 (5 votes)
package com.adidas.utils

import org.apache.spark.sql.functions.{col, count, lit}
import org.apache.spark.sql.{DataFrame, Row}

/** Test helpers for comparing Spark DataFrames. */
object TestUtils {

  /** Implicit wrapper that adds `hasDiff` to any `DataFrame`. */
  implicit class ExtendedDataFrame(df: DataFrame) {

    /**
     * Compares the wrapped DataFrame with `anotherDf` by collecting both to the driver as
     * sets of grouped rows (each carrying a `count(1)` aggregate) and diffing the sets in
     * both directions.
     *
     * @param anotherDf the DataFrame to compare against
     * @return `true` when either side contains a grouped row the other side lacks
     */
    def hasDiff(anotherDf: DataFrame): Boolean = {
      // Prefixes "+ " for rows found only in the wrapped frame, "- " for rows found only in
      // `anotherDf`.
      // NOTE(review): the remainder of this helper (presumably printing `row`) and its
      // closing brace are not visible in this excerpt.
      def printDiff(incoming: Boolean)(row: Row): Unit = {
        if (incoming) print("+ ") else print("- ")

      // Both frames are fully collected, so this is only suitable for small test data.
      // NOTE(review): the groupBy argument is truncated in this excerpt ("groupBy( _*)");
      // presumably it groups by all columns — confirm against the original file.
      val groupedDf = df.groupBy( _*).agg(count(lit(1))).collect().toSet
      val groupedAnotherDf = anotherDf.groupBy( _*).agg(count(lit(1))).collect().toSet

      // Report the differences in each direction.
      groupedDf.diff(groupedAnotherDf).foreach(printDiff(incoming = true))
      groupedAnotherDf.diff(groupedDf).foreach(printDiff(incoming = false))

      // The frames differ when either one-sided difference is non-empty.
      groupedDf.diff(groupedAnotherDf).nonEmpty || groupedAnotherDf.diff(groupedDf).nonEmpty
Example 2
Source File: Uniqueness.scala — from deequ, Apache License 2.0 (5 votes)

import org.apache.spark.sql.Column
import org.apache.spark.sql.functions.{col, lit, sum}
import org.apache.spark.sql.types.DoubleType

/**
 * Frequency-based analyzer registered under the name "Uniqueness".
 *
 * @param columns names of the columns passed to the base analyzer
 * @param where   optional filter expression, exposed through `filterCondition`
 */
case class Uniqueness(columns: Seq[String], where: Option[String] = None)
  extends ScanShareableFrequencyBasedAnalyzer("Uniqueness", columns)
  with FilterableAnalyzer {

  // Single aggregate: the predicate `COUNT_COL == 1` is cast to double (1.0 / 0.0), summed,
  // and divided by `numRows`. COUNT_COL is defined outside this excerpt.
  override def aggregationFunctions(numRows: Long): Seq[Column] = {
    (sum(col(COUNT_COL).equalTo(lit(1)).cast(DoubleType)) / numRows) :: Nil

  // Hands the optional `where` constructor argument back as the filter condition.
  override def filterCondition: Option[String] = where

/** Single-column convenience constructors for [[Uniqueness]]. */
object Uniqueness {
  // Wraps the single column name in a one-element list; no filter is applied.
  def apply(column: String): Uniqueness = {
    new Uniqueness(column :: Nil)

  // Single-column variant that also forwards the optional filter expression.
  def apply(column: String, where: Option[String]): Uniqueness = {
    new Uniqueness(column :: Nil, where)
Example 3
Source File: UniqueValueRatio.scala — from deequ, Apache License 2.0 (5 votes)

import org.apache.spark.sql.{Column, Row}
import org.apache.spark.sql.functions.{col, count, lit, sum}
import org.apache.spark.sql.types.DoubleType

/**
 * Frequency-based analyzer registered under the name "UniqueValueRatio".
 *
 * @param columns names of the columns passed to the base analyzer
 * @param where   optional filter expression, exposed through `filterCondition`
 */
case class UniqueValueRatio(columns: Seq[String], where: Option[String] = None)
  extends ScanShareableFrequencyBasedAnalyzer("UniqueValueRatio", columns)
  with FilterableAnalyzer {

  // Two aggregates, in this order: (1) the number of rows whose COUNT_COL equals 1, as a
  // double; (2) the total row count. `numRows` is not used by the visible expression.
  override def aggregationFunctions(numRows: Long): Seq[Column] = {
    sum(col(COUNT_COL).equalTo(lit(1)).cast(DoubleType)) :: count("*") :: Nil

  // Reads the two aggregates back in the order they were declared above and returns their
  // ratio. Both operands are doubles, so a zero denominator yields NaN or Infinity rather
  // than throwing.
  override def fromAggregationResult(result: Row, offset: Int): DoubleMetric = {
    val numUniqueValues = result.getDouble(offset)
    val numDistinctValues = result.getLong(offset + 1).toDouble

    toSuccessMetric(numUniqueValues / numDistinctValues)

  // Hands the optional `where` constructor argument back as the filter condition.
  override def filterCondition: Option[String] = where

/** Single-column convenience constructors for [[UniqueValueRatio]]. */
object UniqueValueRatio {
  // Wraps the single column name in a one-element list; no filter is applied.
  def apply(column: String): UniqueValueRatio = {
    new UniqueValueRatio(column :: Nil)

  // Single-column variant that also forwards the optional filter expression.
  def apply(column: String, where: Option[String]): UniqueValueRatio = {
    new UniqueValueRatio(column :: Nil, where)
Example 4
Source File: EnrichPostprocessor.scala — from DataQuality, GNU Lesser General Public License v3.0 (5 votes)
package it.agilelab.bigdata.DataQuality.postprocessors

import java.util

import com.typesafe.config.Config
import it.agilelab.bigdata.DataQuality.checks.CheckResult
import it.agilelab.bigdata.DataQuality.exceptions.IllegalParameterException
import it.agilelab.bigdata.DataQuality.metrics.MetricResult
import it.agilelab.bigdata.DataQuality.sources.HdfsFile
import it.agilelab.bigdata.DataQuality.targets.HdfsTargetConfig
import it.agilelab.bigdata.DataQuality.utils
import it.agilelab.bigdata.DataQuality.utils.DQSettings
import{HdfsReader, HdfsWriter}
import org.apache.hadoop.fs.FileSystem
import org.apache.spark.sql.functions.lit
import org.apache.spark.sql.{DataFrame, SQLContext}

import scala.collection.JavaConversions._
import scala.util.Try

/**
 * Postprocessor that appends selected metric results, check statuses and extra configured
 * values to a DataFrame as literal columns.
 *
 * Configuration keys read here: "source" (optional), "metrics", "checks", "extra", "saveTo".
 */
final class EnrichPostprocessor(config: Config, settings: DQSettings)
    extends BasicPostprocessor(config, settings) {

  // "source" is optional: any exception from getString is turned into None.
  private val vs: Option[String] = Try(config.getString("source")).toOption
  // Ids of the metrics and checks whose results must be attached.
  private val metrics: util.List[String] = config.getStringList("metrics")
  private val checks: util.List[String] = config.getStringList("checks")
  // Additional key -> config value pairs, added as literal columns in `process`.
  private val extra = config.getObject("extra").toMap

  // NOTE(review): the construction of the target from the "saveTo" section is truncated in
  // this excerpt.
  private val target: HdfsTargetConfig = {
    val conf = config.getConfig("saveTo")

  /**
   * Builds the enriched DataFrame and returns an `HdfsFile` for the configured target.
   *
   * @param vsRef  candidate source files; consulted only when "source" is configured
   * @param metRes metric results, filtered down to the configured metric ids
   * @param chkRes check results, filtered down to the configured check ids
   * @throws IllegalParameterException when the number of matched metrics or checks differs
   *                                   from the number configured
   */
  override def process(vsRef: Set[HdfsFile],
                       metRes: Seq[MetricResult],
                       chkRes: Seq[CheckResult])(
      implicit fs: FileSystem,
      sqlContext: SQLContext,
      settings: DQSettings): HdfsFile = {

    import sqlContext.implicits._

    // NOTE(review): the filter predicate and the `None` branch are truncated in this
    // excerpt. `.head` on either collection throws if nothing matches or nothing is loaded.
    val df: DataFrame = vs match {
      case Some(vsource) =>
        val reqVS: HdfsFile = vsRef.filter(vr => == vsource).head
        HdfsReader.load(reqVS, settings.ref_date).head
      case None =>

    // Keep only the requested results, as (id, value) pairs.
    val reqMet: Seq[(String, Double)] = metRes
      .filter(mr => metrics.contains(mr.metricId))
      .map(mr => mr.metricId -> mr.result)
    val reqCheck: Seq[(String, String)] = chkRes
      .filter(cr => checks.contains(cr.checkId))
      .map(cr => cr.checkId -> cr.status)

    // Size comparison only: the counts of matched results must equal the configured counts.
    if (reqMet.size != metrics.size())
      throw IllegalParameterException("Some of stated metrics are missing!")
    if (reqCheck.size != checks.size())
      throw IllegalParameterException("Some of stated checks are missing!")

    // Each fold adds one constant-valued column per pair, named after the pair's key.
    val dfWithMet: DataFrame =
      reqMet.foldLeft(df)((df, met) => df.withColumn(met._1, lit(met._2)))
    val dfWithChecks = reqCheck.foldLeft(dfWithMet)((df, met) =>
      df.withColumn(met._1, lit(met._2)))
    val dfWithExtra = extra.foldLeft(dfWithChecks)((df, ex) =>
      df.withColumn(ex._1, lit(ex._2.unwrapped())))

    // NOTE(review): the start of the following call (presumably the write of `dfWithExtra`)
    // is missing from this excerpt.
      settings.refDateString)(fs, sqlContext.sparkContext)

    new HdfsFile(target)
Example 5
Source File: SchemaColumnFixed.scala — from data-faker, MIT License (5 votes)
package com.dunnhumby.datafaker.schema.table.columns

import java.sql.{Date, Timestamp}
import com.dunnhumby.datafaker.YamlParser.YamlParserProtocol
import org.apache.spark.sql.Column
import org.apache.spark.sql.functions.lit

/**
 * Schema column whose Spark expression is the literal `value`.
 *
 * @param name  column name
 * @param value constant wrapped with `lit`
 */
case class SchemaColumnFixed[T](override val name: String, value: T) extends SchemaColumn {
  // `rowID` is accepted for interface compatibility but is not used by this expression.
  override def column(rowID: Option[Column] = None): Column = lit(value)

/** Importable instance of the YAML protocol declared in the trait of the same name. */
object SchemaColumnFixedProtocol extends SchemaColumnFixedProtocol
/** YAML protocol providing a reader for [[SchemaColumnFixed]]; writing is not implemented. */
trait SchemaColumnFixedProtocol extends YamlParserProtocol {

  import net.jcazevedo.moultingyaml._

  implicit object SchemaColumnFixedFormat extends YamlFormat[SchemaColumnFixed[_]] {

    /**
     * Reads the "name", "data_type" and "value" fields of a YAML object and builds a
     * `SchemaColumnFixed` whose value is converted according to "data_type".
     * A missing field or an unsupported data type is reported via `deserializationError`.
     */
    override def read(yaml: YamlValue): SchemaColumnFixed[_] = {
      val fields = yaml.asYamlObject.fields
      // The pattern bindings also require the first two fields to be YAML strings.
      val YamlString(name) = fields.getOrElse(YamlString("name"), deserializationError("name not set"))
      val YamlString(dataType) = fields.getOrElse(YamlString("data_type"), deserializationError(s"data_type not set for $name"))
      val value = fields.getOrElse(YamlString("value"), deserializationError(s"value not set for $name"))

      // One branch per supported data type; the SchemaColumnDataType constants are defined
      // outside this excerpt.
      dataType match {
        case SchemaColumnDataType.Int => SchemaColumnFixed(name, value.convertTo[Int])
        case SchemaColumnDataType.Long => SchemaColumnFixed(name, value.convertTo[Long])
        case SchemaColumnDataType.Float => SchemaColumnFixed(name, value.convertTo[Float])
        case SchemaColumnDataType.Double => SchemaColumnFixed(name, value.convertTo[Double])
        case SchemaColumnDataType.Date => SchemaColumnFixed(name, value.convertTo[Date])
        case SchemaColumnDataType.Timestamp => SchemaColumnFixed(name, value.convertTo[Timestamp])
        case SchemaColumnDataType.String => SchemaColumnFixed(name, value.convertTo[String])
        case SchemaColumnDataType.Boolean => SchemaColumnFixed(name, value.convertTo[Boolean])
        case _ => deserializationError(s"unsupported data_type: $dataType for ${SchemaColumnType.Fixed}")


    // Serialisation is intentionally left unimplemented: calling it throws NotImplementedError.
    override def write(obj: SchemaColumnFixed[_]): YamlValue = ???


Example 6
Source File: SimpleJsonIngestionJob.scala — from comet-data-pipeline, Apache License 2.0 (5 votes)
package com.ebiznext.comet.job.ingest

import com.ebiznext.comet.config.Settings
import com.ebiznext.comet.schema.handlers.{SchemaHandler, StorageHandler}
import com.ebiznext.comet.schema.model._
import org.apache.hadoop.fs.Path
import org.apache.spark.sql.functions.lit
import org.apache.spark.sql.{DataFrame, Encoders}

import scala.util.{Failure, Success, Try}

/**
 * Ingestion job for JSON input that reuses `DsvIngestionJob` and overrides how the
 * DataFrame is loaded. All constructor arguments are forwarded unchanged to the parent.
 *
 * NOTE(review): large parts of `loadDataSet` are missing from this excerpt; the comments
 * below describe only the statements that are visible.
 */
class SimpleJsonIngestionJob(
  domain: Domain,
  schema: Schema,
  types: List[Type],
  path: List[Path],
  storageHandler: StorageHandler,
  schemaHandler: SchemaHandler
)(implicit settings: Settings)
    extends DsvIngestionJob(domain, schema, types, path, storageHandler, schemaHandler) {

  /**
   * Loads the input as a DataFrame, taking a different path depending on
   * `metadata.isArray()`, and rejects input that produced a "_corrupt_record" column.
   */
  override def loadDataSet(): Try[DataFrame] = {
    try {

      // NOTE(review): the array branch is truncated. What remains picks a literal column:
      // "," when `settings.comet.grouped` is set, otherwise the first input path.
      val df =
        if (metadata.isArray()) {
          val jsonRDD =

              //  Spark cannot detect the input file automatically, so we should add it explicitly
              if (settings.comet.grouped) lit(","))
              else lit(path.head.toString)

        } else {

            .option("encoding", metadata.getEncoding())
            .option("multiline", metadata.getMultiline())
            .json( _*)
              //  Spark here can detect the input file automatically, so we're just using the input_file_name spark function

      import session.implicits._
      // When a "_corrupt_record" column is present, up to 1000 offending rows are shown
      // (debug logging only) and the load is aborted with an exception.
      val resDF = if (df.columns.contains("_corrupt_record")) {
        //TODO send rejected records to rejected area
        logger.whenDebugEnabled {
          df.filter($"_corrupt_record".isNotNull).show(1000, false)
        throw new Exception(
          s"""Invalid JSON File: ${path
            .mkString(",")}. SIMPLE_JSON require a valid json file """
      } else {
    } catch {
      case e: Exception =>
Example 7
Source File: PowerBiSuite.scala — from mmlspark, MIT License (5 votes)
// Copyright (C) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See LICENSE in project root for information.



import org.apache.spark.SparkException
import org.apache.spark.sql.{DataFrame, Dataset, Row}
import org.apache.spark.sql.catalyst.encoders.RowEncoder
import org.apache.spark.sql.functions.{current_timestamp, lit}

import scala.collection.JavaConverters._

/**
 * Tests for `PowerBIWriter.write` in batch and streaming form.
 *
 * NOTE(review): several statements are truncated in this excerpt (the DataFrame
 * construction, the delayed map, one options map and the streaming query).
 */
class PowerBiSuite extends TestBase with FileReaderUtils {

  // Endpoint URL: the MML_POWERBI_URL environment variable wins, otherwise the stored secret.
  lazy val url: String = sys.env.getOrElse("MML_POWERBI_URL", Secrets.PowerbiURL)
  // Five sample rows in columns "bar"/"foo" (one with an empty string, one with a missing
  // "bar"), plus a "baz" column holding the current timestamp.
  lazy val df: DataFrame = session
      (Some(0), "a"),
      (Some(1), "b"),
      (Some(2), "c"),
      (Some(3), ""),
      (None, "bad_row")))
    .toDF("bar", "foo")
    .withColumn("baz", current_timestamp())
  // `df` unioned with itself five times (six copies in total), spread over 2 partitions.
  lazy val bigdf: DataFrame = (1 to 5).foldRight(df) { case (_, ldf) => ldf.union(df) }.repartition(2)
  // 100 collected copies of `df`; the visible lambda sleeps 10 ms per element.
  // NOTE(review): the call around that lambda is garbled in this excerpt.
  lazy val delayDF: DataFrame = {
    val rows = Array.fill(100){df.collect()}.flatten.toList.asJava
    val df2 = session
      .createDataFrame(rows, df.schema)
    df2.count(){x => Thread.sleep(10); x})(RowEncoder(df2.schema))

  test("write to powerBi", TestBase.BuildServer) {
    PowerBIWriter.write(df, url)

  test("write to powerBi with delays"){
    PowerBIWriter.write(delayDF, url)

  test("using dynamic minibatching"){
    PowerBIWriter.write(delayDF, url, Map("minibatcher"->"dynamic", "maxBatchSize"->"50"))

  test("using timed minibatching"){
    PowerBIWriter.write(delayDF, url, Map("minibatcher"->"timed"))

  // NOTE(review): the options map for this test is truncated.
  test("using consolidated timed minibatching"){
    PowerBIWriter.write(delayDF, url, Map(

  test("using buffered batching"){
    PowerBIWriter.write(delayDF, url, Map("buffered"->"true"))

  // Disabled via `ignore`: adds an extra "bad" column and expects the write to fail.
  ignore("throw useful error message when given an improper dataset") {
    //TODO figure out why this does not throw errors on the build machine
    assertThrows[SparkException] {
      PowerBIWriter.write(df.withColumn("bad", lit("foo")), url)

  // Writes `bigdf` as parquet under tmpDir, then reads it back as a stream with df's schema.
  // NOTE(review): the statement creating `q1` is garbled.
  test("stream to powerBi", TestBase.BuildServer) {
    bigdf.write.parquet(tmpDir + File.separator + "powerBI.parquet")
    val sdf = session.readStream.schema(df.schema).parquet(tmpDir + File.separator + "powerBI.parquet")
    val q1 =, url).start()

Example 8
Source File: ServingUDFs.scala — from mmlspark, MIT License (5 votes)
// Copyright (C) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See LICENSE in project root for information.

package org.apache.spark.sql.execution.streaming

import{binary_to_response, empty_response, string_to_response}
import org.apache.spark.sql.execution.streaming.continuous.HTTPSourceStateHolder
import org.apache.spark.sql.expressions.UserDefinedFunction
import org.apache.spark.sql.functions.{lit, struct, to_json, udf}
import org.apache.spark.sql.types._
import org.apache.spark.sql.{Column, Row}

import scala.util.Try

/** Column helpers and UDFs for building and sending HTTP replies. */
object ServingUDFs {

  // Serialises the column to JSON text and wraps it as a response.
  private def jsonReply(c: Column) = string_to_response(to_json(c))

  /**
   * Builds a response column from `data`, choosing the encoding from its Spark data type.
   *
   * @param data   column holding the reply payload
   * @param dt     Spark data type of `data`; selects the branch below
   * @param code   status code column, defaults to `lit(200)`
   * @param reason reason column, defaults to `lit("Success")`
   * @return the response column
   *
   * Only the `NullType` and `StringType` branches pass `code` and `reason` on; every other
   * visible branch ignores them.
   */
  def makeReplyUDF(data: Column, dt: DataType, code: Column = lit(200), reason: Column = lit("Success")): Column = {
    dt match {
      case NullType => empty_response(code, reason)
      case StringType => string_to_response(data, code, reason)
      case BinaryType => binary_to_response(data)
      case _: StructType => jsonReply(data)
      case _: MapType => jsonReply(data)
      // Arrays of structs or maps are serialised directly; arrays of anything else are
      // first wrapped in a struct.
      case at: ArrayType => at.elementType match {
        case _: StructType => jsonReply(data)
        case _: MapType => jsonReply(data)
        case _ => jsonReply(struct(data))
      // Every remaining type is wrapped in a struct before JSON serialisation.
      case _ => jsonReply(struct(data))

  // Converts `reply` with `mapper` and sends it through the named service's server, using
  // the first two string fields of `id` as addressing arguments.
  // NOTE(review): when `reply` or `id` is null the result is `null.asInstanceOf[Boolean]`,
  // which unboxes to `false` in Scala — confirm that is the intended "no reply" value.
  // The tail of the Try expression is truncated in this excerpt.
  private def sendReplyHelper(mapper: Row => HTTPResponseData)(serviceName: String, reply: Row, id: Row): Boolean = {
    if (Option(reply).isEmpty || Option(id).isEmpty) {
      null.asInstanceOf[Boolean] //scalastyle:ignore null
    } else {
      Try(HTTPSourceStateHolder.getServer(serviceName).replyTo(id.getString(0), id.getString(1), mapper(reply)))

  /** UDF form of `sendReplyHelper` with the row converter pre-applied; returns BooleanType. */
  def sendReplyUDF: UserDefinedFunction = {
    val toData = HTTPResponseData.makeFromRowConverter
    udf(sendReplyHelper(toData) _, BooleanType)

Example 9
Source File: DeltaSourceSnapshot.scala — from delta, Apache License 2.0 (5 votes)

import{DeltaLog, DeltaTableUtils, Snapshot}

import org.apache.spark.sql.{Dataset, SparkSession}
import org.apache.spark.sql.catalyst.expressions.Expression
import org.apache.spark.sql.functions.lit

/**
 * View over a Delta `Snapshot` together with a set of filter expressions.
 *
 * @param spark    active Spark session
 * @param snapshot snapshot whose files are exposed
 * @param filters  expressions split into partition-only and data filters
 *
 * NOTE(review): several statements are truncated in this excerpt.
 */
class DeltaSourceSnapshot(
    val spark: SparkSession,
    val snapshot: Snapshot,
    val filters: Seq[Expression])
  extends SnapshotIterator
  with StateCache {

  // Copied from the snapshot for use by this class and its subclasses.
  protected val version = snapshot.version
  protected val path = snapshot.path

  // Splits `filters` into those that reference only partition columns (first element) and
  // the rest (second element); evaluated lazily on first use.
  protected lazy val (partitionFilters, dataFilters) = {
    val partitionCols = snapshot.metadata.partitionColumns
    filters.partition { e =>
      DeltaTableUtils.isPredicatePartitionColumnsOnly(e, partitionCols, spark)

  // Files of the snapshot ordered by modification time, then path, with constant "version"
  // and "isLast" (false) columns added.
  // NOTE(review): the wrapping call and the step that produces the "index" column are
  // missing from this excerpt.
  protected def initialFiles: Dataset[IndexedFile] = {
    import spark.implicits._

      snapshot.allFiles.sort("modificationTime", "path")
        .toDF("add", "index")
        .withColumn("version", lit(version))
        .withColumn("isLast", lit(false))
      s"Delta Source Snapshot #$version - ${snapshot.redactedPath}").getDS

  // NOTE(review): the body is truncated; only the `unpersistSnapshot` guard is visible.
  override def close(unpersistSnapshot: Boolean): Unit = {

    if (unpersistSnapshot) {

/**
 * Iteration support mixed into [[DeltaSourceSnapshot]]; the self-type restricts this trait
 * to that class.
 */
trait SnapshotIterator {
  self: DeltaSourceSnapshot =>

  // Computed on first use by `iterator()`; null until then.
  private var result: Iterable[IndexedFile] = _

  // Populates `result` once, on the first call.
  // NOTE(review): the arguments of `filterFileList` and the returned expression are
  // truncated in this excerpt.
  def iterator(): Iterator[IndexedFile] = {
    import spark.implicits._
    if (result == null) {
      result = DeltaLog.filterFileList(
    // This will always start from the beginning and re-use resources. If any exceptions were to
    // be thrown, the stream would stop, we would call stop on the source, and that will make
    // sure that we clean up resources.

  // Default implementation does nothing.
  def close(unpersistSnapshot: Boolean): Unit = { }
Example 10
Source File: GpuDSArrayMult.scala — from GPUEnabler, Apache License 2.0 (5 votes)

import org.apache.spark.SparkEnv
import org.apache.spark.sql.functions.lit

/**
 * Example program exercising GPU-backed Dataset operations: a map, a map followed by a
 * reduce, and a map over array-valued columns.
 *
 * NOTE(review): several statements are truncated in this excerpt.
 */
object GpuDSArrayMult {

  // Row shapes used by part 3 of the example: JSON input, input with an added result
  // column, and the projected output.
  case class jsonData(name : String, factor: Long, arr: Array[Long])
  case class inputData(name : String, factor: Long, arr: Array[Long], result: Array[Long])
  case class outputData(name: String, result: Array[Long])

  /** Entry point; an optional first argument is stored in the Spark conf as "DebugMode". */
  def main(args : Array[String]): Unit = {

    val ss = org.apache.spark.sql.SparkSession.builder.master("local[*]").appName("test").getOrCreate()
    import ss.implicits._
    if(args.length > 0) {
      println("Setting debug Mode" + args(0))
      SparkEnv.get.conf.set("DebugMode", args(0))

    // Location of the compiled kernel module handed to each DSCUDAFunction below.
    val ptxURL = "/GpuEnablerExamples.ptx"

    // 1. Sample Map Operation - multiply every element in the array by 2
    val mulFunc = DSCUDAFunction("multiplyBy2", Seq("value"), Seq("value"), ptxURL)

    val N: Long = 100000

    // Values 1..N in 10 partitions, cached because the dataset is reused in part 2.
    val dataPts = ss.range(1, N+1, 1, 10).cache
    val results = dataPts.mapExtFunc(_ * 2, mulFunc).collect()
    println("Count is " + results.length)
    assert(results.length == N)

    // Reference values computed on the driver; not used by any visible statement.
    val expResults = (1 to N.toInt).map(_ * 2)

    // 2. Sample Reduce Operation - Sum of all elements in the array
    // Per-stage tuple of six launch values for stages 0 and 1; any other stage is unmatched.
    val dimensions = (size: Long, stage: Int) => stage match {
      case 0 => (64, 256, 1, 1, 1, 1)
      case 1 => (1, 1, 1, 1, 1, 1)

    val gpuParams = gpuParameters(dimensions)

    // NOTE(review): the leading arguments of this call are missing from the excerpt.
    val sumFunc = DSCUDAFunction(
      Some((size: Long) => 2),
      Some(gpuParams), outputSize=Some(1))

    val results2 = dataPts
          .mapExtFunc(_ * 2, mulFunc)
          .reduceExtFunc(_ + _, sumFunc)

    // Doubling 1..N and summing gives 2 * N(N+1)/2 = N * (N + 1).
    println("Output is "+ results2)
    println("Expected is " + (N * (N + 1)))
    assert(results2 == N * (N + 1))

    // 3. Dataset - GPU Map - Dataset Operation.
    // NOTE(review): the reader call before the path literal is truncated.
    val ds ="src/main/resources/data.json").as[jsonData]

    // Adds a null "result" column before viewing the rows as `inputData`.
    // NOTE(review): the literal is typed Array[Double] while inputData.result is
    // Array[Long] — confirm this mismatch is intended.
    val dds = ds.withColumn("result", lit(null: Array[Double] )).as[inputData]
    val dsFunc = DSCUDAFunction("arrayTest", Seq("factor", "arr"), Seq("result"), ptxURL)

    // NOTE(review): this call is garbled in the excerpt (missing field access and select).
    val mapDS = dds.mapExtFunc(x => outputData(, x.result),
      Array((1 to 10).map(_ * 3).toArray, (1 to 35).map(_.toLong).toArray),
      outputArraySizes = Array(3))$"name", $"result").show()

Example 11
Source File: AnyValInstances.scala — from cleanframes, Apache License 2.0 (5 votes)
package cleanframes.instances

import cleanframes.Cleaner
import org.apache.spark.sql.functions.{lower, trim, when, lit}
import org.apache.spark.sql.types._

/** Aggregates the per-type instance traits so they can be mixed in or imported together. */
trait AnyValInstances
  extends IntInstances
    with ByteInstances
    with CharInstances
    with ShortInstances
    with LongInstances
    with FloatInstances
    with DoubleInstances
    with BooleanInstances
    with NumericAnyValInstance

/** Provides the implicit mapping from Scala `Int` to Spark's `IntegerType`. */
trait IntInstances {
  implicit lazy val integerType: SparkDataType[Int] = new SparkDataType[Int] {
    override def getDataType: DataType = IntegerType

/** Provides the implicit mapping from Scala `Byte` to Spark's `ByteType`. */
trait ByteInstances {
  implicit lazy val byteType: SparkDataType[Byte] = new SparkDataType[Byte] {
    override def getDataType: DataType = ByteType

/** Provides an implicit `String => Char` conversion. */
trait CharInstances {
  // Takes the first character; `charAt(0)` throws on an empty string.
  implicit val stdStringToChar: String => Char = _.charAt(0)

/** Provides the implicit mapping from Scala `Short` to Spark's `ShortType`. */
trait ShortInstances {
  implicit lazy val shortType: SparkDataType[Short] = new SparkDataType[Short] {
    override def getDataType: DataType = ShortType

/** Provides the implicit mapping from Scala `Long` to Spark's `LongType`. */
trait LongInstances {
  implicit lazy val longType: SparkDataType[Long] = new SparkDataType[Long] {
    override def getDataType: DataType = LongType

/** Provides the implicit mapping from Scala `Float` to Spark's `FloatType`. */
trait FloatInstances {
  implicit lazy val floatType: SparkDataType[Float] = new SparkDataType[Float] {
    override def getDataType: DataType = FloatType

/** Provides the implicit mapping from Scala `Double` to Spark's `DoubleType`. */
trait DoubleInstances {
  implicit lazy val doubleType: SparkDataType[Double] = new SparkDataType[Double] {
    override def getDataType: DataType = DoubleType

/** Provides the implicit cleaner for optional boolean columns. */
trait BooleanInstances {
  // Compares the trimmed, lower-cased text of the named column with "true": a match yields
  // `true` cast to BooleanType, anything else falls through to `false`. The result is
  // aliased with `alias.get`.
  // NOTE(review): the opening of the conditional (presumably `when(`) is missing from this
  // excerpt. `name.get` and `alias.get` throw if either Option is empty.
  implicit lazy val booleanCleaner: Cleaner[Option[Boolean]] = {
    Cleaner.materialize { (frame, name, alias) =>
          trim(lower(frame.col(name.get))) === "true",
          lit(true) cast BooleanType
        ).otherwise(false) as alias.get
Example 12
Source File: IUberdataForecastUtil.scala — from uberdata, Apache License 2.0 (5 votes)
package eleflow.uberdata

import eleflow.uberdata.core.IUberdataContext
import org.apache.spark.sql.{DataFrame, Row}
import org.apache.spark.sql.functions.lit

/** Column-name constants and row-conversion helpers. */
object IUberdataForecastUtil {

  // Shared column and parameter names.
  lazy val FEATURES_PREDICTION_COL_NAME = "featuresPrediction"
  lazy val FEATURES_COL_NAME = "features"
  lazy val ALGORITHM = "algorithm"
  lazy val PARAMS = "parameters"
  lazy val METRIC_COL_NAME = "metric"

  /**
   * Replaces the value at `columnIndex` with its `Long` form and appends the original
   * value as a new last field. Timestamps use `getTime`; Double, Int and Short use
   * `toLong`. Rows whose value has any other type are returned unchanged.
   *
   * @param row         source row
   * @param columnIndex position of the value to convert
   * @return the rebuilt row, one field longer than the input when a conversion applied
   */
  def convertColumnToLong(row: Row, columnIndex: Int): Row = {
    row.get(columnIndex) match {
      case s: java.sql.Timestamp =>
        // `after.head` is the value being replaced, hence `after.tail`.
        val (prior, after) = row.toSeq.splitAt(columnIndex)
        val result = (prior :+ s.getTime) ++ after.tail :+ s
        Row(result: _*)
      case d: Double =>
        val (prior, after) = row.toSeq.splitAt(columnIndex)
        val result = (prior :+ d.toLong) ++ after.tail :+ d
        Row(result: _*)
      case i: Int =>
        val (prior, after) = row.toSeq.splitAt(columnIndex)
        val result = (prior :+ i.toLong) ++ after.tail :+ i
        Row(result: _*)
      case s: Short =>
        val (prior, after) = row.toSeq.splitAt(columnIndex)
        val result = (prior :+ s.toLong) ++ after.tail :+ s
        Row(result: _*)
      case _ => row

  /**
   * Keeps the row as is and appends the `Long` form of the value at `columnIndex` as a new
   * last field. Rows whose value has an unsupported type are left unchanged.
   *
   * NOTE(review): the statement after the match (presumably returning `result`) is not
   * visible in this excerpt.
   */
  def convertColumnToLongAddAtEnd(row: Row, columnIndex: Int): Row = {
    val result = row.get(columnIndex) match {
      case s: java.sql.Timestamp =>
        val result = row.toSeq :+ s.getTime
        Row(result: _*)
      case d: Double =>
        val result = row.toSeq :+ d.toLong
        Row(result: _*)
      case i: Int =>
        val result = row.toSeq :+ i.toLong
        Row(result: _*)
      case s: Short =>
        val result = row.toSeq :+ s.toLong
        Row(result: _*)
      case _ => row

  // Builds a DataFrame with an additional Long "idCol" column; the schema is derived by
  // adding a literal 1L column to the input frame.
  // NOTE(review): the expression producing the (row, id) pairs and the DataFrame creation
  // call are truncated in this excerpt.
  def createIdColColumn(dataFrame : DataFrame, context : IUberdataContext) : DataFrame = {
    val arrId =
      x => x._1.toSeq :+ x._2
      x => Row.fromSeq(x))
      dataFrame.withColumn("idCol", lit(1L : Long)).schema)

Example 13
Source File: SuiteKickoff.scala — from spark-bench, Apache License 2.0 (5 votes)

import org.apache.spark.sql.{DataFrame, Row, SparkSession}
import org.apache.spark.sql.functions.{col, lit}

import scala.collection.parallel.ForkJoinTaskSupport

/**
 * Runs a benchmark suite: executes its workloads `repeat` times, merges the per-workload
 * result frames into one, attaches Spark configuration and the description, and
 * optionally writes the result.
 *
 * NOTE(review): several statements are truncated in this excerpt.
 */
object SuiteKickoff {
  private val log = org.slf4j.LoggerFactory.getLogger(getClass)

  /**
   * Executes suite `s` with the given session.
   *
   * @param s     suite description (workloads, repeat count, parallel flag, output settings)
   * @param spark active Spark session
   */
  def run(s: Suite, spark: SparkSession): Unit = {
    verifyOutput(s.benchmarkOutput, s.saveMode, spark)

    // Translate the maps into runnable workloads
    val workloads: Seq[Workload] =

    val dataframes: Seq[DataFrame] = (0 until s.repeat).flatMap { i =>
      // This will produce one DataFrame of one row for each workload in the sequence.
      // We're going to produce one coherent DF later from these
      val dfSeqFromOneRun: Seq[DataFrame] = {
        if (s.parallel) runParallel(workloads, spark)
        else runSerially(workloads, spark)
      // Indicate which run of this suite this was."run", lit(i)))

    // getting the Spark confs so we can output them in the results.
    val strSparkConfs = spark.conf.getAll

    // Ah, see, here's where we're joining that series of one-row DFs
    val singleDF = joinDataFrames(dataframes, spark)
    // And now we're going to curry in the results
    val plusSparkConf = addConfToResults(singleDF, strSparkConfs)
    val plusDescription = addConfToResults(plusSparkConf, Map("description" -> s.description)).coalesce(1)
    // And write to disk. We're done with this suite!
    if(s.benchmarkOutput.nonEmpty) writeToDisk(s.benchmarkOutput.get, s.saveMode, plusDescription, spark)

  // Uses a parallel collection backed by a fork-join pool with one slot per workload.
  // NOTE(review): the statement that actually runs the workloads is missing here.
  private def runParallel(workloadConfigs: Seq[Workload], spark: SparkSession): Seq[DataFrame] = {
    val confSeqPar = workloadConfigs.par
    confSeqPar.tasksupport = new ForkJoinTaskSupport(new scala.concurrent.forkjoin.ForkJoinPool(confSeqPar.size))

  // NOTE(review): the body is missing from this excerpt.
  private def runSerially(workloadConfigs: Seq[Workload], spark: SparkSession): Seq[DataFrame] = {

  // Unions frames with differing columns: a single frame is returned untouched; otherwise
  // each frame is projected onto the union of all column names (missing columns become
  // null literals) before the frames are unioned.
  private def joinDataFrames(seq: Seq[DataFrame], spark: SparkSession): DataFrame = {
    if (seq.length == 1) seq.head
    else {
      val seqOfColNames =
      val allTheColumns = seqOfColNames.foldLeft(Set[String]())(_ ++ _)

      // Existing columns are selected by name; absent ones are null literals with that name.
      def expr(myCols: Set[String], allCols: Set[String]) = { {
          case x if myCols.contains(x) => col(x)
          case x => lit(null).as(x)

      val seqFixedDfs = =>, allTheColumns): _*))

      // Folding left across this sequence should be fine because each DF should only have 1 row
      // Nevarr Evarr do this to legit dataframes that are all like big and stuff
      seqFixedDfs.foldLeft(spark.createDataFrame(spark.sparkContext.emptyRDD[Row], seqFixedDfs.head.schema))(_ union _)
Example 14
Source File: StructuredStreamingWordCount.scala — from structured-streaming-application, Apache License 2.0 (5 votes)
package knolx.spark

import com.datastax.driver.core.Cluster
import knolx.Config._
import knolx.KnolXLogger
import knolx.spark.CassandraForeachWriter.writeToCassandra
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.{col, lit, sum}
import org.apache.spark.sql.streaming.OutputMode
import org.apache.spark.sql.types.StringType

/**
 * Streaming word-count application: prepares a Cassandra keyspace and table, reads a
 * Kafka topic as a stream and writes word counts out.
 *
 * NOTE(review): the stream construction and the query definition are truncated in this
 * excerpt; `cassandraHosts`, `keyspace`, `topic` etc. come from `knolx.Config._`.
 */
object StructuredStreamingWordCount extends App with KnolXLogger {
  val cluster = Cluster.builder.addContactPoints(cassandraHosts).build
  val session = cluster.newSession()

  // Idempotent setup: both statements use IF NOT EXISTS.
  info("Creating Keypsace and tables in Cassandra...")
  session.execute(s"CREATE KEYSPACE IF NOT EXISTS $keyspace WITH " +
    "replication = {'class':'SimpleStrategy','replication_factor':1};")

  session.execute(s"CREATE TABLE IF NOT EXISTS $keyspace.wordcount ( word text PRIMARY KEY,count int );")

  // NOTE(review): no close call is visible after this message in the excerpt.
  info("Closing DB connection...")

  info("Creating Spark Session")
  val spark = SparkSession.builder().master(sparkMaster).appName(sparkAppName).getOrCreate()

  info("Creating Streaming DF...")
  val dataStream =
      .option("kafka.bootstrap.servers", bootstrapServer)
      .option("subscribe", topic)

  // Each incoming value is cast to a string "word" and paired with a constant count of 1.
  info("Writing data to Cassandra...")
  val query =
      .select(col("value").cast(StringType).as("word"), lit(1).as("count"))
      .option("checkpointLocation", checkPointDir)

  info("Waiting for the query to terminate...")
Example 15
Source File: AFTSurvivalRegressionParitySpec.scala — from mleap, Apache License 2.0 (5 votes)

import{OneHotEncoderEstimator, StringIndexer, VectorAssembler}
import{Pipeline, Transformer}
import org.apache.spark.sql._
import org.apache.spark.sql.functions.lit

/**
 * Parity spec for an AFT survival regression pipeline.
 *
 * NOTE(review): the dataset source and most pipeline stage setters are truncated in this
 * excerpt.
 */
class AFTSurvivalRegressionParitySpec extends SparkParityBase {
  // Three selected columns plus a constant "censor" column of 1.0.
  override val dataset: DataFrame ="fico_score_group_fnl", "dti", "loan_amount").withColumn("censor", lit(1.0))
  // Pipeline stages in order: StringIndexer, OneHotEncoderEstimator, VectorAssembler
  // (inputs "fico" and "dti"), AFTSurvivalRegression.
  override val sparkTransformer: Transformer = new Pipeline().setStages(Array(new StringIndexer().
    new OneHotEncoderEstimator().
    new VectorAssembler().
      setInputCols(Array("fico", "dti")).
    new AFTSurvivalRegression().

  // Parameter names listed as unserialized for the parity base class.
  override val unserializedParams = Set("labelCol", "stringOrderType", "maxIter", "tol")
Example 16
Source File: WholeStageCodegenSparkSubmitSuite.scala — from XSQL, Apache License 2.0 (5 votes)
package org.apache.spark.sql.execution

import org.scalatest.{Assertions, BeforeAndAfterEach, Matchers}
import org.scalatest.concurrent.TimeLimits

import org.apache.spark.{SparkFunSuite, TestUtils}
import org.apache.spark.deploy.SparkSubmitSuite
import org.apache.spark.internal.Logging
import org.apache.spark.sql.{LocalSparkSession, QueryTest, Row, SparkSession}
import org.apache.spark.sql.functions.{array, col, count, lit}
import org.apache.spark.sql.types.IntegerType
import org.apache.spark.unsafe.Platform
import org.apache.spark.util.ResetSystemProperties

// Due to the need to set driver's extraJavaOptions, this test needs to use actual SparkSubmit.
/**
 * Launches the companion object's `main` through spark-submit so that the driver and the
 * executor can run with different JVM options.
 */
class WholeStageCodegenSparkSubmitSuite extends SparkFunSuite
  with Matchers
  with BeforeAndAfterEach
  with ResetSystemProperties {

  test("Generated code on driver should not embed platform-specific constant") {
    // Empty jar created for the submission; it is not referenced by any visible argument.
    val unusedJar = TestUtils.createJarWithClasses(Seq.empty)

    // HotSpot JVM specific: Set up a local cluster with the driver/executor using mismatched
    // settings of UseCompressedOops JVM option.
    // NOTE(review): one "--conf" value is empty and the end of the Seq is truncated in this
    // excerpt.
    val argsForSparkSubmit = Seq(
      "--class", WholeStageCodegenSparkSubmitSuite.getClass.getName.stripSuffix("$"),
      "--master", "local-cluster[1,1,1024]",
      "--driver-memory", "1g",
      "--conf", "spark.ui.enabled=false",
      "--conf", "",
      "--conf", "spark.driver.extraJavaOptions=-XX:-UseCompressedOops",
      "--conf", "spark.executor.extraJavaOptions=-XX:+UseCompressedOops",
    SparkSubmitSuite.runSparkSubmit(argsForSparkSubmit, "../..")

/** Application body executed inside the submitted Spark job. */
object WholeStageCodegenSparkSubmitSuite extends Assertions with Logging {

  var spark: SparkSession = _

  /** Checks the driver/executor layout mismatch, then verifies a small query's result. */
  def main(args: Array[String]): Unit = {

    spark = SparkSession.builder().getOrCreate()

    // Make sure the test is run where the driver and the executors uses different object layouts
    val driverArrayHeaderSize = Platform.BYTE_ARRAY_OFFSET
    // Same constant, but read inside a task so it reflects the executor JVM.
    val executorArrayHeaderSize =
      spark.sparkContext.range(0, 1).map(_ => Platform.BYTE_ARRAY_OFFSET).collect.head.toInt
    assert(driverArrayHeaderSize > executorArrayHeaderSize)

    // Column "v" is id % 10 for ids 0..71772.
    // NOTE(review): the grouping/aggregation that would produce (array, count) rows is not
    // visible in this excerpt.
    val df = spark.range(71773).select((col("id") % lit(10)).cast(IntegerType) as "v")
    val plan = df.queryExecution.executedPlan

    // 71773 = 7177 * 10 + 3, so remainders 0..2 occur 7178 times and 3..9 occur 7177 times.
    val expectedAnswer =
      Row(Array(0), 7178) ::
        Row(Array(1), 7178) ::
        Row(Array(2), 7178) ::
        Row(Array(3), 7177) ::
        Row(Array(4), 7177) ::
        Row(Array(5), 7177) ::
        Row(Array(6), 7177) ::
        Row(Array(7), 7177) ::
        Row(Array(8), 7177) ::
        Row(Array(9), 7177) :: Nil
    val result = df.collect
    // `sameRows` returns an error message on mismatch; any message fails the run.
    QueryTest.sameRows(result.toSeq, expectedAnswer) match {
      case Some(errMsg) => fail(errMsg)
      case _ =>
Example 17
Source File: ConcatArrowAndExplodeSpec.scala — from flint, Apache License 2.0 (5 votes)
package com.twosigma.flint.timeseries

import java.nio.channels.Channels
import java.util.concurrent.TimeUnit

import com.twosigma.flint.arrow.ArrowUtils
import org.apache.arrow.memory.RootAllocator
import org.apache.arrow.vector.ipc.ArrowFileWriter
import org.apache.arrow.vector.{ BigIntVector, Float8Vector, VectorSchemaRoot }
import org.apache.spark.sql.functions.{ array, col, lit, struct }
import org.apache.spark.sql.types._

/**
 * Spec for `concatArrowAndExplode`: builds one row holding an array of base rows plus two
 * Arrow-encoded column batches, and checks the exploded result against an expected frame.
 *
 * NOTE(review): the loop bodies' closing braces and the Arrow write calls are not visible
 * in this excerpt.
 */
class ConcatArrowAndExplodeSpec extends TimeSeriesSuite {

  "ConcatArrowAndExplode" should "work" in {

    val batchSize = 10

    // A single row with time = 1000, extended with an array of `batchSize` structs
    // (time + v, v as double) in "base_rows".
    var df = spark.range(1000, 2000, 1000).toDF("time")
    val columns = (0 until batchSize).map(v => struct((df("time") + v).as("time"), lit(v.toDouble).as("v")))
    df = df.withColumn("base_rows", array(columns: _*))

    val allocator = new RootAllocator(Long.MaxValue)

    // First Arrow batch: one double column "v1" filled with i + 10.0.
    val schema1 = StructType(Seq(StructField("v1", DoubleType)))
    val root1 = VectorSchemaRoot.create(ArrowUtils.toArrowSchema(schema1), allocator)
    val vector1 = root1.getVector("v1").asInstanceOf[Float8Vector]

    for (i <- 0 until batchSize) {
      vector1.set(i, i + 10.0)
    val out1 = new ByteArrayOutputStream()
    val arrowWriter1 = new ArrowFileWriter(root1, null, Channels.newChannel(out1))
    // The schema column carries a placeholder struct; the data column carries the bytes.
    df = df.withColumn("f1_schema", struct(lit(0.0).as("v1")))
    df = df.withColumn("f1_data", lit(out1.toByteArray))

    // Second Arrow batch: double column "v2" (i + 20.0) and long column "v3" (i + 30).
    val schema2 = StructType(Seq(StructField("v2", DoubleType), StructField("v3", LongType)))
    val root2 = VectorSchemaRoot.create(ArrowUtils.toArrowSchema(schema2), allocator)
    val vector2 = root2.getVector("v2").asInstanceOf[Float8Vector]
    val vector3 = root2.getVector("v3").asInstanceOf[BigIntVector]

    for (i <- 0 until batchSize) {
      vector2.set(i, i + 20.0)

    for (i <- 0 until batchSize) {
      vector3.set(i, i + 30L)
    val out2 = new ByteArrayOutputStream()
    val arrowWriter2 = new ArrowFileWriter(root2, null, Channels.newChannel(out2))
    df = df.withColumn("f2_schema", struct(lit(0.0).as("v2"), lit(0L).as("v3")))
    df = df.withColumn("f2_data", lit(out2.toByteArray))

    var tsrdd = TimeSeriesRDD.fromDF(df)(isSorted = false, timeUnit = TimeUnit.NANOSECONDS)
    tsrdd = tsrdd.concatArrowAndExplode("base_rows", Seq("f1_schema", "f2_schema"), Seq("f1_data", "f2_data"))

    // Expected output: one row per batch index, every value derived from time - 1000.
    var expected = spark.range(1000, 1000 + batchSize).toDF("time")
    expected = expected.withColumn("v", col("time") - 1000.0)
    expected = expected.withColumn("v1", col("time") - 1000 + 10.0)
    expected = expected.withColumn("v2", col("time") - 1000 + 20.0)
    expected = expected.withColumn("v3", col("time") - 1000 + 30)

    val expectedTsrdd = TimeSeriesRDD.fromDF(expected)(isSorted = false, timeUnit = TimeUnit.NANOSECONDS)
    assertEquals(tsrdd, expectedTsrdd)

Example 18
Source File: UserActionsRateSource.scala — from spark-structured-streaming-examples, Apache License 2.0 (5 votes)
package com.phylosoft.spark.learning.sql.streaming.source.rate

import org.apache.spark.sql.functions.{col, lit, pmod, rand}
import org.apache.spark.sql.{DataFrame, SparkSession}

/**
 * Source of user-action rows exposing `userId` and `actionTime` columns.
 *
 * NOTE(review): this listing looks truncated by extraction — the expression
 * that `.where(...)` is chained onto and the closing braces of the method and
 * class are not present here. Restore them from the upstream project before
 * compiling.
 *
 * @param spark         session made available to the source
 * @param rowsPerSecond defaults to "5"; not referenced in the visible code —
 *                      presumably consumed by `RateSource`, TODO confirm
 * @param numPartitions defaults to "1"; not referenced in the visible code —
 *                      presumably consumed by `RateSource`, TODO confirm
 */
class UserActionsRateSource(val spark: SparkSession,
                            val rowsPerSecond: String = "5",
                            val numPartitions: String = "1")
  extends RateSource {

  /**
   * Filters the upstream rows with a random predicate and projects them to
   * `(userId, actionTime)`.
   *
   * The upstream frame is expected to carry `value` and `timestamp` columns,
   * since both are referenced in the projection below.
   */
  def loadUserActions(): DataFrame = {
      // Keep a row only when a fresh random integer in [0, 100) is below 30.
      .where((rand() * 100).cast("integer") < 30) // 30 out of every 100 user actions
      // userId is `value` modulo 9 (pmod keeps the result non-negative);
      // actionTime is the `timestamp` column under a new name.
      .select(pmod(col("value"), lit(9)).as("userId"), col("timestamp").as("actionTime"))

Example 19
Source File: TestIndexing.scala    From spark-solr   with Apache License 2.0 5 votes vote down vote up
package com.lucidworks.spark

import java.util.UUID

import com.lucidworks.spark.util.SolrDataFrameImplicits._
import com.lucidworks.spark.util.{ConfigurationConstants, SolrCloudUtil, SolrQuerySupport, SolrSupport}
import org.apache.spark.sql.functions.{concat, lit}
import org.apache.spark.sql.types.{DataTypes, StructField, StructType}

class TestIndexing extends TestSuiteBuilder {

  // Reads the NYC taxi CSV sample, adds two "lat,lon" location columns, indexes
  // the rows into a throwaway Solr collection and verifies that the same number
  // of documents can be read back.
  test("Load csv file and index to Solr") {
    val collectionName = "testIndexing-" + UUID.randomUUID().toString
    SolrCloudUtil.buildCollection(zkHost, collectionName, null, 2, cloudClient, sc)
    try {
      val csvFileLocation = "src/test/resources/test-data/nyc_yellow_taxi_sample_1k.csv"
      // NOTE(review): the `sparkSession.read.format(` prefix and the trailing
      // `.load(csvFileLocation)` were lost in this listing and are restored here;
      // confirm against the upstream project.
      val csvDF = sparkSession.read.format("com.databricks.spark.csv")
        .option("header", "true")
        .option("inferSchema", "true")
        .load(csvFileLocation)
      assert(csvDF.count() == 999)

      val solrOpts = Map("zkhost" -> zkHost, "collection" -> collectionName)
      // Build "<latitude>,<longitude>" strings for the pickup and dropoff points.
      val newDF = csvDF
        .withColumn("pickup_location", concat(csvDF.col("pickup_latitude"), lit(","), csvDF.col("pickup_longitude")))
        .withColumn("dropoff_location", concat(csvDF.col("dropoff_latitude"), lit(","), csvDF.col("dropoff_longitude")))
      newDF.write.option("zkhost", zkHost).option(ConfigurationConstants.GENERATE_UNIQUE_KEY, "true").solr(collectionName)

      // Explicit commit to make sure all docs are visible
      val solrCloudClient = SolrSupport.getCachedCloudClient(zkHost)
      solrCloudClient.commit(collectionName, true, true)

      // NOTE(review): reader prefix restored here as well (see note above).
      val solrDF = sparkSession.read.format("solr").options(solrOpts).load()
      assert(solrDF.count() == 999)
    } finally {
      // Drop the per-test collection even when an assertion above fails.
      SolrCloudUtil.deleteCollection(collectionName, cluster)
    }
  }

  // Indexes a small CSV with an explicit field-type mapping
  // (ntitle -> text_en, nrating -> string) and verifies that Solr reports those
  // types for the two fields.
  test("Solr field types config") {
    val collectionName = "testIndexing-" + UUID.randomUUID().toString
    SolrCloudUtil.buildCollection(zkHost, collectionName, null, 2, cloudClient, sc)
    try {
      val csvFileLocation = "src/test/resources/test-data/simple.csv"
      // NOTE(review): the `sparkSession.read.format(` prefix and the trailing
      // `.load(csvFileLocation)` were lost in this listing and are restored here;
      // confirm against the upstream project.
      val csvDF = sparkSession.read.format("com.databricks.spark.csv")
        .option("header", "true")
        .option("inferSchema", "true")
        .load(csvFileLocation)
      val solrOpts = Map("zkhost" -> zkHost, "collection" -> collectionName, ConfigurationConstants.SOLR_FIELD_TYPES -> "ntitle:text_en,nrating:string")
      // NOTE(review): the listing never used csvDF or solrOpts, so nothing was
      // indexed before the field-type checks. This write step is reconstructed
      // in the same form as the write in the CSV indexing test; verify it
      // against the upstream project.
      csvDF.write.options(solrOpts).solr(collectionName)

      // Explicit commit to make sure all docs are visible
      val solrCloudClient = SolrSupport.getCachedCloudClient(zkHost)
      solrCloudClient.commit(collectionName, true, true)

      val solrBaseUrl = SolrSupport.getSolrBaseUrl(zkHost)
      val solrUrl = solrBaseUrl + collectionName + "/"

      // An empty field set is passed to the lookup; the two configured fields
      // are then checked by name.
      val fieldTypes = SolrQuerySupport.getFieldTypes(Set.empty, solrUrl, cloudClient, collectionName)
      assert(fieldTypes("nrating").fieldType === "string")
      assert(fieldTypes("ntitle").fieldType === "text_en")
    } finally {
      // Drop the per-test collection even when an assertion above fails.
      SolrCloudUtil.deleteCollection(collectionName, cluster)
    }
  }

  // Builds a four-column insert schema and asks SolrRelation which of those
  // fields would have to be added to a freshly created collection.
  test("Field additions") {
    val insertSchema = StructType(Array(
      StructField("index_only_field", DataTypes.StringType, nullable = true),
      StructField("store_only_field", DataTypes.BooleanType, nullable = true),
      StructField("a_s", DataTypes.StringType, nullable = true),
      StructField("s_b", DataTypes.StringType, nullable = true)
    ))
    // Dashes in the UUID are replaced with underscores for the collection name.
    val collection = "testFieldAdditions" + UUID.randomUUID().toString.replace("-", "_")
    try {
      SolrCloudUtil.buildCollection(zkHost, collection, null, 2, cloudClient, sc)
      val opts = Map("zkhost" -> zkHost, "collection" -> collection)

      val solrRelation = new SolrRelation(opts, sparkSession)
      // NOTE(review): no assertion on fieldsToAdd is visible in this listing;
      // the expected result should be restored from the upstream project.
      val fieldsToAdd = SolrRelation.getFieldsToAdd(insertSchema, solrRelation.conf, solrRelation.solrVersion, solrRelation.dynamicSuffixes)
    } finally {
      // Drop the per-test collection even when the steps above throw.
      SolrCloudUtil.deleteCollection(collection, cluster)
    }
  }
