org.apache.spark.sql.functions.lit Scala Examples

The following examples show how to use org.apache.spark.sql.functions.lit. Each example is taken from an open-source project; the source file, project name, and license are noted above each listing.
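Before the project examples, here is a minimal, self-contained sketch of lit itself (not taken from any of the projects below; the object and column names are illustrative only). lit wraps a plain Scala value in a Column so the value can be used wherever a column expression is expected, for example in withColumn, in arithmetic, or in a predicate.

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.{col, lit}

object LitQuickStart {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("lit-quickstart").getOrCreate()
    import spark.implicits._

    val df = Seq(("a", 1), ("b", 2)).toDF("key", "value")

    df.withColumn("source", lit("demo"))            // constant string column on every row
      .withColumn("doubled", col("value") * lit(2)) // literal used inside an arithmetic expression
      .filter(col("value") >= lit(1))               // literal used in a predicate
      .show()

    spark.stop()
  }
}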
Example 1
Source File: TestUtils.scala, from m3d-engine, with Apache License 2.0
package com.adidas.utils

import org.apache.spark.sql.functions.{col, count, lit}
import org.apache.spark.sql.{DataFrame, Row}

object TestUtils {

  implicit class ExtendedDataFrame(df: DataFrame) {

    def hasDiff(anotherDf: DataFrame): Boolean = {
      def printDiff(incoming: Boolean)(row: Row): Unit = {
        if (incoming) print("+ ") else print("- ")
        println(row)
      }

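      // Group by every column and count occurrences with count(lit(1)), so the comparison also
      // catches rows that appear a different number of times in the two DataFrames.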
      val groupedDf = df.groupBy(df.columns.map(col): _*).agg(count(lit(1))).collect().toSet
      val groupedAnotherDf = anotherDf.groupBy(anotherDf.columns.map(col): _*).agg(count(lit(1))).collect().toSet

      groupedDf.diff(groupedAnotherDf).foreach(printDiff(incoming = true))
      groupedAnotherDf.diff(groupedDf).foreach(printDiff(incoming = false))

      groupedDf.diff(groupedAnotherDf).nonEmpty || groupedAnotherDf.diff(groupedDf).nonEmpty
    }
  }
} 
Example 2
Source File: Uniqueness.scala, from deequ, with Apache License 2.0
package com.amazon.deequ.analyzers

import com.amazon.deequ.analyzers.Analyzers.COUNT_COL
import org.apache.spark.sql.Column
import org.apache.spark.sql.functions.{col, lit, sum}
import org.apache.spark.sql.types.DoubleType


case class Uniqueness(columns: Seq[String], where: Option[String] = None)
  extends ScanShareableFrequencyBasedAnalyzer("Uniqueness", columns)
  with FilterableAnalyzer {

  override def aggregationFunctions(numRows: Long): Seq[Column] = {
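    // Fraction of value combinations that occur exactly once (COUNT_COL == 1) among all rows.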
    (sum(col(COUNT_COL).equalTo(lit(1)).cast(DoubleType)) / numRows) :: Nil
  }

  override def filterCondition: Option[String] = where
}

object Uniqueness {
  def apply(column: String): Uniqueness = {
    new Uniqueness(column :: Nil)
  }

  def apply(column: String, where: Option[String]): Uniqueness = {
    new Uniqueness(column :: Nil, where)
  }
} 
Example 3
Source File: UniqueValueRatio.scala, from deequ, with Apache License 2.0
package com.amazon.deequ.analyzers

import com.amazon.deequ.analyzers.Analyzers.COUNT_COL
import com.amazon.deequ.metrics.DoubleMetric
import org.apache.spark.sql.{Column, Row}
import org.apache.spark.sql.functions.{col, count, lit, sum}
import org.apache.spark.sql.types.DoubleType

case class UniqueValueRatio(columns: Seq[String], where: Option[String] = None)
  extends ScanShareableFrequencyBasedAnalyzer("UniqueValueRatio", columns)
  with FilterableAnalyzer {

  override def aggregationFunctions(numRows: Long): Seq[Column] = {
    sum(col(COUNT_COL).equalTo(lit(1)).cast(DoubleType)) :: count("*") :: Nil
  }

  override def fromAggregationResult(result: Row, offset: Int): DoubleMetric = {
    val numUniqueValues = result.getDouble(offset)
    val numDistinctValues = result.getLong(offset + 1).toDouble

    toSuccessMetric(numUniqueValues / numDistinctValues)
  }

  override def filterCondition: Option[String] = where
}

object UniqueValueRatio {
  def apply(column: String): UniqueValueRatio = {
    new UniqueValueRatio(column :: Nil)
  }

  def apply(column: String, where: Option[String]): UniqueValueRatio = {
    new UniqueValueRatio(column :: Nil, where)
  }
} 
Example 4
Source File: EnrichPostprocessor.scala, from DataQuality, with GNU Lesser General Public License v3.0
package it.agilelab.bigdata.DataQuality.postprocessors

import java.util

import com.typesafe.config.Config
import it.agilelab.bigdata.DataQuality.checks.CheckResult
import it.agilelab.bigdata.DataQuality.exceptions.IllegalParameterException
import it.agilelab.bigdata.DataQuality.metrics.MetricResult
import it.agilelab.bigdata.DataQuality.sources.HdfsFile
import it.agilelab.bigdata.DataQuality.targets.HdfsTargetConfig
import it.agilelab.bigdata.DataQuality.utils
import it.agilelab.bigdata.DataQuality.utils.DQSettings
import it.agilelab.bigdata.DataQuality.utils.io.{HdfsReader, HdfsWriter}
import org.apache.hadoop.fs.FileSystem
import org.apache.spark.sql.functions.lit
import org.apache.spark.sql.{DataFrame, SQLContext}

import scala.collection.JavaConversions._
import scala.util.Try

final class EnrichPostprocessor(config: Config, settings: DQSettings)
    extends BasicPostprocessor(config, settings) {

  private val vs: Option[String] = Try(config.getString("source")).toOption
  private val metrics: util.List[String] = config.getStringList("metrics")
  private val checks: util.List[String] = config.getStringList("checks")
  private val extra = config.getObject("extra").toMap

  private val target: HdfsTargetConfig = {
    val conf = config.getConfig("saveTo")
    utils.parseTargetConfig(conf)(settings).get
  }

  override def process(vsRef: Set[HdfsFile],
                       metRes: Seq[MetricResult],
                       chkRes: Seq[CheckResult])(
      implicit fs: FileSystem,
      sqlContext: SQLContext,
      settings: DQSettings): HdfsFile = {

    import sqlContext.implicits._

    val df: DataFrame = vs match {
      case Some(vsource) =>
        val reqVS: HdfsFile = vsRef.filter(vr => vr.id == vsource).head
        HdfsReader.load(reqVS, settings.ref_date).head
      case None =>
        sqlContext.sparkContext.parallelize(Seq(1)).toDF("teapot")
    }

    val reqMet: Seq[(String, Double)] = metRes
      .filter(mr => metrics.contains(mr.metricId))
      .map(mr => mr.metricId -> mr.result)
    val reqCheck: Seq[(String, String)] = chkRes
      .filter(cr => checks.contains(cr.checkId))
      .map(cr => cr.checkId -> cr.status)

    if (reqMet.size != metrics.size())
      throw IllegalParameterException("Some of stated metrics are missing!")
    if (reqCheck.size != checks.size())
      throw IllegalParameterException("Some of stated checks are missing!")

    val dfWithMet: DataFrame =
      reqMet.foldLeft(df)((df, met) => df.withColumn(met._1, lit(met._2)))
    val dfWithChecks = reqCheck.foldLeft(dfWithMet)((df, met) =>
      df.withColumn(met._1, lit(met._2)))
    val dfWithExtra = extra.foldLeft(dfWithChecks)((df, ex) =>
      df.withColumn(ex._1, lit(ex._2.unwrapped())))

    HdfsWriter.saveVirtualSource(
      dfWithExtra.drop("teapot"),
      target,
      settings.refDateString)(fs, sqlContext.sparkContext)

    new HdfsFile(target)
  }
} 
Example 5
Source File: SchemaColumnFixed.scala, from data-faker, with MIT License
package com.dunnhumby.datafaker.schema.table.columns

import java.sql.{Date, Timestamp}
import com.dunnhumby.datafaker.YamlParser.YamlParserProtocol
import org.apache.spark.sql.Column
import org.apache.spark.sql.functions.lit

case class SchemaColumnFixed[T](override val name: String, value: T) extends SchemaColumn {
  override def column(rowID: Option[Column] = None): Column = lit(value)
}

object SchemaColumnFixedProtocol extends SchemaColumnFixedProtocol
trait SchemaColumnFixedProtocol extends YamlParserProtocol {

  import net.jcazevedo.moultingyaml._

  implicit object SchemaColumnFixedFormat extends YamlFormat[SchemaColumnFixed[_]] {

    override def read(yaml: YamlValue): SchemaColumnFixed[_] = {
      val fields = yaml.asYamlObject.fields
      val YamlString(name) = fields.getOrElse(YamlString("name"), deserializationError("name not set"))
      val YamlString(dataType) = fields.getOrElse(YamlString("data_type"), deserializationError(s"data_type not set for $name"))
      val value = fields.getOrElse(YamlString("value"), deserializationError(s"value not set for $name"))

      dataType match {
        case SchemaColumnDataType.Int => SchemaColumnFixed(name, value.convertTo[Int])
        case SchemaColumnDataType.Long => SchemaColumnFixed(name, value.convertTo[Long])
        case SchemaColumnDataType.Float => SchemaColumnFixed(name, value.convertTo[Float])
        case SchemaColumnDataType.Double => SchemaColumnFixed(name, value.convertTo[Double])
        case SchemaColumnDataType.Date => SchemaColumnFixed(name, value.convertTo[Date])
        case SchemaColumnDataType.Timestamp => SchemaColumnFixed(name, value.convertTo[Timestamp])
        case SchemaColumnDataType.String => SchemaColumnFixed(name, value.convertTo[String])
        case SchemaColumnDataType.Boolean => SchemaColumnFixed(name, value.convertTo[Boolean])
        case _ => deserializationError(s"unsupported data_type: $dataType for ${SchemaColumnType.Fixed}")
      }

    }

    override def write(obj: SchemaColumnFixed[_]): YamlValue = ???

  }

} 
Example 6
Source File: SimpleJsonIngestionJob.scala, from comet-data-pipeline, with Apache License 2.0
package com.ebiznext.comet.job.ingest

import com.ebiznext.comet.config.Settings
import com.ebiznext.comet.schema.handlers.{SchemaHandler, StorageHandler}
import com.ebiznext.comet.schema.model._
import org.apache.hadoop.fs.Path
import org.apache.spark.sql.functions.lit
import org.apache.spark.sql.{DataFrame, Encoders}

import scala.util.{Failure, Success, Try}


class SimpleJsonIngestionJob(
  domain: Domain,
  schema: Schema,
  types: List[Type],
  path: List[Path],
  storageHandler: StorageHandler,
  schemaHandler: SchemaHandler
)(implicit settings: Settings)
    extends DsvIngestionJob(domain, schema, types, path, storageHandler, schemaHandler) {

  override def loadDataSet(): Try[DataFrame] = {
    try {

      val df =
        if (metadata.isArray()) {
          val jsonRDD =
            session.sparkContext.wholeTextFiles(path.map(_.toString).mkString(",")).map(_._2)

          session.read
            .json(session.createDataset(jsonRDD)(Encoders.STRING))
            .withColumn(
              //  Spark cannot detect the input file automatically, so we should add it explicitly
              Settings.cometInputFileNameColumn,
              if (settings.comet.grouped) lit(path.map(_.toString).mkString(","))
              else lit(path.head.toString)
            )

        } else {
          session.read
            .option("encoding", metadata.getEncoding())
            .option("multiline", metadata.getMultiline())
            .json(path.map(_.toString): _*)
            .withColumn(
              //  Spark here can detect the input file automatically, so we're just using the input_file_name spark function
              Settings.cometInputFileNameColumn,
              org.apache.spark.sql.functions.input_file_name()
            )
        }

      import session.implicits._
      val resDF = if (df.columns.contains("_corrupt_record")) {
        //TODO send rejected records to rejected area
        logger.whenDebugEnabled {
          df.filter($"_corrupt_record".isNotNull).show(1000, false)
        }
        throw new Exception(
          s"""Invalid JSON File: ${path
            .map(_.toString)
            .mkString(",")}. SIMPLE_JSON require a valid json file """
        )
      } else {
        df
      }
      Success(
        resDF
      )
    } catch {
      case e: Exception =>
        Failure(e)
    }
  }
} 
Example 7
Source File: PowerBiSuite.scala, from mmlspark, with MIT License
// Copyright (C) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See LICENSE in project root for information.

package com.microsoft.ml.spark.io.split1

import java.io.File

import com.microsoft.ml.spark.Secrets
import com.microsoft.ml.spark.core.test.base.TestBase
import com.microsoft.ml.spark.io.powerbi.PowerBIWriter
import org.apache.spark.SparkException
import org.apache.spark.sql.{DataFrame, Dataset, Row}
import org.apache.spark.sql.catalyst.encoders.RowEncoder
import org.apache.spark.sql.functions.{current_timestamp, lit}

import scala.collection.JavaConverters._

class PowerBiSuite extends TestBase with FileReaderUtils {

  lazy val url: String = sys.env.getOrElse("MML_POWERBI_URL", Secrets.PowerbiURL)
  lazy val df: DataFrame = session
    .createDataFrame(Seq(
      (Some(0), "a"),
      (Some(1), "b"),
      (Some(2), "c"),
      (Some(3), ""),
      (None, "bad_row")))
    .toDF("bar", "foo")
    .withColumn("baz", current_timestamp())
  lazy val bigdf: DataFrame = (1 to 5).foldRight(df) { case (_, ldf) => ldf.union(df) }.repartition(2)
  lazy val delayDF: DataFrame = {
    val rows = Array.fill(100){df.collect()}.flatten.toList.asJava
    val df2 = session
      .createDataFrame(rows, df.schema)
      .coalesce(1).cache()
    df2.count()
    df2.map({x => Thread.sleep(10); x})(RowEncoder(df2.schema))
  }

  test("write to powerBi", TestBase.BuildServer) {
    PowerBIWriter.write(df, url)
  }

  test("write to powerBi with delays"){
    PowerBIWriter.write(delayDF, url)
  }

  test("using dynamic minibatching"){
    PowerBIWriter.write(delayDF, url, Map("minibatcher"->"dynamic", "maxBatchSize"->"50"))
  }

  test("using timed minibatching"){
    PowerBIWriter.write(delayDF, url, Map("minibatcher"->"timed"))
  }

  test("using consolidated timed minibatching"){
    PowerBIWriter.write(delayDF, url, Map(
      "minibatcher"->"timed",
      "consolidate"->"true"))
  }

  test("using buffered batching"){
    PowerBIWriter.write(delayDF, url, Map("buffered"->"true"))
  }

  ignore("throw useful error message when given an improper dataset") {
    //TODO figure out why this does not throw errors on the build machine
    assertThrows[SparkException] {
      PowerBIWriter.write(df.withColumn("bad", lit("foo")), url)
    }
  }

  test("stream to powerBi", TestBase.BuildServer) {
    bigdf.write.parquet(tmpDir + File.separator + "powerBI.parquet")
    val sdf = session.readStream.schema(df.schema).parquet(tmpDir + File.separator + "powerBI.parquet")
    val q1 = PowerBIWriter.stream(sdf, url).start()
    q1.processAllAvailable()
  }

} 
Example 8
Source File: ServingUDFs.scala, from mmlspark, with MIT License
// Copyright (C) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See LICENSE in project root for information.

package org.apache.spark.sql.execution.streaming

import com.microsoft.ml.spark.io.http.HTTPResponseData
import com.microsoft.ml.spark.io.http.HTTPSchema.{binary_to_response, empty_response, string_to_response}
import org.apache.spark.sql.execution.streaming.continuous.HTTPSourceStateHolder
import org.apache.spark.sql.expressions.UserDefinedFunction
import org.apache.spark.sql.functions.{lit, struct, to_json, udf}
import org.apache.spark.sql.types._
import org.apache.spark.sql.{Column, Row}

import scala.util.Try

object ServingUDFs {

  private def jsonReply(c: Column) = string_to_response(to_json(c))

  def makeReplyUDF(data: Column, dt: DataType, code: Column = lit(200), reason: Column = lit("Success")): Column = {
    dt match {
      case NullType => empty_response(code, reason)
      case StringType => string_to_response(data, code, reason)
      case BinaryType => binary_to_response(data)
      case _: StructType => jsonReply(data)
      case _: MapType => jsonReply(data)
      case at: ArrayType => at.elementType match {
        case _: StructType => jsonReply(data)
        case _: MapType => jsonReply(data)
        case _ => jsonReply(struct(data))
      }
      case _ => jsonReply(struct(data))
    }
  }

  private def sendReplyHelper(mapper: Row => HTTPResponseData)(serviceName: String, reply: Row, id: Row): Boolean = {
    if (Option(reply).isEmpty || Option(id).isEmpty) {
      null.asInstanceOf[Boolean] //scalastyle:ignore null
    } else {
      Try(HTTPSourceStateHolder.getServer(serviceName).replyTo(id.getString(0), id.getString(1), mapper(reply)))
        .toOption.isDefined
    }
  }

  def sendReplyUDF: UserDefinedFunction = {
    val toData = HTTPResponseData.makeFromRowConverter
    udf(sendReplyHelper(toData) _, BooleanType)
  }

} 
Example 9
Source File: DeltaSourceSnapshot.scala, from delta, with Apache License 2.0
package org.apache.spark.sql.delta.files

import org.apache.spark.sql.delta.{DeltaLog, DeltaTableUtils, Snapshot}
import org.apache.spark.sql.delta.sources.IndexedFile
import org.apache.spark.sql.delta.util.StateCache

import org.apache.spark.sql.{Dataset, SparkSession}
import org.apache.spark.sql.catalyst.expressions.Expression
import org.apache.spark.sql.functions.lit


class DeltaSourceSnapshot(
    val spark: SparkSession,
    val snapshot: Snapshot,
    val filters: Seq[Expression])
  extends SnapshotIterator
  with StateCache {

  protected val version = snapshot.version
  protected val path = snapshot.path

  protected lazy val (partitionFilters, dataFilters) = {
    val partitionCols = snapshot.metadata.partitionColumns
    filters.partition { e =>
      DeltaTableUtils.isPredicatePartitionColumnsOnly(e, partitionCols, spark)
    }
  }

  protected def initialFiles: Dataset[IndexedFile] = {
    import spark.implicits._

    cacheDS(
      snapshot.allFiles.sort("modificationTime", "path")
        .rdd.zipWithIndex()
        .toDF("add", "index")
        .withColumn("version", lit(version))
        .withColumn("isLast", lit(false))
        .as[IndexedFile],
      s"Delta Source Snapshot #$version - ${snapshot.redactedPath}").getDS
  }

  override def close(unpersistSnapshot: Boolean): Unit = {
    super.close(unpersistSnapshot)

    if (unpersistSnapshot) {
      snapshot.uncache()
    }
  }
}

trait SnapshotIterator {
  self: DeltaSourceSnapshot =>

  private var result: Iterable[IndexedFile] = _

  def iterator(): Iterator[IndexedFile] = {
    import spark.implicits._
    if (result == null) {
      result = DeltaLog.filterFileList(
        snapshot.metadata.partitionSchema,
        initialFiles.toDF(),
        partitionFilters,
        Seq("add")).as[IndexedFile].collect().toIterable
    }
    // This will always start from the beginning and re-use resources. If any exceptions were to
    // be thrown, the stream would stop, we would call stop on the source, and that will make
    // sure that we clean up resources.
    result.toIterator
  }

  def close(unpersistSnapshot: Boolean): Unit = { }
} 
Example 10
Source File: GpuDSArrayMult.scala, from GPUEnabler, with Apache License 2.0
package com.ibm.gpuenabler

import org.apache.spark.SparkEnv
import org.apache.spark.sql.functions.lit
import com.ibm.gpuenabler.CUDADSImplicits._

object GpuDSArrayMult {

  case class jsonData(name : String, factor: Long, arr: Array[Long])
  case class inputData(name : String, factor: Long, arr: Array[Long], result: Array[Long])
  case class outputData(name: String, result: Array[Long])

  def main(args : Array[String]): Unit = {

    val ss = org.apache.spark.sql.SparkSession.builder.master("local[*]").appName("test").getOrCreate()
    import ss.implicits._
    if(args.length > 0) {
      println("Setting debug Mode" + args(0))
      SparkEnv.get.conf.set("DebugMode", args(0))
    }

    val ptxURL = "/GpuEnablerExamples.ptx"

    // 1. Sample Map Operation - multiple every element in the array by 2
    val mulFunc = DSCUDAFunction("multiplyBy2", Seq("value"), Seq("value"), ptxURL)

    val N: Long = 100000

    val dataPts = ss.range(1, N+1, 1, 10).cache
    val results = dataPts.mapExtFunc(_ * 2, mulFunc).collect()
    println("Count is " + results.length)
    assert(results.length == N)

    val expResults = (1 to N.toInt).map(_ * 2)
    assert(results.sameElements(expResults))

    // 2. Sample Reduce Operation - Sum of all elements in the array
    val dimensions = (size: Long, stage: Int) => stage match {
      case 0 => (64, 256, 1, 1, 1, 1)
      case 1 => (1, 1, 1, 1, 1, 1)
    }

    val gpuParams = gpuParameters(dimensions)

    val sumFunc = DSCUDAFunction(
      "suml",
      Array("value"),
      Array("value"),
      ptxURL,
      Some((size: Long) => 2),
      Some(gpuParams), outputSize=Some(1))

    val results2 = dataPts
          .mapExtFunc(_ * 2, mulFunc)
          .reduceExtFunc(_ + _, sumFunc)

    println("Output is "+ results2)
    println("Expected is " + (N * (N + 1)))
    assert(results2 == N * (N + 1))

    // 3. Dataset - GPU Map - Dataset Operation.
    val ds = ss.read.json("src/main/resources/data.json").as[jsonData]

    val dds = ds.withColumn("result", lit(null: Array[Double] )).as[inputData]
    
    val dsFunc = DSCUDAFunction("arrayTest", Seq("factor", "arr"), Seq("result"), ptxURL)

    val mapDS = dds.mapExtFunc(x => outputData(x.name, x.result),
      dsFunc,
      Array((1 to 10).map(_ * 3).toArray, (1 to 35).map(_.toLong).toArray),
      outputArraySizes = Array(3))

    mapDS.select($"name", $"result").show()

  }
} 
Example 11
Source File: AnyValInstances.scala, from cleanframes, with Apache License 2.0
package cleanframes.instances

import cleanframes.Cleaner
import org.apache.spark.sql.functions.{lower, trim, when, lit}
import org.apache.spark.sql.types._

trait AnyValInstances
  extends IntInstances
    with ByteInstances
    with CharInstances
    with ShortInstances
    with LongInstances
    with FloatInstances
    with DoubleInstances
    with BooleanInstances
    with NumericAnyValInstance

trait IntInstances {
  implicit lazy val integerType: SparkDataType[Int] = new SparkDataType[Int] {
    override def getDataType: DataType = IntegerType
  }
}

trait ByteInstances {
  implicit lazy val byteType: SparkDataType[Byte] = new SparkDataType[Byte] {
    override def getDataType: DataType = ByteType
  }
}

trait CharInstances {
  implicit val stdStringToChar: String => Char = _.charAt(0)
}

trait ShortInstances {
  implicit lazy val shortType: SparkDataType[Short] = new SparkDataType[Short] {
    override def getDataType: DataType = ShortType
  }
}

trait LongInstances {
  implicit lazy val longType: SparkDataType[Long] = new SparkDataType[Long] {
    override def getDataType: DataType = LongType
  }
}

trait FloatInstances {
  implicit lazy val floatType: SparkDataType[Float] = new SparkDataType[Float] {
    override def getDataType: DataType = FloatType
  }
}

trait DoubleInstances {
  implicit lazy val doubleType: SparkDataType[Double] = new SparkDataType[Double] {
    override def getDataType: DataType = DoubleType
  }
}

trait BooleanInstances {
  implicit lazy val booleanCleaner: Cleaner[Option[Boolean]] = {
    Cleaner.materialize { (frame, name, alias) =>
      List(
        when(
          trim(lower(frame.col(name.get))) === "true",
          lit(true) cast BooleanType
        ).otherwise(false) as alias.get
      )
    }
  }
} 
Example 12
Source File: IUberdataForecastUtil.scala, from uberdata, with Apache License 2.0
package eleflow.uberdata

import eleflow.uberdata.core.IUberdataContext
import org.apache.spark.sql.{DataFrame, Row}
import org.apache.spark.sql.functions.lit


object IUberdataForecastUtil {

  lazy val FEATURES_PREDICTION_COL_NAME = "featuresPrediction"
  lazy val FEATURES_COL_NAME = "features"
  lazy val ALGORITHM = "algorithm"
  lazy val PARAMS = "parameters"
  lazy val METRIC_COL_NAME = "metric"

  def convertColumnToLong(row: Row, columnIndex: Int): Row = {
    row.get(columnIndex) match {
      case s: java.sql.Timestamp =>
        val (prior, after) = row.toSeq.splitAt(columnIndex)
        val result = (prior :+ s.getTime) ++ after.tail :+ s
        Row(result: _*)
      case d: Double =>
        val (prior, after) = row.toSeq.splitAt(columnIndex)
        val result = (prior :+ d.toLong) ++ after.tail :+ d
        Row(result: _*)
      case i: Int =>
        val (prior, after) = row.toSeq.splitAt(columnIndex)
        val result = (prior :+ i.toLong) ++ after.tail :+ i
        Row(result: _*)
      case s: Short =>
        val (prior, after) = row.toSeq.splitAt(columnIndex)
        val result = (prior :+ s.toLong) ++ after.tail :+ s
        Row(result: _*)
      case _ => row
    }
  }

  def convertColumnToLongAddAtEnd(row: Row, columnIndex: Int): Row = {
    val result = row.get(columnIndex) match {
      case s: java.sql.Timestamp =>
        val result = row.toSeq :+ s.getTime
        Row(result: _*)
      case d: Double =>
        val result = row.toSeq :+ d.toLong
        Row(result: _*)
      case i: Int =>
        val result = row.toSeq :+ i.toLong
        Row(result: _*)
      case s: Short =>
        val result = row.toSeq :+ s.toLong
        Row(result: _*)
      case _ => row
    }
    result
  }

  def createIdColColumn(dataFrame : DataFrame, context : IUberdataContext) : DataFrame = {
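    // The lit(1L) column below is only used to derive a schema that includes the extra "idCol"
    // (LongType) column; the actual id values come from zipWithIndex.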
    val arrId = dataFrame.rdd.zipWithIndex.map(
      x => x._1.toSeq :+ x._2
    ).map(
      x => Row.fromSeq(x))
    context.sqlContext.createDataFrame(arrId,
      dataFrame.withColumn("idCol", lit(1L : Long)).schema)
  }

} 
Example 13
Source File: SuiteKickoff.scala, from spark-bench, with Apache License 2.0
package com.ibm.sparktc.sparkbench.workload

import com.ibm.sparktc.sparkbench.utils.SparkFuncs._
import org.apache.spark.sql.{DataFrame, Row, SparkSession}
import org.apache.spark.sql.functions.{col, lit}

import scala.collection.parallel.ForkJoinTaskSupport



object SuiteKickoff {
  private val log = org.slf4j.LoggerFactory.getLogger(getClass)

  def run(s: Suite, spark: SparkSession): Unit = {
    verifyOutput(s.benchmarkOutput, s.saveMode, spark)

    // Translate the maps into runnable workloads
    val workloads: Seq[Workload] = s.workloadConfigs.map(ConfigCreator.mapToConf)

    val dataframes: Seq[DataFrame] = (0 until s.repeat).flatMap { i =>
      // This will produce one DataFrame of one row for each workload in the sequence.
      // We're going to produce one coherent DF later from these
      val dfSeqFromOneRun: Seq[DataFrame] = {
        if (s.parallel) runParallel(workloads, spark)
        else runSerially(workloads, spark)
      }
      // Indicate which run of this suite this was.
      dfSeqFromOneRun.map(_.withColumn("run", lit(i)))
    }

    // getting the Spark confs so we can output them in the results.
    val strSparkConfs = spark.conf.getAll

    // Ah, see, here's where we're joining that series of one-row DFs
    val singleDF = joinDataFrames(dataframes, spark)
    s.description.foreach(log.info)
    // And now we're going to curry in the results
    val plusSparkConf = addConfToResults(singleDF, strSparkConfs)
    val plusDescription = addConfToResults(plusSparkConf, Map("description" -> s.description)).coalesce(1)
    // And write to disk. We're done with this suite!
    if(s.benchmarkOutput.nonEmpty) writeToDisk(s.benchmarkOutput.get, s.saveMode, plusDescription, spark)
  }

  private def runParallel(workloadConfigs: Seq[Workload], spark: SparkSession): Seq[DataFrame] = {
    val confSeqPar = workloadConfigs.par
    confSeqPar.tasksupport = new ForkJoinTaskSupport(new scala.concurrent.forkjoin.ForkJoinPool(confSeqPar.size))
    confSeqPar.map(_.run(spark)).seq
  }

  private def runSerially(workloadConfigs: Seq[Workload], spark: SparkSession): Seq[DataFrame] = {
    workloadConfigs.map(_.run(spark))
  }

  private def joinDataFrames(seq: Seq[DataFrame], spark: SparkSession): DataFrame = {
    if (seq.length == 1) seq.head
    else {
      val seqOfColNames = seq.map(_.columns.toSet)
      val allTheColumns = seqOfColNames.foldLeft(Set[String]())(_ ++ _)

      def expr(myCols: Set[String], allCols: Set[String]) = {
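        // Columns this DataFrame lacks are filled with lit(null) so every DataFrame ends up with
        // the same set of columns and the results can be unioned.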
        allCols.toList.map {
          case x if myCols.contains(x) => col(x)
          case x => lit(null).as(x)
        }
      }

      val seqFixedDfs = seq.map(df => df.select(expr(df.columns.toSet, allTheColumns): _*))

      // Folding left across this sequence should be fine because each DF should only have 1 row
      // Nevarr Evarr do this to legit dataframes that are all like big and stuff
      seqFixedDfs.foldLeft(spark.createDataFrame(spark.sparkContext.emptyRDD[Row], seqFixedDfs.head.schema))(_ union _)
    }
  }
} 
Example 14
Source File: StructuredStreamingWordCount.scala, from structured-streaming-application, with Apache License 2.0
package knolx.spark

import com.datastax.driver.core.Cluster
import knolx.Config._
import knolx.KnolXLogger
import knolx.spark.CassandraForeachWriter.writeToCassandra
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.{col, lit, sum}
import org.apache.spark.sql.streaming.OutputMode
import org.apache.spark.sql.types.StringType


object StructuredStreamingWordCount extends App with KnolXLogger {
  val cluster = Cluster.builder.addContactPoints(cassandraHosts).build
  val session = cluster.newSession()

  info("Creating Keyspace and tables in Cassandra...")
  session.execute(s"CREATE KEYSPACE IF NOT EXISTS $keyspace WITH " +
    "replication = {'class':'SimpleStrategy','replication_factor':1};")

  session.execute(s"CREATE TABLE IF NOT EXISTS $keyspace.wordcount ( word text PRIMARY KEY,count int );")

  info("Closing DB connection...")
  session.close()
  session.getCluster.close()

  info("Creating Spark Session")
  val spark = SparkSession.builder().master(sparkMaster).appName(sparkAppName).getOrCreate()
  spark.sparkContext.setLogLevel("WARN")

  info("Creating Streaming DF...")
  val dataStream =
    spark
      .readStream
      .format("kafka")
      .option("kafka.bootstrap.servers", bootstrapServer)
      .option("subscribe", topic)
      .load()

  info("Writing data to Cassandra...")
  val query =
    dataStream
      .select(col("value").cast(StringType).as("word"), lit(1).as("count"))
      .groupBy(col("word"))
      .agg(sum("count").as("count"))
      .writeStream
      .outputMode(OutputMode.Update())
      .foreach(writeToCassandra)
      .option("checkpointLocation", checkPointDir)
      .start()

  info("Waiting for the query to terminate...")
  query.awaitTermination()
  query.stop()
} 
Example 15
Source File: AFTSurvivalRegressionParitySpec.scala, from mleap, with Apache License 2.0
package org.apache.spark.ml.parity.regression

import org.apache.spark.ml.feature.{OneHotEncoderEstimator, StringIndexer, VectorAssembler}
import org.apache.spark.ml.{Pipeline, Transformer}
import org.apache.spark.ml.parity.SparkParityBase
import org.apache.spark.ml.regression.AFTSurvivalRegression
import org.apache.spark.sql._
import org.apache.spark.sql.functions.lit


class AFTSurvivalRegressionParitySpec extends SparkParityBase {
  override val dataset: DataFrame = baseDataset.select("fico_score_group_fnl", "dti", "loan_amount").withColumn("censor", lit(1.0))
  override val sparkTransformer: Transformer = new Pipeline().setStages(Array(new StringIndexer().
    setInputCol("fico_score_group_fnl").
    setOutputCol("fico_index"),
    new OneHotEncoderEstimator().
      setInputCols(Array("fico_index")).
      setOutputCols(Array("fico")),
    new VectorAssembler().
      setInputCols(Array("fico", "dti")).
      setOutputCol("features"),
    new AFTSurvivalRegression().
      setQuantileProbabilities(Array(0.5)).
      setFeaturesCol("features").
      setLabelCol("loan_amount").
      setQuantilesCol("quant").
      setPredictionCol("prediction"))).fit(dataset)

  override val unserializedParams = Set("labelCol", "stringOrderType", "maxIter", "tol")
} 
Example 16
Source File: WholeStageCodegenSparkSubmitSuite.scala, from XSQL, with Apache License 2.0
package org.apache.spark.sql.execution

import org.scalatest.{Assertions, BeforeAndAfterEach, Matchers}
import org.scalatest.concurrent.TimeLimits

import org.apache.spark.{SparkFunSuite, TestUtils}
import org.apache.spark.deploy.SparkSubmitSuite
import org.apache.spark.internal.Logging
import org.apache.spark.sql.{LocalSparkSession, QueryTest, Row, SparkSession}
import org.apache.spark.sql.functions.{array, col, count, lit}
import org.apache.spark.sql.types.IntegerType
import org.apache.spark.unsafe.Platform
import org.apache.spark.util.ResetSystemProperties

// Due to the need to set driver's extraJavaOptions, this test needs to use actual SparkSubmit.
class WholeStageCodegenSparkSubmitSuite extends SparkFunSuite
  with Matchers
  with BeforeAndAfterEach
  with ResetSystemProperties {

  test("Generated code on driver should not embed platform-specific constant") {
    val unusedJar = TestUtils.createJarWithClasses(Seq.empty)

    // HotSpot JVM specific: Set up a local cluster with the driver/executor using mismatched
    // settings of UseCompressedOops JVM option.
    val argsForSparkSubmit = Seq(
      "--class", WholeStageCodegenSparkSubmitSuite.getClass.getName.stripSuffix("$"),
      "--master", "local-cluster[1,1,1024]",
      "--driver-memory", "1g",
      "--conf", "spark.ui.enabled=false",
      "--conf", "spark.master.rest.enabled=false",
      "--conf", "spark.driver.extraJavaOptions=-XX:-UseCompressedOops",
      "--conf", "spark.executor.extraJavaOptions=-XX:+UseCompressedOops",
      unusedJar.toString)
    SparkSubmitSuite.runSparkSubmit(argsForSparkSubmit, "../..")
  }
}

object WholeStageCodegenSparkSubmitSuite extends Assertions with Logging {

  var spark: SparkSession = _

  def main(args: Array[String]): Unit = {
    TestUtils.configTestLog4j("INFO")

    spark = SparkSession.builder().getOrCreate()

    // Make sure the test is run where the driver and the executors uses different object layouts
    val driverArrayHeaderSize = Platform.BYTE_ARRAY_OFFSET
    val executorArrayHeaderSize =
      spark.sparkContext.range(0, 1).map(_ => Platform.BYTE_ARRAY_OFFSET).collect.head.toInt
    assert(driverArrayHeaderSize > executorArrayHeaderSize)

    val df = spark.range(71773).select((col("id") % lit(10)).cast(IntegerType) as "v")
      .groupBy(array(col("v"))).agg(count(col("*")))
    val plan = df.queryExecution.executedPlan
    assert(plan.find(_.isInstanceOf[WholeStageCodegenExec]).isDefined)

    val expectedAnswer =
      Row(Array(0), 7178) ::
        Row(Array(1), 7178) ::
        Row(Array(2), 7178) ::
        Row(Array(3), 7177) ::
        Row(Array(4), 7177) ::
        Row(Array(5), 7177) ::
        Row(Array(6), 7177) ::
        Row(Array(7), 7177) ::
        Row(Array(8), 7177) ::
        Row(Array(9), 7177) :: Nil
    val result = df.collect
    QueryTest.sameRows(result.toSeq, expectedAnswer) match {
      case Some(errMsg) => fail(errMsg)
      case _ =>
    }
  }
} 
Example 17
Source File: ConcatArrowAndExplodeSpec.scala, from flint, with Apache License 2.0
package com.twosigma.flint.timeseries

import java.io.ByteArrayOutputStream
import java.nio.channels.Channels
import java.util.concurrent.TimeUnit

import com.twosigma.flint.arrow.ArrowUtils
import org.apache.arrow.memory.RootAllocator
import org.apache.arrow.vector.ipc.ArrowFileWriter
import org.apache.arrow.vector.{ BigIntVector, Float8Vector, VectorSchemaRoot }
import org.apache.spark.sql.functions.{ array, col, lit, struct }
import org.apache.spark.sql.types._

class ConcatArrowAndExplodeSpec extends TimeSeriesSuite {

  "ConcatArrowAndExplode" should "work" in {

    val batchSize = 10

    var df = spark.range(1000, 2000, 1000).toDF("time")
    val columns = (0 until batchSize).map(v => struct((df("time") + v).as("time"), lit(v.toDouble).as("v")))
    df = df.withColumn("base_rows", array(columns: _*))

    val allocator = new RootAllocator(Long.MaxValue)

    val schema1 = StructType(Seq(StructField("v1", DoubleType)))
    val root1 = VectorSchemaRoot.create(ArrowUtils.toArrowSchema(schema1), allocator)
    val vector1 = root1.getVector("v1").asInstanceOf[Float8Vector]
    vector1.allocateNew()

    for (i <- 0 until batchSize) {
      vector1.set(i, i + 10.0)
    }
    vector1.setValueCount(batchSize)
    val out1 = new ByteArrayOutputStream()
    val arrowWriter1 = new ArrowFileWriter(root1, null, Channels.newChannel(out1))
    arrowWriter1.writeBatch()
    arrowWriter1.close()
    root1.close()
    df = df.withColumn("f1_schema", struct(lit(0.0).as("v1")))
    df = df.withColumn("f1_data", lit(out1.toByteArray))

    val schema2 = StructType(Seq(StructField("v2", DoubleType), StructField("v3", LongType)))
    val root2 = VectorSchemaRoot.create(ArrowUtils.toArrowSchema(schema2), allocator)
    val vector2 = root2.getVector("v2").asInstanceOf[Float8Vector]
    val vector3 = root2.getVector("v3").asInstanceOf[BigIntVector]
    vector2.allocateNew()
    vector3.allocateNew()

    for (i <- 0 until batchSize) {
      vector2.set(i, i + 20.0)
    }
    vector2.setValueCount(batchSize)

    for (i <- 0 until batchSize) {
      vector3.set(i, i + 30L)
    }
    vector3.setValueCount(batchSize)
    val out2 = new ByteArrayOutputStream()
    val arrowWriter2 = new ArrowFileWriter(root2, null, Channels.newChannel(out2))
    arrowWriter2.writeBatch()
    arrowWriter2.close()
    root2.close()
    df = df.withColumn("f2_schema", struct(lit(0.0).as("v2"), lit(0L).as("v3")))
    df = df.withColumn("f2_data", lit(out2.toByteArray))

    var tsrdd = TimeSeriesRDD.fromDF(df)(isSorted = false, timeUnit = TimeUnit.NANOSECONDS)
    tsrdd = tsrdd.concatArrowAndExplode("base_rows", Seq("f1_schema", "f2_schema"), Seq("f1_data", "f2_data"))
    tsrdd.toDF.show()

    var expected = spark.range(1000, 1000 + batchSize).toDF("time")
    expected = expected.withColumn("v", col("time") - 1000.0)
    expected = expected.withColumn("v1", col("time") - 1000 + 10.0)
    expected = expected.withColumn("v2", col("time") - 1000 + 20.0)
    expected = expected.withColumn("v3", col("time") - 1000 + 30)

    val expectedTsrdd = TimeSeriesRDD.fromDF(expected)(isSorted = false, timeUnit = TimeUnit.NANOSECONDS)
    assertEquals(tsrdd, expectedTsrdd)
  }

} 
Example 18
Source File: UserActionsRateSource.scala, from spark-structured-streaming-examples, with Apache License 2.0
package com.phylosoft.spark.learning.sql.streaming.source.rate

import org.apache.spark.sql.functions.{col, lit, pmod, rand}
import org.apache.spark.sql.{DataFrame, SparkSession}


class UserActionsRateSource(val spark: SparkSession,
                            val rowsPerSecond: String = "5",
                            val numPartitions: String = "1")
  extends RateSource {

  def loadUserActions(): DataFrame = {
    readStream()
      .where((rand() * 100).cast("integer") < 30) // 30 out of every 100 user actions
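      // pmod(value, lit(9)) buckets the rate source's monotonically increasing value into nine synthetic user ids (0-8).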
      .select(pmod(col("value"), lit(9)).as("userId"), col("timestamp").as("actionTime"))
  }

} 
Example 19
Source File: TestIndexing.scala, from spark-solr, with Apache License 2.0
package com.lucidworks.spark

import java.util.UUID

import com.lucidworks.spark.util.SolrDataFrameImplicits._
import com.lucidworks.spark.util.{ConfigurationConstants, SolrCloudUtil, SolrQuerySupport, SolrSupport}
import org.apache.spark.sql.functions.{concat, lit}
import org.apache.spark.sql.types.{DataTypes, StructField, StructType}

class TestIndexing extends TestSuiteBuilder {

  test("Load csv file and index to Solr") {
    val collectionName = "testIndexing-" + UUID.randomUUID().toString
    SolrCloudUtil.buildCollection(zkHost, collectionName, null, 2, cloudClient, sc)
    try {
      val csvFileLocation = "src/test/resources/test-data/nyc_yellow_taxi_sample_1k.csv"
      val csvDF = sparkSession.read.format("com.databricks.spark.csv")
        .option("header", "true")
        .option("inferSchema", "true")
        .load(csvFileLocation)
      assert(csvDF.count() == 999)

      val solrOpts = Map("zkhost" -> zkHost, "collection" -> collectionName)
      val newDF = csvDF
        .withColumn("pickup_location", concat(csvDF.col("pickup_latitude"), lit(","), csvDF.col("pickup_longitude")))
        .withColumn("dropoff_location", concat(csvDF.col("dropoff_latitude"), lit(","), csvDF.col("dropoff_longitude")))
      newDF.write.option("zkhost", zkHost).option(ConfigurationConstants.GENERATE_UNIQUE_KEY, "true").solr(collectionName)

      // Explicit commit to make sure all docs are visible
      val solrCloudClient = SolrSupport.getCachedCloudClient(zkHost)
      solrCloudClient.commit(collectionName, true, true)

      val solrDF = sparkSession.read.format("solr").options(solrOpts).load()
      solrDF.printSchema()
      assert (solrDF.count() == 999)
      solrDF.take(10)
    } finally {
      SolrCloudUtil.deleteCollection(collectionName, cluster)
    }
  }

  test("Solr field types config") {
    val collectionName = "testIndexing-" + UUID.randomUUID().toString
    SolrCloudUtil.buildCollection(zkHost, collectionName, null, 2, cloudClient, sc)
    try {
      val csvFileLocation = "src/test/resources/test-data/simple.csv"
      val csvDF = sparkSession.read.format("com.databricks.spark.csv")
          .option("header", "true")
          .option("inferSchema", "true")
          .load(csvFileLocation)
      val solrOpts = Map("zkhost" -> zkHost, "collection" -> collectionName, ConfigurationConstants.SOLR_FIELD_TYPES -> "ntitle:text_en,nrating:string")
      csvDF.write.options(solrOpts).solr(collectionName)

      // Explicit commit to make sure all docs are visible
      val solrCloudClient = SolrSupport.getCachedCloudClient(zkHost)
      solrCloudClient.commit(collectionName, true, true)

      val solrBaseUrl = SolrSupport.getSolrBaseUrl(zkHost)
      val solrUrl = solrBaseUrl + collectionName + "/"

      val fieldTypes = SolrQuerySupport.getFieldTypes(Set.empty, solrUrl, cloudClient, collectionName)
      assert(fieldTypes("nrating").fieldType === "string")
      assert(fieldTypes("ntitle").fieldType === "text_en")
    } finally {
      SolrCloudUtil.deleteCollection(collectionName, cluster)
    }
  }


  test("Field additions") {
    val insertSchema = StructType(Array(
      StructField("index_only_field", DataTypes.StringType, nullable = true),
      StructField("store_only_field", DataTypes.BooleanType, nullable = true),
      StructField("a_s", DataTypes.StringType, nullable = true),
      StructField("s_b", DataTypes.StringType, nullable = true)
    ))
    val collection = "testFieldAdditions" + UUID.randomUUID().toString.replace("-", "_")
    try {
      SolrCloudUtil.buildCollection(zkHost, collection, null, 2, cloudClient, sc)
      val opts = Map("zkhost" -> zkHost, "collection" -> collection)

      val solrRelation = new SolrRelation(opts, sparkSession)
      val fieldsToAdd = SolrRelation.getFieldsToAdd(insertSchema, solrRelation.conf, solrRelation.solrVersion, solrRelation.dynamicSuffixes)
      assert(fieldsToAdd.isEmpty)
    } finally {
      SolrCloudUtil.deleteCollection(collection, cluster)
    }
  }

}