org.apache.spark.sql.functions Scala Examples

The following examples show how to use org.apache.spark.sql.functions. Each example is extracted from an open-source project; the source file, project, and license are noted in the header above it.
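Before the project examples, here is a minimal, self-contained sketch (not taken from any of the projects below; names such as FunctionsQuickStart are illustrative only) of the calls on org.apache.spark.sql.functions that recur throughout them: col, lit, expr, struct, and udf.

import org.apache.spark.sql.{SparkSession, functions}

object FunctionsQuickStart extends App {
  // Local session for demonstration purposes only.
  val spark = SparkSession.builder().master("local[*]").appName("functions-demo").getOrCreate()
  import spark.implicits._

  val df = Seq((1, "a"), (2, "b")).toDF("id", "label")

  // A simple user-defined function wrapped as a Column function.
  val upper = functions.udf((s: String) => s.toUpperCase)

  df.select(
      functions.struct(functions.col("id"), functions.col("label")) as "pair", // nested struct column
      functions.lit("constant") as "tag",                                      // literal column
      functions.expr("id * 2") as "doubled",                                   // SQL expression
      upper(functions.col("label")) as "label_upper"                           // UDF applied to a column
    )
    .show()

  spark.stop()
}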
Example 1
Source File: NestedCaseClassesTest.scala    From cleanframes    with Apache License 2.0
package cleanframes

import com.holdenkarau.spark.testing.DataFrameSuiteBase
import org.apache.spark.sql.functions
import org.scalatest.{FlatSpec, Matchers}


class NestedCaseClassesTest
  extends FlatSpec
    with Matchers
    with DataFrameSuiteBase {

  "Cleaner" should "compile and use a custom transformer for a custom type" in {
    import cleanframes.syntax._ // to use `.clean`
    import spark.implicits._

    // define test data for a dataframe
    val input = Seq(
      // @formatter:off
      ("1",           "1",          "1",           "1",           null),
      (null,          "2",          null,          "2",           "corrupted"),
      ("corrupted",   null,         "corrupted",   null,          "true"),
      ("4",           "corrupted",  "4",           "4",           "false"),
      ("5",           "5",          "5",           "corrupted",   "false"),
      ("6",           "6",          "6",           "6",           "true")
      // @formatter:on
    )
      // give column names that are known to you
      .toDF("col1", "col2", "col3", "col4", "col5")

    // import standard functions for conversions shipped with the library
    import cleanframes.instances.all._

    // !important: you need to give a new structure to allow access to sub-elements
    val renamed = input.select(
      functions.struct(
        input.col("col1") as "a_col_1",
        input.col("col2") as "a_col_2"
      ) as "a",
      functions.struct(
        input.col("col3") as "b_col_1",
        input.col("col4") as "b_col_2"
      ) as "b",
      input.col("col5") as "c"
    )

    val result = renamed.clean[AB]
      .as[AB]
      .collect

    result should {
      contain theSameElementsAs Seq(
        // @formatter:off
        AB( A(Some(1), Some(1)),  B(Some(1),  Some(1.0)), Some(false)),
        AB( A(None,    Some(2)),  B(None,     Some(2.0)), Some(false)),
        AB( A(None,    None),     B(None,     None),      Some(true)),
        AB( A(Some(4), None),     B(Some(4),  Some(4.0)), Some(false)),
        AB( A(Some(5), Some(5)),  B(Some(5),  None),      Some(false)),
        AB( A(Some(6), Some(6)),  B(Some(6),  Some(6.0)), Some(true))
        // @formatter:on
      )
    }
  }

}

case class A(a_col_1: Option[Int], a_col_2: Option[Float])

case class B(b_col_1: Option[Float], b_col_2: Option[Double])

case class AB(a: A, b: B, c: Option[Boolean]) 
Example 2
Source File: Cleaner.scala    From cleanframes    with Apache License 2.0
package cleanframes

import org.apache.spark.sql.{Column, DataFrame, functions}
import shapeless.labelled.FieldType
import shapeless.{::, HList, HNil, LabelledGeneric, Lazy, Witness}

trait Cleaner[A] {
  def clean(frame: DataFrame, name: Option[String], alias: Option[String]): List[Column]
}

object Cleaner {
  def apply[A](frame: DataFrame, name: Option[String], alias: Option[String])(implicit env: Cleaner[A]): DataFrame = {
    frame.select(
      env.clean(frame, name, alias): _*
    )
  }

  def materialize[A](func: (DataFrame, Option[String], Option[String]) => List[Column]): Cleaner[A] = new Cleaner[A] {
    override def clean(frame: DataFrame, name: Option[String], alias: Option[String]): List[Column] = func(frame, name, alias)
  }

  implicit val hnilCleaner: Cleaner[HNil] = materialize((_, _, _) => Nil)

  implicit def genericObjectCleaner[A, H <: HList](implicit
                                                   gen: LabelledGeneric.Aux[A, H],
                                                   hCleaner: Lazy[Cleaner[H]]): Cleaner[A] =
    materialize((frame, name, alias) => {
      val structColumn = functions.struct(
        hCleaner.value.clean(frame, name, alias): _*
      )

      List(
        alias
          .map(structColumn.as)
          .getOrElse(structColumn)
      )
    })

  implicit def hlistObjectCleaner[K <: Symbol, H, T <: HList](implicit
                                                              witness: Witness.Aux[K],
                                                              hCleaner: Lazy[Cleaner[H]],
                                                              tCleaner: Cleaner[T]): Cleaner[FieldType[K, H] :: T] = {
    val fieldName: String = witness.value.name

    materialize { (frame, name, alias) =>

      val columnName = alias match {
        case None |
             Some(`reserved_root_level_alias`) => fieldName
        case Some(alias) => s"$alias.$fieldName"
      }

      val hColumns = hCleaner.value.clean(frame, Some(columnName), alias = Some(fieldName))
      val tColumns = tCleaner.clean(frame, name, alias)
      hColumns ::: tColumns
    }
  }
} 
Example 3
Source File: MergeClauseSuite.scala    From spark-acid    with Apache License 2.0
package com.qubole.spark.hiveacid.merge

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.{AnalysisException, functions}

class MergeClauseSuite extends SparkFunSuite {
  def insertClause(addCondition : Boolean = true): MergeWhenNotInsert = {
    if (addCondition) {
      MergeWhenNotInsert(Some(functions.expr("x > 2").expr),
        Seq(functions.col("x").expr, functions.col("y").expr))
    }
    else {
      MergeWhenNotInsert(None,
        Seq(functions.col("x").expr, functions.col("y").expr))
    }
  }

  def updateClause(addCondition : Boolean = true): MergeWhenUpdateClause = {
    if (addCondition) {
      val updateCondition = Some(functions.expr("a > 2").expr)
      MergeWhenUpdateClause(updateCondition,
        Map("b" -> functions.lit(3).expr), isStar = false)
    } else {
      MergeWhenUpdateClause(None,
        Map("b" -> functions.lit(3).expr), isStar = false)
    }
  }

  def deleteClause(addCondition : Boolean = true): MergeWhenDelete = {
    if (addCondition) {
      MergeWhenDelete(Some(functions.expr("a < 1").expr))
    } else {
      MergeWhenDelete(None)
    }
  }

  test("Validate MergeClauses") {
    val clauses = Seq(insertClause(), updateClause(), deleteClause())
    MergeWhenClause.validate(clauses)
  }

  test("Invalid MergeClause cases") {
    val invalidMerge = "MERGE Validation Error: "

    //empty clauses
    checkInvalidMergeClause(invalidMerge + MergeWhenClause.atleastOneClauseError, Seq())

    // multi update or insert clauses
    val multiUpdateClauses = Seq(updateClause(), updateClause(), insertClause())
    checkInvalidMergeClause(invalidMerge + MergeWhenClause.justOneClausePerTypeError, multiUpdateClauses)

    // multi match clauses with first clause without condition
    val invalidMultiMatch = Seq(updateClause(false), deleteClause())
    checkInvalidMergeClause(invalidMerge + MergeWhenClause.matchClauseConditionError, invalidMultiMatch)

    // invalid Update Clause
    val invalidUpdateClause = MergeWhenUpdateClause(None, Map(), isStar = false)
    val thrown = intercept[IllegalArgumentException] {
      MergeWhenClause.validate(Seq(invalidUpdateClause))
    }
    assert(thrown.getMessage === "UPDATE Clause in MERGE should have one or more SET Values")
  }

  private def checkInvalidMergeClause(invalidMessage: String, multiUpdateClauses: Seq[MergeWhenClause]) = {
    val thrown = intercept[AnalysisException] {
      MergeWhenClause.validate(multiUpdateClauses)
    }
    assert(thrown.message === invalidMessage)
  }
} 
Example 4
Source File: VectorExplodeSpec.scala    From pravda-ml    with Apache License 2.0
package org.apache.spark.ml.odkl

import odkl.analysis.spark.TestEnv
import odkl.analysis.spark.util.SQLOperations
import org.apache.spark.ml.attribute.{Attribute, AttributeGroup, NumericAttribute}
import org.apache.spark.ml.linalg.{Vector, Vectors}
import org.apache.spark.sql.{functions, Row}
import org.apache.spark.sql.types.{StructType, StructField, DoubleType}
import org.scalatest.FlatSpec


class VectorExplodeSpec extends FlatSpec with TestEnv with org.scalatest.Matchers with SQLOperations with WithModels with HasMetricsBlock {

  case class Point(id: Int, vector: Vector, mean: Vector)

  lazy val data = sqlc.createDataFrame(Seq(
    Point(1, Vectors.dense(1.0, 3.0), Vectors.dense(10.0, 30.0)),
    Point(2, Vectors.dense(2.0, 4.0), Vectors.sparse(2, Array(1), Array(20.0)))
  ))

  lazy val withMetadata = data.withColumn(
    "vector",
    data("vector").as("vector", new AttributeGroup("vector", Array[Attribute](
      NumericAttribute.defaultAttr.withName("fixed"),
      NumericAttribute.defaultAttr.withName("var")
    )).toMetadata()))
    .withColumn(
      "mean",
      data("mean").as("mean", new AttributeGroup("vector", Array[Attribute](
        NumericAttribute.defaultAttr.withName("fixed"),
        NumericAttribute.defaultAttr.withName("var")
      )).toMetadata()))

  lazy val explode = new VectorExplode().transform(withMetadata)

  "Explode " should " add data" in {
    val result = explode.orderBy("id", "value").collect()

    result(0).getInt(0) should be(1)
    result(0).getString(1) should be("fixed")
    result(0).getDouble(2) should be(1.0)
    result(0).getDouble(3) should be(10.0)

    result(1).getInt(0) should be(1)
    result(1).getString(1) should be("var")
    result(1).getDouble(2) should be(3.0)
    result(1).getDouble(3) should be(30.0)

    result(2).getInt(0) should be(2)
    result(2).getString(1) should be("fixed")
    result(2).getDouble(2) should be(2.0)
    result(2).isNullAt(3) should be(true)

    result(3).getInt(0) should be(2)
    result(3).getString(1) should be("var")
    result(3).getDouble(2) should be(4.0)
    result(3).getDouble(3) should be(20.0)
  }

  "Explode " should " create schema" in {
    val fields = explode.schema.fields

    fields(0).name should be("id")
    fields(1).name should be("value")
    fields(2).name should be("vector")
    fields(3).name should be("mean")
  }
} 
Example 5
Source File: SimpleReproContext.scala    From pravda-ml    with Apache License 2.0
package org.apache.spark.repro

import org.apache.spark.ml.param.{Param, ParamPair, Params}
import org.apache.spark.ml.util.MLWritable
import org.apache.spark.sql.{DataFrame, Dataset, SparkSession, functions}

class SimpleReproContext private
(spark: SparkSession, basePath: String, tags: Seq[(String,String)]) extends ReproContext {

  def this(basePath: String)(implicit spark: SparkSession) = this(spark, basePath, Seq())

  var accumulatedMetrics : Seq[DataFrame] = Seq()

  var accumulatedParams: Seq[(Seq[String], Iterable[ParamPair[_]])] = Seq()

  override def persistEstimator(estimator: MLWritable): Unit = {
    estimator.save(basePath + "/estimator")
  }

  override def persistModel(model: MLWritable): Unit = {
    model.save(basePath + "/model")
  }

  override def dive(tags: Seq[(String, String)]): ReproContext = new SimpleReproContext(
    spark, basePath, this.tags ++ tags)

  override def logParamPairs(params: Iterable[ParamPair[_]], path: Seq[String]): Unit =
    accumulatedParams = accumulatedParams :+ path -> params

  override def logMetircs(metrics: => DataFrame): Unit = accumulatedMetrics = accumulatedMetrics :+ metrics

  override def start(): Unit = {
    import spark.implicits._
    accumulatedParams.map {
      case (path, params) => params.view
        .map(x => x.param.name -> x.param.asInstanceOf[Param[Any]].jsonEncode(x.value))
        .toSeq
        .toDF("param", "value")
        .withColumn("path", functions.lit(path.mkString("/")))
    }.reduce(_ unionByName _)
      .write.parquet(taggedPrefix + "/params")
  }

  override def finish(): Unit = {
    accumulatedMetrics.reduceOption(_ unionByName _).foreach(
      _.write.parquet(taggedPrefix + "/metrics"))
  }

  private def taggedPrefix: String = {
    tags.map(x => x._1 + "=" + x._2).mkString(basePath + "/", "/", "")
  }
} 
Example 6
Source File: VectorExplode.scala    From pravda-ml    with Apache License 2.0
package org.apache.spark.ml.odkl


import odkl.analysis.spark.util.collection.OpenHashMap
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.attribute.AttributeGroup
import org.apache.spark.ml.param.{Param, ParamMap}
import org.apache.spark.ml.util.{DefaultParamsWritable, Identifiable}
import org.apache.spark.ml.linalg.{Vector, VectorUDT}
import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema
import org.apache.spark.sql.odkl.SparkSqlUtils
import org.apache.spark.sql.types._
import org.apache.spark.sql.{DataFrame, Dataset, Row, functions}


class VectorExplode(override val uid: String) extends
  Transformer with DefaultParamsWritable {

  val valueCol = new Param[String](this, "valueCol", "Name of the column to store value name.")

  def setValueCol(value: String) : this.type = set(valueCol, value)

  setDefault(valueCol -> "value")


  def this() = this(Identifiable.randomUID("vectorExplode"))

  override def transform(dataset: Dataset[_]): DataFrame = {
    val vectors: Array[StructField] = dataset.schema.fields.filter(_.dataType.isInstanceOf[VectorUDT])

    val resultSchema = StructType(Seq(
      StructField($(valueCol), StringType, nullable = false)) ++
      vectors.map(f => StructField(f.name, DoubleType, nullable = true))
    )

    val arraySize = resultSchema.size - 1

    val names: Array[Map[Int, String]] = vectors.map(
      f => {
        AttributeGroup.fromStructField(f).attributes
          .map(attributes => attributes.filter(_.name.isDefined).map(a => a.index.get -> a.name.get).toMap)
          .getOrElse(Map())
      })

    val maxCapacity = names.map(_.size).max

    val explodeVectors : (Row => Array[Row]) = (r: Row ) => {
      val accumulator = new OpenHashMap[String,Array[Double]](maxCapacity)

      for(i <- 0 until r.length) {
        val vector = r.getAs[Vector](i)

        vector.foreachActive((index, value) => {
          val name = names(i).getOrElse(index, s"${vectors(i).name}_$index")

          accumulator.changeValue(
            name,
            Array.tabulate(arraySize) {ind => if(i == ind) value else Double.NaN},
            v => {v(i) = value; v})
        })
      }

      accumulator.map(x => new GenericRowWithSchema(
        (Seq(x._1) ++ x._2.toSeq.map(v => if (v.isNaN) null else v)).toArray,
        resultSchema)).toArray
    }

    val vectorsStruct = functions.struct(vectors.map(f => dataset(f.name)): _*)
    val explodeUDF = SparkSqlUtils.customUDF(explodeVectors, ArrayType(resultSchema), Some(Seq(vectorsStruct.expr.dataType)))
        
    val expression = functions.explode(explodeUDF(vectorsStruct))

    dataset
      .withColumn(uid, expression)
      .select(
        dataset.schema.fields.filterNot(_.dataType.isInstanceOf[VectorUDT]).map(f => dataset(f.name)) ++
          resultSchema.fields.map(f => functions.expr(s"$uid.${f.name}").as(f.name)) :_*)
  }

  override def copy(extra: ParamMap): Transformer = defaultCopy(extra)

  @DeveloperApi
  override def transformSchema(schema: StructType): StructType =
    StructType(schema.fields.map(x =>
      x.dataType match {
        case vector: VectorUDT => StructField(x.name, typeFromVector(x))
        case _ => x
      }
    ))

  def typeFromVector(field: StructField): StructType = {
    val attributes = AttributeGroup.fromStructField(field)
    StructType(attributes.attributes
      .map(_.map(a => a.name.getOrElse(s"_${a.index.get}")))
      .getOrElse(Array.tabulate(attributes.size) { i => s"_$i" })
      .map(name => StructField(name, DoubleType, nullable = false)))
  }
} 
Example 7
Source File: HasConfigurations.scala    From pravda-ml    with Apache License 2.0
package org.apache.spark.ml.odkl.hyperopt
import org.apache.spark.ml.odkl.ModelWithSummary
import org.apache.spark.ml.odkl.ModelWithSummary.Block
import org.apache.spark.ml.param.{Param, Params}
import org.apache.spark.repro.MetricsExtractor
import org.apache.spark.repro.ReproContext.logMetircs
import org.apache.spark.sql.{DataFrame, functions}


trait HasConfigurations extends Params with MetricsExtractor {
  val configurations: Block = Block("configurations")

  val configurationIndexColumn = new Param[String](this, "configurationIndexColumn",
    "Name of the column to store id of config for further analysis.")
  val resultingMetricColumn = new Param[String](this, "resultingMetricColumn",
    "Name of the column to store resulting metrics for further analysis.")
  val errorColumn = new Param[String](this, "errorColumn",
    "Name of the column to store text of the error if occurs.")

  def getConfigurationIndexColumn: String = $(configurationIndexColumn)

  def setConfigurationIndexColumn(value: String): this.type = set(configurationIndexColumn, value)

  def getResultingMetricColumn: String = $(resultingMetricColumn)

  def setResultingMetricColumn(value: String): this.type = set(resultingMetricColumn, value)

  def getErrorColumn: String = $(errorColumn)

  def setErrorColumn(value: String): this.type = set(errorColumn, value)

  setDefault(
    configurationIndexColumn -> "configurationIndex",
    resultingMetricColumn -> "resultingMetric",
    errorColumn -> "error"
  )


  protected def extractImpl(model: ModelWithSummary[_]) : Option[DataFrame] = {
    // Report only resulting metrics to the context assuming that detailed metrics
    // were reported by forks.
    model.summary.blocks.get(configurations).map(data => data.select(
        data(getConfigurationIndexColumn).as("invertedStep"),
        data(getResultingMetricColumn).as("value"),
        functions.lit("target").as("metric")
      )
    )
  }
} 
Example 8
Source File: NameAssigner.scala    From pravda-ml    with Apache License 2.0
package org.apache.spark.ml.odkl

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.attribute.AttributeGroup
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.param.shared.HasInputCols
import org.apache.spark.ml.util.Identifiable
import org.apache.spark.ml.linalg.VectorUDT
import org.apache.spark.sql.{DataFrame, Dataset, functions}
import org.apache.spark.sql.types.{Metadata, StringType, StructField, StructType}


class NameAssigner(override val uid: String) extends Transformer with HasInputCols{

  def setInputCols(column: String*) : this.type = set(inputCols, column.toArray)

  def this() = this(Identifiable.randomUID("NameAssigner"))

  override def transform(dataset: Dataset[_]): DataFrame = {
    $(inputCols)

    $(inputCols).foldLeft(dataset.toDF)((data, column) => {
      val metadata: Metadata = dataset.schema(column).metadata
      val attributes = AttributeGroup.fromStructField(
        StructField(column, new VectorUDT, nullable = false, metadata = metadata))

      val map = attributes.attributes
        .map(arr => arr.filter(_.name.isDefined).map(a => a.index.get -> a.name.get).toMap)
        .getOrElse(Map())

      val func = functions.udf[String, Number](x => if(x == null) {
        null
      } else {
        val i = x.intValue()
        map.getOrElse(i, i.toString)
      })

      data.withColumn(column, func(data(column)).as(column, metadata))
    }).toDF
  }

  override def copy(extra: ParamMap): Transformer = defaultCopy(extra)

  @DeveloperApi
  override def transformSchema(schema: StructType): StructType =
    StructType(schema.map(f => if ($(inputCols).contains(f.name)) {
      StructField(f.name, StringType, f.nullable, f.metadata)
    } else {
      f
    }))
} 
Example 9
Source File: XGBoostUtils.scala    From pravda-ml    with Apache License 2.0
package ml.dmlc.xgboost4j.scala.spark

import ml.dmlc.xgboost4j.scala.Booster
import org.apache.spark.ml.linalg.Vector
import org.apache.spark.ml.param.{BooleanParam, Params}
import org.apache.spark.ml.param.shared.{HasFeaturesCol, HasPredictionCol}
import org.apache.spark.sql.{Dataset, functions}



object XGBoostUtils {
  def getBooster(x: XGBoostClassificationModel): Booster = x._booster

  def getBooster(x: XGBoostRegressionModel): Booster = x._booster
}

trait OkXGBoostParams  extends HasFeaturesCol with HasPredictionCol {
  this: Params =>

  val densifyInput = new BooleanParam(this, "densifyInput",
    "In order to fix the difference between Spark and XGBoost sparsity treatment")
  val predictAsDouble = new BooleanParam(this, "predictAsDouble",
    "Whether to cast XGBoost prediction to double, matching common behavior for other predictors.")
  val addRawTrees = new BooleanParam(this, "addRawTrees",
    "Whether to add raw trees block to model summary.")
  val addSignificance = new BooleanParam(this, "addSignificance",
    "Whether to add feature significance block to model summary.")

  def setAddSignificance(value: Boolean): this.type = set(addSignificance, value)

  def setAddRawTrees(value: Boolean): this.type = set(addRawTrees, value)

  def setDensifyInput(value: Boolean): this.type = set(densifyInput, value)

  def setPredictAsDouble(value: Boolean): this.type = set(predictAsDouble, value)

  protected def densifyIfNeeded(dataset: Dataset[_]) : Dataset[_] = {
    if ($(densifyInput)) {
      val densify = functions.udf((x: Vector) => x.toDense)
      val col = getFeaturesCol
      val metadata = dataset.schema(col).metadata

      dataset.withColumn(
        col,
        densify(dataset(col)).as(col, metadata))
    } else {
      dataset
    }
  }
}

trait OkXGBoostClassifierParams extends XGBoostClassifierParams with OkXGBoostParams

trait OkXGBoostRegressorParams extends XGBoostRegressorParams with OkXGBoostParams 
Example 10
Source File: SparkTableTest.scala    From morpheus    with Apache License 2.0
package org.opencypher.morpheus.impl

import org.apache.spark.sql.types._
import org.apache.spark.sql.{Row, functions}
import org.opencypher.morpheus.impl.table.SparkTable.{DataFrameTable, _}
import org.opencypher.morpheus.testing.MorpheusTestSuite
import org.opencypher.okapi.testing.Bag
import org.opencypher.okapi.testing.Bag._
import org.scalatest.Matchers
import org.scalatestplus.scalacheck.ScalaCheckDrivenPropertyChecks

import scala.collection.JavaConverters._
import scala.collection.mutable.WrappedArray.ofLong

class SparkTableTest extends MorpheusTestSuite with Matchers with ScalaCheckDrivenPropertyChecks {
  import morpheus.sparkSession.sqlContext.implicits._

  it("it should cast integer columns to long") {

    val df = sparkSession.createDataFrame(List(
      Row(1, 2L, Array(42), Array(42), Array(42L), Row(42, 42L))
    ).asJava, StructType(Seq(
      StructField("a", IntegerType, nullable = true),
      StructField("b", LongType, nullable = false),
      StructField("c", ArrayType(IntegerType, containsNull = true), nullable = true),
      StructField("d", ArrayType(IntegerType, containsNull = false), nullable = false),
      StructField("e", ArrayType(LongType, containsNull = false), nullable = false),
      StructField("f", StructType(Seq(
        StructField("foo", IntegerType, true),
        StructField("bar", LongType, false)
      )), nullable = true)
    )))

    val updatedDf = df.castToLong

    updatedDf.schema should equal(StructType(Seq(
      StructField("a", LongType, nullable = true),
      StructField("b", LongType, nullable = false),
      StructField("c", ArrayType(LongType, containsNull = true), nullable = true),
      StructField("d", ArrayType(LongType, containsNull = false), nullable = false),
      StructField("e", ArrayType(LongType, containsNull = false), nullable = false),
      StructField("f", StructType(Seq(
        StructField("foo", LongType, true),
        StructField("bar", LongType, true)
      )), nullable = false)
    )))

    updatedDf.collect().toBag should equal(Bag(
      Row(1L, 2L, new ofLong(Array(42L)), new ofLong(Array(42L)), new ofLong(Array(42L)), Row(42L, 42L))
    ))
  }

  // These tests verify that https://issues.apache.org/jira/browse/SPARK-26572 is still fixed
  describe("distinct workaround") {
    it("detects if the Spark bug is still fixed") {
      val baseTable = Seq(1, 1).toDF("idx")

      // Uses Spark distinct
      val distinctWithId = baseTable.distinct.withColumn("id", functions.monotonically_increasing_id())

      val monotonicallyOnLeft = distinctWithId.join(baseTable, "idx")

      // Bug in Spark: "monotonically_increasing_id" is pushed down when it shouldn't be. Push down only happens when the
      // DF containing the "monotonically_increasing_id" expression is on the left side of the join.
      monotonicallyOnLeft.select("id").collect().map(_.get(0)).distinct.length shouldBe 1
    }

  }

} 
Example 11
Source File: EncodeLongTest.scala    From morpheus    with Apache License 2.0
package org.opencypher.morpheus.impl.encoders

import org.apache.spark.sql.catalyst.expressions.codegen.GenerateMutableProjection
import org.apache.spark.sql.catalyst.expressions.{Alias, GenericInternalRow}
import org.apache.spark.sql.functions
import org.apache.spark.sql.functions.typedLit
import org.opencypher.morpheus.api.value.MorpheusElement._
import org.opencypher.morpheus.impl.expressions.EncodeLong
import org.opencypher.morpheus.impl.expressions.EncodeLong._
import org.opencypher.morpheus.testing.MorpheusTestSuite
import org.scalatestplus.scalacheck.Checkers

class EncodeLongTest extends MorpheusTestSuite with Checkers {

  it("encodes longs correctly") {
    check((l: Long) => {
      val scala = l.encodeAsMorpheusId.toList
      val spark = typedLit[Long](l).encodeLongAsMorpheusId.expr.eval().asInstanceOf[Array[Byte]].toList
      scala === spark
    }, minSuccessful(1000))
  }

  it("encoding/decoding is symmetric") {
    check((l: Long) => {
      val encoded = l.encodeAsMorpheusId
      val decoded = decodeLong(encoded)
      decoded === l
    }, minSuccessful(1000))
  }

  it("scala version encodes longs correctly") {
    0L.encodeAsMorpheusId.toList should equal(List(0.toByte))
  }

  it("spark version encodes longs correctly") {
    typedLit[Long](0L).encodeLongAsMorpheusId.expr.eval().asInstanceOf[Array[Byte]].array.toList should equal(List(0.toByte))
  }

  describe("Spark expression") {

    it("converts longs into byte arrays using expression interpreter") {
      check((l: Long) => {
        val positive = l & Long.MaxValue
        val inputRow = new GenericInternalRow(Array[Any](positive))
        val encodeLong = EncodeLong(functions.lit(positive).expr)
        val interpreted = encodeLong.eval(inputRow).asInstanceOf[Array[Byte]]
        val decoded = decodeLong(interpreted)

        decoded === positive
      }, minSuccessful(1000))
    }

    it("converts longs into byte arrays using expression code gen") {
      check((l: Long) => {
        val positive = l & Long.MaxValue
        val inputRow = new GenericInternalRow(Array[Any](positive))
        val encodeLong = EncodeLong(functions.lit(positive).expr)
        val plan = GenerateMutableProjection.generate(Alias(encodeLong, s"Optimized($encodeLong)")() :: Nil)
        val codegen = plan(inputRow).get(0, encodeLong.dataType).asInstanceOf[Array[Byte]]
        val decoded = decodeLong(codegen)

        decoded === positive
      }, minSuccessful(1000))
    }

  }

} 
Example 12
Source File: DataFrameOutputExample.scala    From morpheus    with Apache License 2.0
// tag::full-example[]
package org.opencypher.morpheus.examples

import org.apache.spark.sql.{DataFrame, functions}
import org.opencypher.morpheus.api.MorpheusSession
import org.opencypher.morpheus.api.MorpheusSession._
import org.opencypher.morpheus.util.App
import org.opencypher.okapi.api.graph.CypherResult


object DataFrameOutputUsingAliasExample extends App {
  // 1) Create Morpheus session and retrieve Spark session
  implicit val morpheus: MorpheusSession = MorpheusSession.local()

  // 2) Load social network data via case class instances
  val socialNetwork = morpheus.readFrom(SocialNetworkData.persons, SocialNetworkData.friendships)

  // 3) Query graph with Cypher
  val results = socialNetwork.cypher(
    """|MATCH (a:Person)-[r:FRIEND_OF]->(b)
       |RETURN a.name AS person1, b.name AS person2, r.since AS friendsSince""".stripMargin)

  // 4) Extract DataFrame representing the query result
  val df: DataFrame = results.records.asDataFrame

  // 5) Select aliased return items from the query result
  val projection: DataFrame = df
    .select("person1", "friendsSince", "person2")
    .orderBy(functions.to_date(df.col("friendsSince"), "dd/mm/yyyy"))

  projection.show()
}
// end::full-example[] 
Example 13
Source File: YelpHelpers.scala    From morpheus    with Apache License 2.0
package org.opencypher.morpheus.integration.yelp

import org.apache.spark.sql.types.{ArrayType, DateType, IntegerType, LongType}
import org.apache.spark.sql.{Column, DataFrame, SparkSession, functions}
import org.opencypher.morpheus.api.io.GraphElement.sourceIdKey
import org.opencypher.morpheus.api.io.Relationship.{sourceEndNodeKey, sourceStartNodeKey}
import org.opencypher.morpheus.impl.table.SparkTable._
import org.opencypher.morpheus.integration.yelp.YelpConstants._

object YelpHelpers {

  case class YelpTables(
    userDf: DataFrame,
    businessDf: DataFrame,
    reviewDf: DataFrame
  )

  def loadYelpTables(inputPath: String)(implicit spark: SparkSession): YelpTables = {
    import spark.implicits._

    log("read business.json", 2)
    val rawBusinessDf = spark.read.json(s"$inputPath/business.json")
    log("read review.json", 2)
    val rawReviewDf = spark.read.json(s"$inputPath/review.json")
    log("read user.json", 2)
    val rawUserDf = spark.read.json(s"$inputPath/user.json")

    val businessDf = rawBusinessDf.select($"business_id".as(sourceIdKey), $"business_id", $"name", $"address", $"city", $"state")
    val reviewDf = rawReviewDf.select($"review_id".as(sourceIdKey), $"user_id".as(sourceStartNodeKey), $"business_id".as(sourceEndNodeKey), $"stars", $"date".cast(DateType))
    val userDf = rawUserDf.select(
      $"user_id".as(sourceIdKey),
      $"name",
      $"yelping_since".cast(DateType),
      functions.split($"elite", ",").cast(ArrayType(LongType)).as("elite"))

    YelpTables(userDf, businessDf, reviewDf)
  }

  def printYelpStats(inputPath: String)(implicit spark: SparkSession): Unit = {
    val rawBusinessDf = spark.read.json(s"$inputPath/business.json")
    val rawReviewDf = spark.read.json(s"$inputPath/review.json")

    import spark.implicits._

    rawBusinessDf.select($"city", $"state").distinct().show()
    rawBusinessDf.withColumnRenamed("business_id", "id")
      .join(rawReviewDf, $"id" === $"business_id")
      .groupBy($"city", $"state")
      .count().as("count")
      .orderBy($"count".desc, $"state".asc)
      .show(100)
  }

  def extractYelpCitySubset(inputPath: String, outputPath: String, city: String)(implicit spark: SparkSession): Unit = {
    import spark.implicits._

    def emailColumn(userId: String): Column = functions.concat($"$userId", functions.lit("@yelp.com"))

    val rawUserDf = spark.read.json(s"$inputPath/user.json")
    val rawReviewDf = spark.read.json(s"$inputPath/review.json")
    val rawBusinessDf = spark.read.json(s"$inputPath/business.json")

    val businessDf = rawBusinessDf.filter($"city" === city)
    val reviewDf = rawReviewDf
      .join(businessDf, Seq("business_id"), "left_semi")
      .withColumn("user_email", emailColumn("user_id"))
      .withColumnRenamed("stars", "stars_tmp")
      .withColumn("stars", $"stars_tmp".cast(IntegerType))
      .drop("stars_tmp")
    val userDf = rawUserDf
      .join(reviewDf, Seq("user_id"), "left_semi")
      .withColumn("email", emailColumn("user_id"))
    val friendDf = userDf
      .select($"email".as("user1_email"), functions.explode(functions.split($"friends", ", ")).as("user2_id"))
      .withColumn("user2_email", emailColumn("user2_id"))
      .select(s"user1_email", s"user2_email")

    businessDf.write.json(s"$outputPath/$cityGraphName/$yelpDB/business.json")
    reviewDf.write.json(s"$outputPath/$cityGraphName/$yelpDB/review.json")
    userDf.write.json(s"$outputPath/$cityGraphName/$yelpDB/user.json")
    friendDf.write.json(s"$outputPath/$cityGraphName/$yelpBookDB/friend.json")
  }

  implicit class DataFrameOps(df: DataFrame) {
    def prependIdColumn(idColumn: String, prefix: String): DataFrame =
      df.transformColumns(idColumn)(column => functions.concat(functions.lit(prefix), column).as(idColumn))
  }
} 
Example 14
Source File: EdgeListDataSource.scala    From morpheus    with Apache License 2.0
package org.opencypher.morpheus.api.io.edgelist

import org.apache.spark.sql.functions
import org.apache.spark.sql.types.{LongType, StructField, StructType}
import org.opencypher.morpheus.api.MorpheusSession
import org.opencypher.morpheus.api.io.GraphElement.sourceIdKey
import org.opencypher.morpheus.api.io.Relationship.{sourceEndNodeKey, sourceStartNodeKey}
import org.opencypher.morpheus.api.io.edgelist.EdgeListDataSource._
import org.opencypher.morpheus.api.io.{MorpheusNodeTable, MorpheusRelationshipTable}
import org.opencypher.morpheus.schema.MorpheusSchema
import org.opencypher.okapi.api.graph.{GraphName, PropertyGraph}
import org.opencypher.okapi.api.io.PropertyGraphDataSource
import org.opencypher.okapi.api.schema.{PropertyGraphSchema, PropertyKeys}
import org.opencypher.okapi.impl.exception.UnsupportedOperationException

object EdgeListDataSource {

  val NODE_LABEL = "V"

  val REL_TYPE = "E"

  val GRAPH_NAME = GraphName("graph")

  val SCHEMA: PropertyGraphSchema = MorpheusSchema.empty
    .withNodePropertyKeys(Set(NODE_LABEL), PropertyKeys.empty)
    .withRelationshipPropertyKeys(REL_TYPE, PropertyKeys.empty)
}


case class EdgeListDataSource(path: String, options: Map[String, String] = Map.empty)(implicit morpheus: MorpheusSession)
  extends PropertyGraphDataSource {

  override def hasGraph(name: GraphName): Boolean = name == GRAPH_NAME

  override def graph(name: GraphName): PropertyGraph = {
    val reader = options.foldLeft(morpheus.sparkSession.read) {
      case (current, (key, value)) => current.option(key, value)
    }

    val rawRels = reader
      .schema(StructType(Seq(
        StructField(sourceStartNodeKey, LongType),
        StructField(sourceEndNodeKey, LongType))))
      .csv(path)
      .withColumn(sourceIdKey, functions.monotonically_increasing_id())
      .select(sourceIdKey, sourceStartNodeKey, sourceEndNodeKey)

    val rawNodes = rawRels
      .select(rawRels.col(sourceStartNodeKey).as(sourceIdKey))
      .union(rawRels.select(rawRels.col(sourceEndNodeKey).as(sourceIdKey)))
      .distinct()

    morpheus.graphs.create(MorpheusNodeTable(Set(NODE_LABEL), rawNodes), MorpheusRelationshipTable(REL_TYPE, rawRels))
  }

  override def schema(name: GraphName): Option[PropertyGraphSchema] = Some(SCHEMA)

  override def store(name: GraphName, graph: PropertyGraph): Unit =
    throw UnsupportedOperationException("Storing an edge list is not supported")

  override def delete(name: GraphName): Unit =
    throw UnsupportedOperationException("Deleting an edge list is not supported")

  override val graphNames: Set[GraphName] = Set(GRAPH_NAME)
} 
Example 15
Source File: ConcatColumnBenchmark.scala    From morpheus    with Apache License 2.0
package org.opencypher.morpheus.jmh

import org.apache.spark.sql.types.StringType
import org.apache.spark.sql.{DataFrame, SparkSession, functions}
import org.apache.spark.storage.StorageLevel
import org.opencypher.morpheus.impl.MorpheusFunctions
import org.opencypher.morpheus.impl.expressions.EncodeLong._
import org.openjdk.jmh.annotations._

@State(Scope.Benchmark)
@BenchmarkMode(Array(Mode.AverageTime))
class ConcatColumnBenchmark {

  implicit var sparkSession: SparkSession = _

  var df: DataFrame = _

  @Setup
  def setUp(): Unit = {
    sparkSession = SparkSession.builder().master("local[*]").getOrCreate()
    val fromRow = 100000000L
    val numRows = 1000000
    val rangeDf = sparkSession.range(fromRow, fromRow + numRows).toDF("i")
    val indexCol = rangeDf.col("i")
    df = rangeDf
      .withColumn("s", indexCol.cast(StringType))
      .withColumn("b", indexCol.encodeLongAsMorpheusId)
      .partitionAndCache
  }

  @Benchmark
  def concatWs(): Int = {
    val result = df.withColumn("c", functions.concat_ws("|", df.col("i"), df.col("s"), df.col("b")))
    result.select("c").collect().length
  }

  @Benchmark
  def serialize(): Int = {
    val result = df.withColumn("c", MorpheusFunctions.serialize(df.col("i"), df.col("s"), df.col("b")))
    result.select("c").collect().length
  }

  implicit class DataFrameSetup(df: DataFrame) {

    def partitionAndCache: DataFrame = {
      val cached = df.repartition(10).persist(StorageLevel.MEMORY_ONLY)
      cached.count()
      cached
    }
  }

} 
Example 16
Source File: DeleteScalaSuite.scala    From delta    with Apache License 2.0
package org.apache.spark.sql.delta

import org.apache.spark.sql.delta.test.DeltaSQLCommandTest
import io.delta.tables.{DeltaTable, DeltaTableTestUtils}

import org.apache.spark.sql.{functions, Row}

class DeleteScalaSuite extends DeleteSuiteBase with DeltaSQLCommandTest {

  import testImplicits._

  test("delete usage test - without condition") {
    append(Seq((1, 10), (2, 20), (3, 30), (4, 40)).toDF("key", "value"))
    val table = io.delta.tables.DeltaTable.forPath(tempPath)
    table.delete()
    checkAnswer(readDeltaTable(tempPath), Nil)
  }

  test("delete usage test - with condition") {
    append(Seq((1, 10), (2, 20), (3, 30), (4, 40)).toDF("key", "value"))
    val table = io.delta.tables.DeltaTable.forPath(tempPath)
    table.delete("key = 1 or key = 2")
    checkAnswer(readDeltaTable(tempPath), Row(3, 30) :: Row(4, 40) :: Nil)
  }

  test("delete usage test - with Column condition") {
    append(Seq((1, 10), (2, 20), (3, 30), (4, 40)).toDF("key", "value"))
    val table = io.delta.tables.DeltaTable.forPath(tempPath)
    table.delete(functions.expr("key = 1 or key = 2"))
    checkAnswer(readDeltaTable(tempPath), Row(3, 30) :: Row(4, 40) :: Nil)
  }

  override protected def executeDelete(target: String, where: String = null): Unit = {

    def parse(tableNameWithAlias: String): (String, Option[String]) = {
      tableNameWithAlias.split(" ").toList match {
        case tableName :: Nil => tableName -> None // just table name
        case tableName :: alias :: Nil => // either "tableName alias" or a table name that itself contains a space
          val ordinary = (('a' to 'z') ++ ('A' to 'Z') ++ ('0' to '9')).toSet
          if (!alias.forall(ordinary.contains(_))) {
            (tableName + " " + alias) -> None
          } else {
            tableName -> Some(alias)
          }
        case _ =>
          fail(s"Could not build parse '$tableNameWithAlias' for table and optional alias")
      }
    }

    val deltaTable: DeltaTable = {
      val (tableNameOrPath, optionalAlias) = parse(target)
      val isPath: Boolean = tableNameOrPath.startsWith("delta.")
      val table = if (isPath) {
        val path = tableNameOrPath.stripPrefix("delta.`").stripSuffix("`")
        io.delta.tables.DeltaTable.forPath(spark, path)
      } else {
        DeltaTableTestUtils.createTable(spark.table(tableNameOrPath),
          DeltaLog.forTable(spark, tableNameOrPath))
      }
      optionalAlias.map(table.as(_)).getOrElse(table)
    }

    if (where != null) {
      deltaTable.delete(where)
    } else {
      deltaTable.delete()
    }
  }
} 
Example 17
Source File: DeltaTableOperations.scala    From delta    with Apache License 2.0
package io.delta.tables.execution

import scala.collection.Map

import org.apache.spark.sql.delta.{DeltaErrors, DeltaHistoryManager, DeltaLog, PreprocessTableUpdate}
import org.apache.spark.sql.delta.commands.{DeleteCommand, DeltaGenerateCommand, VacuumCommand}
import org.apache.spark.sql.delta.util.AnalysisHelper
import io.delta.tables.DeltaTable

import org.apache.spark.sql.{functions, Column, DataFrame, Dataset}
import org.apache.spark.sql.catalyst.TableIdentifier
import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute
import org.apache.spark.sql.catalyst.expressions.{Expression, SubqueryExpression}
import org.apache.spark.sql.catalyst.plans.logical._


trait DeltaTableOperations extends AnalysisHelper { self: DeltaTable =>

  protected def executeDelete(condition: Option[Expression]): Unit = improveUnsupportedOpError {
    val delete = DeleteFromTable(self.toDF.queryExecution.analyzed, condition)
    toDataset(sparkSession, delete)
  }

  protected def executeHistory(deltaLog: DeltaLog, limit: Option[Int]): DataFrame = {
    val history = new DeltaHistoryManager(deltaLog)
    val spark = self.toDF.sparkSession
    spark.createDataFrame(history.getHistory(limit))
  }

  protected def executeGenerate(tblIdentifier: String, mode: String): Unit = {
    val tableId: TableIdentifier = sparkSession
      .sessionState
      .sqlParser
      .parseTableIdentifier(tblIdentifier)
    val generate = DeltaGenerateCommand(mode, tableId)
    generate.run(sparkSession)
  }

  protected def executeUpdate(
      set: Map[String, Column],
      condition: Option[Column]): Unit = improveUnsupportedOpError {
    val assignments = set.map { case (targetColName, column) =>
      Assignment(UnresolvedAttribute.quotedString(targetColName), column.expr)
    }.toSeq
    val update = UpdateTable(self.toDF.queryExecution.analyzed, assignments, condition.map(_.expr))
    toDataset(sparkSession, update)
  }

  protected def executeVacuum(
      deltaLog: DeltaLog,
      retentionHours: Option[Double]): DataFrame = {
    VacuumCommand.gc(sparkSession, deltaLog, false, retentionHours)
    sparkSession.emptyDataFrame
  }

  protected def toStrColumnMap(map: Map[String, String]): Map[String, Column] = {
    map.toSeq.map { case (k, v) => k -> functions.expr(v) }.toMap
  }

  protected def sparkSession = self.toDF.sparkSession
} 
Example 18
Source File: DataFrameUtils.scala    From m3d-engine    with Apache License 2.0
package com.adidas.analytics.util

import org.apache.spark.sql.types._
import org.apache.spark.sql.{DataFrame, Row, functions}
import org.slf4j.{Logger, LoggerFactory}


object DataFrameUtils {

  private val logger: Logger = LoggerFactory.getLogger(getClass)

  type FilterFunction = Row => Boolean

  type PartitionCriteria = Seq[(String, String)]

  def mapPartitionsToDirectories(partitionCriteria: PartitionCriteria): Seq[String] = {
    partitionCriteria.map {
      case (columnName, columnValue) => s"$columnName=$columnValue"
    }
  }

  def buildPartitionsCriteriaMatcherFunc(multiplePartitionsCriteria: Seq[PartitionCriteria], schema: StructType): FilterFunction = {
    val targetPartitions = multiplePartitionsCriteria.flatten.map(_._1).toSet
    val fieldNameToMatchFunctionMapping = schema.fields.filter {
      case StructField(name, _, _, _) => targetPartitions.contains(name)
    }.map {
      case StructField(name, _: ByteType, _, _)    => name -> ((r: Row, value: String) => r.getAs[Byte](name)    == value.toByte)
      case StructField(name, _: ShortType, _, _)   => name -> ((r: Row, value: String) => r.getAs[Short](name)   == value.toShort)
      case StructField(name, _: IntegerType, _, _) => name -> ((r: Row, value: String) => r.getAs[Int](name)     == value.toInt)
      case StructField(name, _: LongType, _, _)    => name -> ((r: Row, value: String) => r.getAs[Long](name)    == value.toLong)
      case StructField(name, _: FloatType, _, _)   => name -> ((r: Row, value: String) => r.getAs[Float](name)   == value.toFloat)
      case StructField(name, _: DoubleType, _, _)  => name -> ((r: Row, value: String) => r.getAs[Double](name)  == value.toDouble)
      case StructField(name, _: BooleanType, _, _) => name -> ((r: Row, value: String) => r.getAs[Boolean](name) == value.toBoolean)
      case StructField(name, _: StringType, _, _)  => name -> ((r: Row, value: String) => r.getAs[String](name)  == value)
    }.toMap

    def convertPartitionCriteriaToFilterFunctions(partitionCriteria: PartitionCriteria): Seq[FilterFunction] = partitionCriteria.map {
      case (name, value) => (row: Row) => fieldNameToMatchFunctionMapping(name)(row, value)
    }

    def joinSinglePartitionFilterFunctionsWithAnd(partitionFilterFunctions: Seq[FilterFunction]): FilterFunction =
      partitionFilterFunctions
        .reduceOption((predicate1, predicate2) => (row: Row) => predicate1(row) && predicate2(row))
        .getOrElse((_: Row) => false)

    multiplePartitionsCriteria
      .map(convertPartitionCriteriaToFilterFunctions)
      .map(joinSinglePartitionFilterFunctionsWithAnd)
      .reduceOption((predicate1, predicate2) => (row: Row) => predicate1(row) || predicate2(row))
      .getOrElse((_: Row) => false)
  }


  implicit class DataFrameHelper(df: DataFrame) {

    def collectPartitions(targetPartitions: Seq[String]): Seq[PartitionCriteria] = {
      logger.info(s"Collecting unique partitions for partitions columns (${targetPartitions.mkString(", ")})")
      val partitions = df.selectExpr(targetPartitions: _*).distinct().collect()

      partitions.map { row =>
        targetPartitions.map { columnName =>
          Option(row.getAs[Any](columnName)) match {
            case Some(columnValue) => columnName -> columnValue.toString
            case None => throw new RuntimeException(s"Partition column '$columnName' contains null value")
          }
        }
      }
    }

    def addMissingColumns(targetSchema: StructType): DataFrame = {
      val dataFieldsSet = df.schema.fieldNames.toSet
      val selectColumns = targetSchema.fields.map { field =>
        if (dataFieldsSet.contains(field.name)) {
          functions.col(field.name)
        } else {
          functions.lit(null).cast(field.dataType).as(field.name)
        }
      }
      df.select(selectColumns: _*)
    }

    def isEmpty: Boolean = df.head(1).isEmpty

    def nonEmpty: Boolean = df.head(1).nonEmpty
  }
}