org.apache.spark.sql.Encoder Scala Examples

The following examples show how to use org.apache.spark.sql.Encoder. They are taken from open-source projects; the source file, originating project, and license are listed above each example.
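Before the examples, here is a minimal sketch of how an Encoder is typically obtained and used with a Dataset. The Person case class and the EncoderBasics object are purely illustrative and do not come from any of the projects below.

import org.apache.spark.sql.{Encoder, Encoders, SparkSession}

// Illustrative record type, not taken from any of the projects below.
case class Person(name: String, age: Int)

object EncoderBasics {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("EncoderBasics").getOrCreate()
    import spark.implicits._ // implicit Encoders for case classes, tuples and primitives

    // An Encoder can be summoned implicitly or built explicitly via the Encoders factory.
    val summoned: Encoder[Person] = implicitly[Encoder[Person]]
    val product: Encoder[Person]  = Encoders.product[Person]

    // Every Dataset carries an Encoder for its element type.
    val people = spark.createDataset(Seq(Person("Ada", 36), Person("Grace", 45)))(product)
    people.show()

    spark.stop()
  }
}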
Example 1
Source File: RichSparkFunctions.scala    From lighthouse   with Apache License 2.0
package be.dataminded.lighthouse.pipeline

import com.typesafe.scalalogging.LazyLogging
import org.apache.spark.sql.{Dataset, Encoder}
import org.apache.spark.storage.StorageLevel

import scala.reflect.ClassTag

object RichSparkFunctions extends LazyLogging {

  class DatasetSparkFunction[A <: Dataset[_]: ClassTag](function: SparkFunction[A]) {

    
    def printSchema(): SparkFunction[A] =
      function.map { dataSet =>
        dataSet.printSchema()
        dataSet
      }

    def as[T: Encoder]: SparkFunction[Dataset[T]] = function.map(_.as[T])

    def cache(storageLevel: StorageLevel = StorageLevel.MEMORY_ONLY): SparkFunction[A] =
      function.map {
        _.persist(storageLevel)
      }

    def dropCache(): SparkFunction[A] =
      function.map {
        _.unpersist()
      }

    def write(sink: Sink, sinks: Sink*): SparkFunction[A] = {
      if (sinks.isEmpty) function.map { data =>
        sink.write(data); data
      }
      else (sink +: sinks).foldLeft(function.cache())((f, sink) => f.write(sink))
    }

    def count(): SparkFunction[Long] = {
      function.map { dataSet =>
        val n = dataSet.count()
        logger.debug(s"The data set produced $n rows")
        n
      }
    }
  }
} 
Example 2
Source File: ReduceAggregator.scala    From multi-tenancy-spark   with Apache License 2.0
package org.apache.spark.sql.expressions

import org.apache.spark.sql.Encoder
import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder


private[sql] class ReduceAggregator[T: Encoder](func: (T, T) => T)
  extends Aggregator[T, (Boolean, T), T] {

  private val encoder = implicitly[Encoder[T]]

  override def zero: (Boolean, T) = (false, null.asInstanceOf[T])

  override def bufferEncoder: Encoder[(Boolean, T)] =
    ExpressionEncoder.tuple(
      ExpressionEncoder[Boolean](),
      encoder.asInstanceOf[ExpressionEncoder[T]])

  override def outputEncoder: Encoder[T] = encoder

  override def reduce(b: (Boolean, T), a: T): (Boolean, T) = {
    if (b._1) {
      (true, func(b._2, a))
    } else {
      (true, a)
    }
  }

  override def merge(b1: (Boolean, T), b2: (Boolean, T)): (Boolean, T) = {
    if (!b1._1) {
      b2
    } else if (!b2._1) {
      b1
    } else {
      (true, func(b1._2, b2._2))
    }
  }

  override def finish(reduction: (Boolean, T)): T = {
    if (!reduction._1) {
      throw new IllegalStateException("ReduceAggregator requires at least one input row")
    }
    reduction._2
  }
} 
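ReduceAggregator is private[sql], so it is not constructed directly in user code; it is the machinery behind reduceGroups on KeyValueGroupedDataset. A minimal sketch of that user-facing path (the word-count data is illustrative):

import org.apache.spark.sql.SparkSession

object ReduceGroupsSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("ReduceGroupsSketch").getOrCreate()
    import spark.implicits._

    val words = spark.createDataset(Seq("a", "b", "a", "c", "a"))

    // reduceGroups folds all values of a group with the given function; internally Spark
    // expresses it through a ReduceAggregator over the grouped values.
    val counts = words
      .map(w => (w, 1L))
      .groupByKey(_._1)
      .reduceGroups((x, y) => (x._1, x._2 + y._2))

    counts.show() // one row per key holding the reduced (word, count) pair

    spark.stop()
  }
}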
Example 3
Source File: Aggregator.scala    From multi-tenancy-spark   with Apache License 2.0
package org.apache.spark.sql.expressions

import org.apache.spark.annotation.{Experimental, InterfaceStability}
import org.apache.spark.sql.{Dataset, Encoder, TypedColumn}
import org.apache.spark.sql.catalyst.encoders.encoderFor
import org.apache.spark.sql.catalyst.expressions.aggregate.{AggregateExpression, Complete}
import org.apache.spark.sql.execution.aggregate.TypedAggregateExpression


@Experimental
@InterfaceStability.Evolving
abstract class Aggregator[-IN, BUF, OUT] extends Serializable {

  // The Aggregator contract: an empty buffer, a per-element update, a buffer merge,
  // a finishing step, and encoders for the intermediate buffer and the output type.
  def zero: BUF
  def reduce(b: BUF, a: IN): BUF
  def merge(b1: BUF, b2: BUF): BUF
  def finish(reduction: BUF): OUT
  def bufferEncoder: Encoder[BUF]
  def outputEncoder: Encoder[OUT]

  def toColumn: TypedColumn[IN, OUT] = {
    implicit val bEncoder = bufferEncoder
    implicit val cEncoder = outputEncoder

    val expr =
      AggregateExpression(
        TypedAggregateExpression(this),
        Complete,
        isDistinct = false)

    new TypedColumn[IN, OUT](expr, encoderFor[OUT])
  }
} 
Example 4
Source File: typedaggregators.scala    From multi-tenancy-spark   with Apache License 2.0
package org.apache.spark.sql.execution.aggregate

import org.apache.spark.api.java.function.MapFunction
import org.apache.spark.sql.{Encoder, TypedColumn}
import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
import org.apache.spark.sql.expressions.Aggregator

////////////////////////////////////////////////////////////////////////////////////////////////////
// This file defines internal implementations for aggregators.
////////////////////////////////////////////////////////////////////////////////////////////////////


class TypedSumDouble[IN](val f: IN => Double) extends Aggregator[IN, Double, Double] {
  override def zero: Double = 0.0
  override def reduce(b: Double, a: IN): Double = b + f(a)
  override def merge(b1: Double, b2: Double): Double = b1 + b2
  override def finish(reduction: Double): Double = reduction

  override def bufferEncoder: Encoder[Double] = ExpressionEncoder[Double]()
  override def outputEncoder: Encoder[Double] = ExpressionEncoder[Double]()

  // Java api support
  def this(f: MapFunction[IN, java.lang.Double]) = this(x => f.call(x).asInstanceOf[Double])

  def toColumnJava: TypedColumn[IN, java.lang.Double] = {
    toColumn.asInstanceOf[TypedColumn[IN, java.lang.Double]]
  }
}


class TypedSumLong[IN](val f: IN => Long) extends Aggregator[IN, Long, Long] {
  override def zero: Long = 0L
  override def reduce(b: Long, a: IN): Long = b + f(a)
  override def merge(b1: Long, b2: Long): Long = b1 + b2
  override def finish(reduction: Long): Long = reduction

  override def bufferEncoder: Encoder[Long] = ExpressionEncoder[Long]()
  override def outputEncoder: Encoder[Long] = ExpressionEncoder[Long]()

  // Java api support
  def this(f: MapFunction[IN, java.lang.Long]) = this(x => f.call(x).asInstanceOf[Long])

  def toColumnJava: TypedColumn[IN, java.lang.Long] = {
    toColumn.asInstanceOf[TypedColumn[IN, java.lang.Long]]
  }
}


class TypedCount[IN](val f: IN => Any) extends Aggregator[IN, Long, Long] {
  override def zero: Long = 0
  override def reduce(b: Long, a: IN): Long = {
    if (f(a) == null) b else b + 1
  }
  override def merge(b1: Long, b2: Long): Long = b1 + b2
  override def finish(reduction: Long): Long = reduction

  override def bufferEncoder: Encoder[Long] = ExpressionEncoder[Long]()
  override def outputEncoder: Encoder[Long] = ExpressionEncoder[Long]()

  // Java api support
  def this(f: MapFunction[IN, Object]) = this(x => f.call(x))
  def toColumnJava: TypedColumn[IN, java.lang.Long] = {
    toColumn.asInstanceOf[TypedColumn[IN, java.lang.Long]]
  }
}


class TypedAverage[IN](val f: IN => Double) extends Aggregator[IN, (Double, Long), Double] {
  override def zero: (Double, Long) = (0.0, 0L)
  override def reduce(b: (Double, Long), a: IN): (Double, Long) = (f(a) + b._1, 1 + b._2)
  override def finish(reduction: (Double, Long)): Double = reduction._1 / reduction._2
  override def merge(b1: (Double, Long), b2: (Double, Long)): (Double, Long) = {
    (b1._1 + b2._1, b1._2 + b2._2)
  }

  override def bufferEncoder: Encoder[(Double, Long)] = ExpressionEncoder[(Double, Long)]()
  override def outputEncoder: Encoder[Double] = ExpressionEncoder[Double]()

  // Java api support
  def this(f: MapFunction[IN, java.lang.Double]) = this(x => f.call(x).asInstanceOf[Double])
  def toColumnJava: TypedColumn[IN, java.lang.Double] = {
    toColumn.asInstanceOf[TypedColumn[IN, java.lang.Double]]
  }
} 
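These classes are the machinery behind the typed aggregate helpers in org.apache.spark.sql.expressions.scalalang.typed (available in Spark 2.x). A hedged usage sketch; the Sale case class is illustrative:

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.expressions.scalalang.typed

// Illustrative record type.
case class Sale(shop: String, amount: Double)

object TypedAggSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("TypedAggSketch").getOrCreate()
    import spark.implicits._

    val sales = spark.createDataset(Seq(Sale("a", 10.0), Sale("a", 5.0), Sale("b", 7.5)))

    // typed.sum, typed.avg and typed.count wrap TypedSumDouble, TypedAverage and TypedCount.
    val perShop = sales
      .groupByKey(_.shop)
      .agg(typed.sum[Sale](_.amount), typed.avg[Sale](_.amount), typed.count[Sale](_.shop))

    perShop.show() // (shop, sum, average, count) per key

    spark.stop()
  }
}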
Example 5
Source File: TopByKeyAggregator.scala    From Spark-2.3.1   with Apache License 2.0
package org.apache.spark.ml.recommendation

import scala.language.implicitConversions
import scala.reflect.runtime.universe.TypeTag

import org.apache.spark.sql.{Encoder, Encoders}
import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
import org.apache.spark.sql.expressions.Aggregator
import org.apache.spark.util.BoundedPriorityQueue



private[recommendation] class TopByKeyAggregator[K1: TypeTag, K2: TypeTag, V: TypeTag]
  (num: Int, ord: Ordering[(K2, V)])
  extends Aggregator[(K1, K2, V), BoundedPriorityQueue[(K2, V)], Array[(K2, V)]] {

  override def zero: BoundedPriorityQueue[(K2, V)] = new BoundedPriorityQueue[(K2, V)](num)(ord)

  override def reduce(
      q: BoundedPriorityQueue[(K2, V)],
      a: (K1, K2, V)): BoundedPriorityQueue[(K2, V)] = {
    q += {(a._2, a._3)}
  }

  override def merge(
      q1: BoundedPriorityQueue[(K2, V)],
      q2: BoundedPriorityQueue[(K2, V)]): BoundedPriorityQueue[(K2, V)] = {
    q1 ++= q2
  }

  override def finish(r: BoundedPriorityQueue[(K2, V)]): Array[(K2, V)] = {
    r.toArray.sorted(ord.reverse)
  }

  override def bufferEncoder: Encoder[BoundedPriorityQueue[(K2, V)]] = {
    Encoders.kryo[BoundedPriorityQueue[(K2, V)]]
  }

  override def outputEncoder: Encoder[Array[(K2, V)]] = ExpressionEncoder[Array[(K2, V)]]()
} 
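TopByKeyAggregator is package-private to org.apache.spark.ml.recommendation and is used by ALSModel to keep the top-k (item, rating) pairs per user. The sketch below is hedged: the data is illustrative, and the package declaration is only there because the class is not visible from outside that package.

package org.apache.spark.ml.recommendation // only for visibility of the package-private class

import org.apache.spark.sql.SparkSession

object TopByKeySketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("TopByKeySketch").getOrCreate()
    import spark.implicits._

    // (userId, itemId, rating) triples.
    val ratings = spark.createDataset(Seq((1, 10, 0.9f), (1, 11, 0.7f), (2, 10, 0.4f)))

    // Keep the single highest-rated item per user; the ordering compares the rating component.
    val topK = new TopByKeyAggregator[Int, Int, Float](1, Ordering.by[(Int, Float), Float](_._2))
    ratings.groupByKey(_._1).agg(topK.toColumn).show(truncate = false)

    spark.stop()
  }
}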
Example 6
Source File: ReduceAggregator.scala    From Spark-2.3.1   with Apache License 2.0
package org.apache.spark.sql.expressions

import org.apache.spark.sql.Encoder
import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder


private[sql] class ReduceAggregator[T: Encoder](func: (T, T) => T)
  extends Aggregator[T, (Boolean, T), T] {

  @transient private val encoder = implicitly[Encoder[T]]

  override def zero: (Boolean, T) = (false, null.asInstanceOf[T])

  override def bufferEncoder: Encoder[(Boolean, T)] =
    ExpressionEncoder.tuple(
      ExpressionEncoder[Boolean](),
      encoder.asInstanceOf[ExpressionEncoder[T]])

  override def outputEncoder: Encoder[T] = encoder

  override def reduce(b: (Boolean, T), a: T): (Boolean, T) = {
    if (b._1) {
      (true, func(b._2, a))
    } else {
      (true, a)
    }
  }

  override def merge(b1: (Boolean, T), b2: (Boolean, T)): (Boolean, T) = {
    if (!b1._1) {
      b2
    } else if (!b2._1) {
      b1
    } else {
      (true, func(b1._2, b2._2))
    }
  }

  override def finish(reduction: (Boolean, T)): T = {
    if (!reduction._1) {
      throw new IllegalStateException("ReduceAggregator requires at least one input row")
    }
    reduction._2
  }
} 
Example 7
Source File: Aggregator.scala    From Spark-2.3.1   with Apache License 2.0
package org.apache.spark.sql.expressions

import org.apache.spark.annotation.{Experimental, InterfaceStability}
import org.apache.spark.sql.{Dataset, Encoder, TypedColumn}
import org.apache.spark.sql.catalyst.encoders.encoderFor
import org.apache.spark.sql.catalyst.expressions.aggregate.{AggregateExpression, Complete}
import org.apache.spark.sql.execution.aggregate.TypedAggregateExpression


@Experimental
@InterfaceStability.Evolving
abstract class Aggregator[-IN, BUF, OUT] extends Serializable {

  // The Aggregator contract: an empty buffer, a per-element update, a buffer merge,
  // a finishing step, and encoders for the intermediate buffer and the output type.
  def zero: BUF
  def reduce(b: BUF, a: IN): BUF
  def merge(b1: BUF, b2: BUF): BUF
  def finish(reduction: BUF): OUT
  def bufferEncoder: Encoder[BUF]
  def outputEncoder: Encoder[OUT]

  def toColumn: TypedColumn[IN, OUT] = {
    implicit val bEncoder = bufferEncoder
    implicit val cEncoder = outputEncoder

    val expr =
      AggregateExpression(
        TypedAggregateExpression(this),
        Complete,
        isDistinct = false)

    new TypedColumn[IN, OUT](expr, encoderFor[OUT])
  }
} 
Example 8
Source File: typedaggregators.scala    From Spark-2.3.1   with Apache License 2.0
package org.apache.spark.sql.execution.aggregate

import org.apache.spark.api.java.function.MapFunction
import org.apache.spark.sql.{Encoder, TypedColumn}
import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
import org.apache.spark.sql.expressions.Aggregator

////////////////////////////////////////////////////////////////////////////////////////////////////
// This file defines internal implementations for aggregators.
////////////////////////////////////////////////////////////////////////////////////////////////////


class TypedSumDouble[IN](val f: IN => Double) extends Aggregator[IN, Double, Double] {
  override def zero: Double = 0.0
  override def reduce(b: Double, a: IN): Double = b + f(a)
  override def merge(b1: Double, b2: Double): Double = b1 + b2
  override def finish(reduction: Double): Double = reduction

  override def bufferEncoder: Encoder[Double] = ExpressionEncoder[Double]()
  override def outputEncoder: Encoder[Double] = ExpressionEncoder[Double]()

  // Java api support
  def this(f: MapFunction[IN, java.lang.Double]) = this((x: IN) => f.call(x).asInstanceOf[Double])

  def toColumnJava: TypedColumn[IN, java.lang.Double] = {
    toColumn.asInstanceOf[TypedColumn[IN, java.lang.Double]]
  }
}


class TypedSumLong[IN](val f: IN => Long) extends Aggregator[IN, Long, Long] {
  override def zero: Long = 0L
  override def reduce(b: Long, a: IN): Long = b + f(a)
  override def merge(b1: Long, b2: Long): Long = b1 + b2
  override def finish(reduction: Long): Long = reduction

  override def bufferEncoder: Encoder[Long] = ExpressionEncoder[Long]()
  override def outputEncoder: Encoder[Long] = ExpressionEncoder[Long]()

  // Java api support
  def this(f: MapFunction[IN, java.lang.Long]) = this((x: IN) => f.call(x).asInstanceOf[Long])

  def toColumnJava: TypedColumn[IN, java.lang.Long] = {
    toColumn.asInstanceOf[TypedColumn[IN, java.lang.Long]]
  }
}


class TypedCount[IN](val f: IN => Any) extends Aggregator[IN, Long, Long] {
  override def zero: Long = 0
  override def reduce(b: Long, a: IN): Long = {
    if (f(a) == null) b else b + 1
  }
  override def merge(b1: Long, b2: Long): Long = b1 + b2
  override def finish(reduction: Long): Long = reduction

  override def bufferEncoder: Encoder[Long] = ExpressionEncoder[Long]()
  override def outputEncoder: Encoder[Long] = ExpressionEncoder[Long]()

  // Java api support
  def this(f: MapFunction[IN, Object]) = this((x: IN) => f.call(x).asInstanceOf[Any])
  def toColumnJava: TypedColumn[IN, java.lang.Long] = {
    toColumn.asInstanceOf[TypedColumn[IN, java.lang.Long]]
  }
}


class TypedAverage[IN](val f: IN => Double) extends Aggregator[IN, (Double, Long), Double] {
  override def zero: (Double, Long) = (0.0, 0L)
  override def reduce(b: (Double, Long), a: IN): (Double, Long) = (f(a) + b._1, 1 + b._2)
  override def finish(reduction: (Double, Long)): Double = reduction._1 / reduction._2
  override def merge(b1: (Double, Long), b2: (Double, Long)): (Double, Long) = {
    (b1._1 + b2._1, b1._2 + b2._2)
  }

  override def bufferEncoder: Encoder[(Double, Long)] = ExpressionEncoder[(Double, Long)]()
  override def outputEncoder: Encoder[Double] = ExpressionEncoder[Double]()

  // Java api support
  def this(f: MapFunction[IN, java.lang.Double]) = this((x: IN) => f.call(x).asInstanceOf[Double])
  def toColumnJava: TypedColumn[IN, java.lang.Double] = {
    toColumn.asInstanceOf[TypedColumn[IN, java.lang.Double]]
  }
} 
Example 9
Source File: Aggregator.scala    From BigDatalog   with Apache License 2.0
package org.apache.spark.sql.expressions

import org.apache.spark.sql.catalyst.encoders.encoderFor
import org.apache.spark.sql.catalyst.expressions.aggregate.{AggregateExpression, Complete}
import org.apache.spark.sql.execution.aggregate.TypedAggregateExpression
import org.apache.spark.sql.{DataFrame, Dataset, Encoder, TypedColumn}


abstract class Aggregator[-I, B, O] extends Serializable {

  // The Aggregator contract in this older API: an empty buffer, a per-element update,
  // a buffer merge, and a finishing step that produces the output value.
  def zero: B
  def reduce(b: B, a: I): B
  def merge(b1: B, b2: B): B
  def finish(reduction: B): O

  def toColumn(
      implicit bEncoder: Encoder[B],
      cEncoder: Encoder[O]): TypedColumn[I, O] = {
    val expr =
      new AggregateExpression(
        TypedAggregateExpression(this),
        Complete,
        false)

    new TypedColumn[I, O](expr, encoderFor[O])
  }
} 
Example 10
Source File: A_9_MyAverageByAggregator.scala    From wow-spark   with MIT License
package com.sev7e0.wow.sql

import org.apache.spark.sql.{Encoder, Encoders, SparkSession}
import org.apache.spark.sql.expressions.Aggregator



case class Employee(name:String, salary:Long)
case class Average(var sum:Long, var count:Long)
object A_9_MyAverageByAggregator extends Aggregator[Employee, Average, Double]{
  override def zero: Average = Average(0L,0L)

  override def reduce(b: Average, a: Employee): Average = {
    b.sum += a.salary
    b.count+=1
    b
  }

  override def merge(b1: Average, b2: Average): Average = {
    b1.count+=b2.count
    b1.sum+=b2.sum
    b1
  }

  override def finish(reduction: Average): Double = reduction.sum.toDouble/reduction.count

  override def bufferEncoder: Encoder[Average] = Encoders.product

  override def outputEncoder: Encoder[Double] = Encoders.scalaDouble

  def main(args: Array[String]): Unit = {
    val sparkSession = SparkSession.builder().master("local").appName("MyAverageByAggregator")
      .getOrCreate()
    // implicit conversions
    import sparkSession.implicits._
    val dataFrame = sparkSession.read.json("src/main/resources/sparkresource/employees.json").as[Employee]
    dataFrame.show()

    val salary_average = A_9_MyAverageByAggregator.toColumn.name("salary_average")

    val frame = dataFrame.select(salary_average)
    frame.show()
  }
} 
Example 11
Source File: JCUDACodegenIterator.scala    From GPUEnabler   with Apache License 2.0
package com.ibm.gpuenabler

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.Encoder


abstract class JCUDACodegenIterator extends Iterator[InternalRow] {
  def hasNext() : Boolean
  def next() : InternalRow
  def init[T](itr : java.util.Iterator[InternalRow], 
              args: Array[Any],
              size : Int,
              cached: Int, 
              gpuPtrs: java.util.List[java.util.Map[String, CachedGPUMeta]], 
              blockID: Int,
              userGridSizes: Array[Array[Int]], 
              userBlockSizes: Array[Array[Int]], 
              stages: Int, 
              smSize: Int,
              inpEnc: Encoder[T])
} 
Example 12
Source File: SparkEgressSpec.scala    From cloudflow   with Apache License 2.0
package cloudflow.spark

import org.apache.spark.sql.{ Dataset, Encoder, SparkSession }
import org.apache.spark.sql.streaming.{ OutputMode, Trigger }
import cloudflow.streamlets.StreamletShape
import cloudflow.streamlets.avro._
import cloudflow.spark.avro._
import cloudflow.spark.testkit._
import cloudflow.spark.sql.SQLImplicits._

class SparkEgressSpec extends SparkScalaTestSupport {
  "SparkEgress" should {
    "materialize streaming data to sink" in {

      val testKit = SparkStreamletTestkit(session)

      def asCollection[T: Encoder](session: SparkSession, queryName: String): List[T] =
        session.sql(s"select * from $queryName").as[T].collect().toList

      val instance = new MySparkEgress()

      // setup inlet tap on inlet port
      val in: SparkInletTap[Data] = testKit.inletAsTap[Data](instance.in)

      // build data and send to inlet tap
      val data = (1 to 10).map(i ⇒ Data(i, s"name$i"))
      in.addData(data)

      val run = testKit.run(instance, Seq(in), Seq.empty)
      run.failures mustBe ('empty)
      run.totalRows mustBe (20)
      val r1 = asCollection[String](session, "allNames")
      val r2 = asCollection[String](session, "allNamesUpper")

      // assert
      r1 must contain("name1")
      r2 must contain("NAME1")
    }
  }
}

class MySparkEgress extends SparkStreamlet {
  val in    = AvroInlet[Data]("in")
  val shape = StreamletShape(in)
  override def createLogic() = new SparkStreamletLogic {
    override def buildStreamingQueries =
      process(readStream(in))

    private def process(inDataset: Dataset[Data]): StreamletQueryExecution = {
      val q1 = inDataset
        .map { d ⇒
          d.name
        }
        .writeStream
        .format("memory")
        .option("truncate", false)
        .queryName("allNames")
        .outputMode(OutputMode.Append())
        .trigger(Trigger.Once)
        .start()

      val q2 = inDataset
        .map { d ⇒
          d.name.toUpperCase
        }
        .writeStream
        .format("memory")
        .option("truncate", false)
        .queryName("allNamesUpper")
        .outputMode(OutputMode.Append())
        .trigger(Trigger.Once)
        .start()
      StreamletQueryExecution(q1, q2)
    }
  }
} 
Example 13
Source File: SparkAvroDecoder.scala    From cloudflow   with Apache License 2.0
package cloudflow.spark.avro

import org.apache.log4j.Logger

import java.io.ByteArrayOutputStream

import scala.reflect.runtime.universe._

import org.apache.avro.generic.{ GenericDatumReader, GenericDatumWriter, GenericRecord }
import org.apache.avro.io.{ DecoderFactory, EncoderFactory }
import org.apache.spark.sql.{ Dataset, Encoder, Row }
import org.apache.spark.sql.catalyst.encoders.{ encoderFor, ExpressionEncoder, RowEncoder }
import org.apache.spark.sql.catalyst.expressions.GenericRow
import org.apache.spark.sql.types.StructType
import org.apache.avro.Schema

import cloudflow.spark.sql.SQLImplicits._

case class EncodedKV(key: String, value: Array[Byte])

case class SparkAvroDecoder[T: Encoder: TypeTag](avroSchema: String) {

  val encoder: Encoder[T]                           = implicitly[Encoder[T]]
  val sqlSchema: StructType                         = encoder.schema
  val encoderForDataColumns: ExpressionEncoder[Row] = RowEncoder(sqlSchema)
  @transient lazy val _avroSchema                   = new Schema.Parser().parse(avroSchema)
  @transient lazy val rowConverter                  = SchemaConverters.createConverterToSQL(_avroSchema, sqlSchema)
  @transient lazy val datumReader                   = new GenericDatumReader[GenericRecord](_avroSchema)
  @transient lazy val decoder                       = DecoderFactory.get
  def decode(bytes: Array[Byte]): Row = {
    val binaryDecoder = decoder.binaryDecoder(bytes, null)
    val record        = datumReader.read(null, binaryDecoder)
    rowConverter(record).asInstanceOf[GenericRow]
  }

}


case class SparkAvroEncoder[T: Encoder: TypeTag](avroSchema: String) {

  @transient lazy val log = Logger.getLogger(getClass.getName)

  val BufferSize = 5 * 1024 // 5 Kb

  val encoder                     = implicitly[Encoder[T]]
  val sqlSchema                   = encoder.schema
  @transient lazy val _avroSchema = new Schema.Parser().parse(avroSchema)

  val recordName                = "topLevelRecord" // ???
  val recordNamespace           = "recordNamespace" // ???
  @transient lazy val converter = AvroConverter.createConverterToAvro(sqlSchema, recordName, recordNamespace)

  // Risk: This process is memory intensive. Might require thread-level buffers to optimize memory usage
  def rowToBytes(row: Row): Array[Byte] = {
    val genRecord = converter(row).asInstanceOf[GenericRecord]
    if (log.isDebugEnabled) log.debug(s"genRecord = $genRecord")
    val datumWriter   = new GenericDatumWriter[GenericRecord](_avroSchema)
    val avroEncoder   = EncoderFactory.get
    val byteArrOS     = new ByteArrayOutputStream(BufferSize)
    val binaryEncoder = avroEncoder.binaryEncoder(byteArrOS, null)
    datumWriter.write(genRecord, binaryEncoder)
    binaryEncoder.flush()
    byteArrOS.toByteArray
  }

  def encode(dataset: Dataset[T]): Dataset[Array[Byte]] =
    dataset.toDF().mapPartitions(rows ⇒ rows.map(rowToBytes)).as[Array[Byte]]

  // Note to self: I'm not sure how heavy this chain of transformations is
  def encodeWithKey(dataset: Dataset[T], keyFun: T ⇒ String): Dataset[EncodedKV] = {
    val encoder             = encoderFor[T]
    implicit val rowEncoder = RowEncoder(encoder.schema).resolveAndBind()
    dataset.map { value ⇒
      val key         = keyFun(value)
      val internalRow = encoder.toRow(value)
      val row         = rowEncoder.fromRow(internalRow)
      val bytes       = rowToBytes(row)
      EncodedKV(key, bytes)
    }
  }

} 
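A hedged usage sketch for the codec pair above. The Data case class and its hand-written Avro schema are assumptions made for illustration; only SparkAvroEncoder and SparkAvroDecoder come from the example.

import org.apache.spark.sql.SparkSession
import cloudflow.spark.avro.{ SparkAvroDecoder, SparkAvroEncoder }

// Illustrative record type with a matching, hand-written Avro schema.
case class Data(id: Int, name: String)

object AvroCodecSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("AvroCodecSketch").getOrCreate()
    import spark.implicits._

    val avroSchema =
      """{"type":"record","name":"Data","fields":[
        |  {"name":"id","type":"int"},
        |  {"name":"name","type":"string"}
        |]}""".stripMargin

    val ds = spark.createDataset(Seq(Data(1, "a"), Data(2, "b")))

    val toAvro   = SparkAvroEncoder[Data](avroSchema)
    val bytes    = toAvro.encode(ds)              // Dataset[Array[Byte]]

    val fromAvro = SparkAvroDecoder[Data](avroSchema)
    val firstRow = fromAvro.decode(bytes.head())  // back to a Row matching the SQL schema

    println(firstRow)
    spark.stop()
  }
}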
Example 14
Source File: TestSparkStreamletContext.scala    From cloudflow   with Apache License 2.0
package cloudflow.spark
package testkit

import java.nio.file.attribute.FileAttribute

import com.typesafe.config._

import scala.reflect.runtime.universe._
import scala.concurrent.duration._
import org.apache.spark.sql.{ Dataset, Encoder, SparkSession }
import org.apache.spark.sql.execution.streaming.MemoryStream
import org.apache.spark.sql.streaming.{ OutputMode, StreamingQuery, Trigger }
import cloudflow.streamlets._
import org.apache.spark.sql.catalyst.InternalRow


class TestSparkStreamletContext(override val streamletRef: String,
                                session: SparkSession,
                                inletTaps: Seq[SparkInletTap[_]],
                                outletTaps: Seq[SparkOutletTap[_]],
                                override val config: Config = ConfigFactory.empty)
    extends SparkStreamletContext(StreamletDefinition("appId", "appVersion", streamletRef, "streamletClass", List(), List(), config),
                                  session) {
  val ProcessingTimeInterval = 1500.milliseconds
  override def readStream[In](inPort: CodecInlet[In])(implicit encoder: Encoder[In], typeTag: TypeTag[In]): Dataset[In] =
    inletTaps
      .find(_.portName == inPort.name)
      .map(_.instream.asInstanceOf[MemoryStream[In]].toDF.as[In])
      .getOrElse(throw TestContextException(inPort.name, s"Bad test context, could not find source for inlet ${inPort.name}"))

  override def writeStream[Out](stream: Dataset[Out],
                                outPort: CodecOutlet[Out],
                                outputMode: OutputMode)(implicit encoder: Encoder[Out], typeTag: TypeTag[Out]): StreamingQuery = {
    // RateSource can only work with a microBatch query because it contains no data at time zero.
    // Trigger.Once requires data at start to work.
    val trigger = if (isRateSource(stream)) {
      Trigger.ProcessingTime(ProcessingTimeInterval)
    } else {
      Trigger.Once()
    }
    val streamingQuery = outletTaps
      .find(_.portName == outPort.name)
      .map { outletTap ⇒
        stream.writeStream
          .outputMode(outputMode)
          .format("memory")
          .trigger(trigger)
          .queryName(outletTap.queryName)
          .start()
      }
      .getOrElse(throw TestContextException(outPort.name, s"Bad test context, could not find destination for outlet ${outPort.name}"))
    streamingQuery
  }

  override def checkpointDir(dirName: String): String = {
    val fileAttibutes: Array[FileAttribute[_]] = Array()
    val tmpDir                                 = java.nio.file.Files.createTempDirectory("spark-test", fileAttibutes: _*)
    tmpDir.toFile.getAbsolutePath
  }

  private def isRateSource(stream: Dataset[_]): Boolean = {
    import org.apache.spark.sql.execution.command.ExplainCommand
    val explain = ExplainCommand(stream.queryExecution.logical, true)
    val res     = session.sessionState.executePlan(explain).executedPlan.executeCollect()
    res.exists((row: InternalRow) => row.getString(0).contains("org.apache.spark.sql.execution.streaming.sources.RateStreamProvider"))
  }

}

case class TestContextException(portName: String, msg: String) extends RuntimeException(msg) 
Example 15
Source File: SecondaryPairDCFunctions.scala    From spark-flow   with Apache License 2.0
package com.bloomberg.sparkflow.dc

import com.bloomberg.sparkflow.partitioning.SecondarySortPartioner
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.Encoder

import scala.reflect.ClassTag


class SecondaryPairDCFunctions[K, K2, V](self: DC[((K, K2), V)])
                                        (implicit kt: ClassTag[K],
                                         vt: ClassTag[V],
                                         k2t: ClassTag[K2],
                                         ord: Ordering[(K, K2)] = null,
                                         encoder: Encoder[((K, K2), V)]) {

  def repartitionAndSecondarySortWithinPartitions(): DC[((K, K2), V)] = {
    new RDDTransformDC(encoder, self, (rdd: RDD[((K, K2), V)]) => rdd.repartitionAndSortWithinPartitions(new SecondarySortPartioner[K, K2, V](rdd.partitions.length)), Seq("repartAndSort"))
  }

  def repartitionAndSecondarySortWithinPartitions(numPartitions: Int): DC[((K, K2), V)] = {
    new RDDTransformDC(encoder, self, (rdd: RDD[((K, K2), V)]) => rdd.repartitionAndSortWithinPartitions(new SecondarySortPartioner[K, K2, V](numPartitions)), Seq("repartAndSort", numPartitions.toString))
  }
} 
Example 16
Source File: DatasetTransformDC.scala    From spark-flow   with Apache License 2.0
package com.bloomberg.sparkflow.dc

import com.bloomberg.sparkflow.serialization.Hashing._
import org.apache.spark.sql.{Dataset, Encoder, SparkSession}


private[sparkflow] class DatasetTransformDC[U, T]
(encoder: Encoder[U], val prev: DC[T], f: (Dataset[T]) => Dataset[U], hashTargets: Seq[String]) extends DC[U](encoder, Seq(prev)) {
  //
  //  def this(prev: DC[T], f: Dataset[T] => Dataset[U], hashTarget: AnyRef)(implicit tEncoder: Encoder[T], uEncoder: Encoder[U])  = {
  //    this(prev, uEncoder, f, Seq(hashClass(hashTarget)))
  //  }
  //
  //  def this(prev: DC[T], f: Dataset[T] => Dataset[U], hashTarget: AnyRef, hashTargets: Seq[String])(implicit tEncoder: Encoder[T], uEncoder: Encoder[U])  = {
  //    this(prev,uEncoder,  f, hashClass(hashTarget) +: hashTargets)
  //  }

  def computeDataset(spark: SparkSession) = {
    val dataset = f(prev.getDataset(spark))
    dataset
  }

  override def computeSignature() = {
    hashString(prev.getSignature + hashSeq(hashTargets))
  }

} 
Example 17
Source File: MultiGroupedTransformDC.scala    From spark-flow   with Apache License 2.0
package com.bloomberg.sparkflow.dc

import com.bloomberg.sparkflow.serialization.Hashing
import org.apache.spark.sql.{Dataset, Encoder, KeyValueGroupedDataset, SparkSession}

import scala.concurrent.duration.Duration
import scala.concurrent.{Await, Future}
import scala.reflect.ClassTag
import scala.concurrent.ExecutionContext.Implicits.global



class MultiGroupedTransformDC[K, V, U, T: ClassTag]
(left: KeyValueGroupedDC[K, V],
 right: KeyValueGroupedDC[K, U],
 f: (KeyValueGroupedDataset[K, V], KeyValueGroupedDataset[K, U]) => Dataset[T])
(implicit tEncoder: Encoder[T]) extends DC[T](tEncoder, Seq(left, right)) {

  override def computeDataset(spark: SparkSession) = {
    val leftFuture = Future{left.get(spark)}
    val rightFuture = Future{right.get(spark)}
    val ld = Await.result(leftFuture, Duration.Inf)
    val rd = Await.result(rightFuture, Duration.Inf)
    val dataset = f(ld, rd)
    dataset
  }

  override def computeSignature() = {
    Hashing.hashString(left.getSignature + right.getSignature + Hashing.hashClass(f))
  }

} 
Example 18
Source File: MultiInputDC.scala    From spark-flow   with Apache License 2.0
package com.bloomberg.sparkflow.dc

import com.bloomberg.sparkflow.serialization.Hashing
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{Encoder, SparkSession}

import scala.concurrent.duration.Duration
import scala.concurrent.ExecutionContext.Implicits.global
import scala.concurrent.{Await, Future}


class MultiInputDC[T, U, V](encoder: Encoder[T], left: DC[U], right: DC[V],
                            f: (RDD[U], RDD[V]) => RDD[T])
  extends DC[T](encoder, Seq(left, right)) {

  override def computeSignature() = {
    Hashing.hashString(left.getSignature + right.getSignature + Hashing.hashClass(f))
  }

  override def computeDataset(spark: SparkSession) = {
    val leftFuture = Future{left.getRDD(spark)}
    val rightFuture = Future{right.getRDD(spark)}
    val leftRDD = Await.result(leftFuture, Duration.Inf)
    val rightRDD = Await.result(rightFuture, Duration.Inf)
    val rdd = f(leftRDD, rightRDD)
    spark.createDataset(rdd)
  }


} 
Example 19
Source File: ResultDepDC.scala    From spark-flow   with Apache License 2.0
package com.bloomberg.sparkflow.dc

import com.bloomberg.sparkflow.serialization.Hashing
import com.bloomberg.sparkflow.serialization.Hashing._
import org.apache.spark.sql.{Encoder, SparkSession}


class ResultDepDC[U, T]
(encoder: Encoder[(T, U)], val prev: DC[T], dr: DR[U]) extends DC[(T, U)](encoder, Seq(prev, dr)) {

  override def computeDataset(spark: SparkSession) = {
    val result = dr.get(spark)
    prev.getDataset(spark).mapPartitions(iterator => {
      iterator.map(t => (t, result))
    })
  }

  override def computeSignature() = {
    hashString(prev.getSignature + dr.getSignature)
  }

} 
Example 20
Source File: ExistingRDD.scala    From XSQL   with Apache License 2.0
package org.apache.spark.sql.execution

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{Encoder, Row, SparkSession}
import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow}
import org.apache.spark.sql.catalyst.analysis.MultiInstanceRelation
import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.catalyst.plans.logical._
import org.apache.spark.sql.catalyst.plans.physical.{Partitioning, UnknownPartitioning}
import org.apache.spark.sql.execution.metric.SQLMetrics
import org.apache.spark.sql.types.DataType
import org.apache.spark.util.Utils

object RDDConversions {
  def productToRowRdd[A <: Product](data: RDD[A], outputTypes: Seq[DataType]): RDD[InternalRow] = {
    data.mapPartitions { iterator =>
      val numColumns = outputTypes.length
      val mutableRow = new GenericInternalRow(numColumns)
      val converters = outputTypes.map(CatalystTypeConverters.createToCatalystConverter)
      iterator.map { r =>
        var i = 0
        while (i < numColumns) {
          mutableRow(i) = converters(i)(r.productElement(i))
          i += 1
        }

        mutableRow
      }
    }
  }
}

case class RDDScanExec(
    output: Seq[Attribute],
    rdd: RDD[InternalRow],
    name: String,
    override val outputPartitioning: Partitioning = UnknownPartitioning(0),
    override val outputOrdering: Seq[SortOrder] = Nil) extends LeafExecNode {

  private def rddName: String = Option(rdd.name).map(n => s" $n").getOrElse("")

  override val nodeName: String = s"Scan $name$rddName"

  override lazy val metrics = Map(
    "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows"))

  protected override def doExecute(): RDD[InternalRow] = {
    val numOutputRows = longMetric("numOutputRows")
    rdd.mapPartitionsWithIndexInternal { (index, iter) =>
      val proj = UnsafeProjection.create(schema)
      proj.initialize(index)
      iter.map { r =>
        numOutputRows += 1
        proj(r)
      }
    }
  }

  override def simpleString: String = {
    s"$nodeName${Utils.truncatedString(output, "[", ",", "]")}"
  }
} 
Example 21
Source File: ReduceAggregator.scala    From drizzle-spark   with Apache License 2.0
package org.apache.spark.sql.expressions

import org.apache.spark.sql.Encoder
import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder


private[sql] class ReduceAggregator[T: Encoder](func: (T, T) => T)
  extends Aggregator[T, (Boolean, T), T] {

  private val encoder = implicitly[Encoder[T]]

  override def zero: (Boolean, T) = (false, null.asInstanceOf[T])

  override def bufferEncoder: Encoder[(Boolean, T)] =
    ExpressionEncoder.tuple(
      ExpressionEncoder[Boolean](),
      encoder.asInstanceOf[ExpressionEncoder[T]])

  override def outputEncoder: Encoder[T] = encoder

  override def reduce(b: (Boolean, T), a: T): (Boolean, T) = {
    if (b._1) {
      (true, func(b._2, a))
    } else {
      (true, a)
    }
  }

  override def merge(b1: (Boolean, T), b2: (Boolean, T)): (Boolean, T) = {
    if (!b1._1) {
      b2
    } else if (!b2._1) {
      b1
    } else {
      (true, func(b1._2, b2._2))
    }
  }

  override def finish(reduction: (Boolean, T)): T = {
    if (!reduction._1) {
      throw new IllegalStateException("ReduceAggregator requires at least one input row")
    }
    reduction._2
  }
} 
Example 22
Source File: Aggregator.scala    From drizzle-spark   with Apache License 2.0
package org.apache.spark.sql.expressions

import org.apache.spark.annotation.{Experimental, InterfaceStability}
import org.apache.spark.sql.{Dataset, Encoder, TypedColumn}
import org.apache.spark.sql.catalyst.encoders.encoderFor
import org.apache.spark.sql.catalyst.expressions.aggregate.{AggregateExpression, Complete}
import org.apache.spark.sql.execution.aggregate.TypedAggregateExpression


@Experimental
@InterfaceStability.Evolving
abstract class Aggregator[-IN, BUF, OUT] extends Serializable {

  // The Aggregator contract: an empty buffer, a per-element update, a buffer merge,
  // a finishing step, and encoders for the intermediate buffer and the output type.
  def zero: BUF
  def reduce(b: BUF, a: IN): BUF
  def merge(b1: BUF, b2: BUF): BUF
  def finish(reduction: BUF): OUT
  def bufferEncoder: Encoder[BUF]
  def outputEncoder: Encoder[OUT]

  def toColumn: TypedColumn[IN, OUT] = {
    implicit val bEncoder = bufferEncoder
    implicit val cEncoder = outputEncoder

    val expr =
      AggregateExpression(
        TypedAggregateExpression(this),
        Complete,
        isDistinct = false)

    new TypedColumn[IN, OUT](expr, encoderFor[OUT])
  }
} 
Example 23
Source File: typedaggregators.scala    From drizzle-spark   with Apache License 2.0
package org.apache.spark.sql.execution.aggregate

import org.apache.spark.api.java.function.MapFunction
import org.apache.spark.sql.{Encoder, TypedColumn}
import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
import org.apache.spark.sql.expressions.Aggregator

////////////////////////////////////////////////////////////////////////////////////////////////////
// This file defines internal implementations for aggregators.
////////////////////////////////////////////////////////////////////////////////////////////////////


class TypedSumDouble[IN](val f: IN => Double) extends Aggregator[IN, Double, Double] {
  override def zero: Double = 0.0
  override def reduce(b: Double, a: IN): Double = b + f(a)
  override def merge(b1: Double, b2: Double): Double = b1 + b2
  override def finish(reduction: Double): Double = reduction

  override def bufferEncoder: Encoder[Double] = ExpressionEncoder[Double]()
  override def outputEncoder: Encoder[Double] = ExpressionEncoder[Double]()

  // Java api support
  def this(f: MapFunction[IN, java.lang.Double]) = this(x => f.call(x).asInstanceOf[Double])

  def toColumnJava: TypedColumn[IN, java.lang.Double] = {
    toColumn.asInstanceOf[TypedColumn[IN, java.lang.Double]]
  }
}


class TypedSumLong[IN](val f: IN => Long) extends Aggregator[IN, Long, Long] {
  override def zero: Long = 0L
  override def reduce(b: Long, a: IN): Long = b + f(a)
  override def merge(b1: Long, b2: Long): Long = b1 + b2
  override def finish(reduction: Long): Long = reduction

  override def bufferEncoder: Encoder[Long] = ExpressionEncoder[Long]()
  override def outputEncoder: Encoder[Long] = ExpressionEncoder[Long]()

  // Java api support
  def this(f: MapFunction[IN, java.lang.Long]) = this(x => f.call(x).asInstanceOf[Long])

  def toColumnJava: TypedColumn[IN, java.lang.Long] = {
    toColumn.asInstanceOf[TypedColumn[IN, java.lang.Long]]
  }
}


class TypedCount[IN](val f: IN => Any) extends Aggregator[IN, Long, Long] {
  override def zero: Long = 0
  override def reduce(b: Long, a: IN): Long = {
    if (f(a) == null) b else b + 1
  }
  override def merge(b1: Long, b2: Long): Long = b1 + b2
  override def finish(reduction: Long): Long = reduction

  override def bufferEncoder: Encoder[Long] = ExpressionEncoder[Long]()
  override def outputEncoder: Encoder[Long] = ExpressionEncoder[Long]()

  // Java api support
  def this(f: MapFunction[IN, Object]) = this(x => f.call(x))
  def toColumnJava: TypedColumn[IN, java.lang.Long] = {
    toColumn.asInstanceOf[TypedColumn[IN, java.lang.Long]]
  }
}


class TypedAverage[IN](val f: IN => Double) extends Aggregator[IN, (Double, Long), Double] {
  override def zero: (Double, Long) = (0.0, 0L)
  override def reduce(b: (Double, Long), a: IN): (Double, Long) = (f(a) + b._1, 1 + b._2)
  override def finish(reduction: (Double, Long)): Double = reduction._1 / reduction._2
  override def merge(b1: (Double, Long), b2: (Double, Long)): (Double, Long) = {
    (b1._1 + b2._1, b1._2 + b2._2)
  }

  override def bufferEncoder: Encoder[(Double, Long)] = ExpressionEncoder[(Double, Long)]()
  override def outputEncoder: Encoder[Double] = ExpressionEncoder[Double]()

  // Java api support
  def this(f: MapFunction[IN, java.lang.Double]) = this(x => f.call(x).asInstanceOf[Double])
  def toColumnJava: TypedColumn[IN, java.lang.Double] = {
    toColumn.asInstanceOf[TypedColumn[IN, java.lang.Double]]
  }
} 
Example 24
Source File: ExistingRDD.scala    From drizzle-spark   with Apache License 2.0
package org.apache.spark.sql.execution

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{Encoder, Row, SparkSession}
import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow}
import org.apache.spark.sql.catalyst.analysis.MultiInstanceRelation
import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.catalyst.plans.logical._
import org.apache.spark.sql.execution.datasources._
import org.apache.spark.sql.execution.metric.SQLMetrics
import org.apache.spark.sql.types.DataType
import org.apache.spark.util.Utils

object RDDConversions {
  def productToRowRdd[A <: Product](data: RDD[A], outputTypes: Seq[DataType]): RDD[InternalRow] = {
    data.mapPartitions { iterator =>
      val numColumns = outputTypes.length
      val mutableRow = new GenericInternalRow(numColumns)
      val converters = outputTypes.map(CatalystTypeConverters.createToCatalystConverter)
      iterator.map { r =>
        var i = 0
        while (i < numColumns) {
          mutableRow(i) = converters(i)(r.productElement(i))
          i += 1
        }

        mutableRow
      }
    }
  }
}

case class RDDScanExec(
    output: Seq[Attribute],
    rdd: RDD[InternalRow],
    override val nodeName: String) extends LeafExecNode {

  override lazy val metrics = Map(
    "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows"))

  protected override def doExecute(): RDD[InternalRow] = {
    val numOutputRows = longMetric("numOutputRows")
    rdd.mapPartitionsInternal { iter =>
      val proj = UnsafeProjection.create(schema)
      iter.map { r =>
        numOutputRows += 1
        proj(r)
      }
    }
  }

  override def simpleString: String = {
    s"Scan $nodeName${Utils.truncatedString(output, "[", ",", "]")}"
  }
} 
Example 25
Source File: package.scala    From amadou   with Apache License 2.0
package com.mediative.amadou

import com.google.api.services.bigquery.model._
import com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem
import com.google.cloud.hadoop.io.bigquery._
import org.apache.hadoop.fs.{FileSystem, Path}
import net.ceedubs.ficus.readers.ValueReader
import net.ceedubs.ficus.FicusInstances

import org.apache.spark.sql.{Dataset, SparkSession, Encoder}
import java.util.concurrent.ThreadLocalRandom
import scala.collection.JavaConversions._

package object bigquery extends FicusInstances {

  object CreateDisposition extends Enumeration {
    val CREATE_IF_NEEDED, CREATE_NEVER = Value
  }

  object WriteDisposition extends Enumeration {
    val WRITE_TRUNCATE, WRITE_APPEND, WRITE_EMPTY = Value
  }

  val BQ_CSV_DATE_FORMAT = "yyyy-MM-dd HH:mm:ss zzz"

  object TableNotFound {
    import com.google.api.client.googleapis.json.GoogleJsonResponseException
    import com.google.api.client.googleapis.json.GoogleJsonError
    import scala.collection.JavaConverters._

    def unapply(error: Throwable): Option[GoogleJsonError.ErrorInfo] = error match {
      case error: GoogleJsonResponseException =>
        Some(error.getDetails)
          .filter(_.getCode == 404)
          .flatMap(_.getErrors.asScala.find(_.getReason == "notFound"))
      case _ => None
    }
  }

  def tableHasDataForDate(
      spark: SparkSession,
      table: TableReference,
      date: java.sql.Date,
      column: String): Boolean = {
    val bq = BigQueryClient.getInstance(spark.sparkContext.hadoopConfiguration)
    bq.hasDataForDate(table, date, column)
  }
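  // The enclosing implicit class is elided in this excerpt: `self` appears to be the wrapped
  // Dataset/DataFrame, while `conf` and `bq` appear to be its Hadoop Configuration and BigQuery client.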

  
    def saveAsBigQueryTable(
        tableRef: TableReference,
        writeDisposition: WriteDisposition.Value,
        createDisposition: CreateDisposition.Value): Unit = {
      val bucket = conf.get(BigQueryConfiguration.GCS_BUCKET_KEY)
      val temp =
        s"spark-bigquery-${System.currentTimeMillis()}=${ThreadLocalRandom.current.nextInt(Int.MaxValue)}"
      val gcsPath = s"gs://$bucket/spark-bigquery-tmp/$temp"
      self.write.json(gcsPath)

      val schemaFields = self.schema.fields.map { field =>
        import org.apache.spark.sql.types._

        val fieldType = field.dataType match {
          case BooleanType    => "BOOLEAN"
          case LongType       => "INTEGER"
          case IntegerType    => "INTEGER"
          case StringType     => "STRING"
          case DoubleType     => "FLOAT"
          case TimestampType  => "TIMESTAMP"
          case _: DecimalType => "INTEGER"
        }
        new TableFieldSchema().setName(field.name).setType(fieldType)
      }.toList

      val tableSchema = new TableSchema().setFields(schemaFields)

      bq.load(gcsPath, tableRef, tableSchema, writeDisposition, createDisposition)
      delete(new Path(gcsPath))
    }

    private def delete(path: Path): Unit = {
      val fs = FileSystem.get(path.toUri, conf)
      fs.delete(path, true)
      ()
    }

  }

  implicit val valueReader: ValueReader[BigQueryTable.PartitionStrategy] =
    ValueReader[String].map {
      _ match {
        case "month" => BigQueryTable.PartitionByMonth
        case "day"   => BigQueryTable.PartitionByDay
        case other   => sys.error(s"Unknown partition strategy: $other")
      }
    }
} 
Example 26
Source File: GroupSortedDataset.scala    From spark-sorted   with Apache License 2.0
package com.tresata.spark.sorted.sql

import scala.reflect.ClassTag

import org.apache.spark.sql.{ Column, Dataset, Encoder }
import org.apache.spark.sql.functions.col
import org.apache.spark.sql.catalyst.encoders.{ encoderFor, ExpressionEncoder }

import com.tresata.spark.sorted.{ mapStreamIterator, mapStreamIteratorWithContext, newWCreate }

object GroupSortedDataset {
  private[sql] def apply[K: Encoder, V](dataset: Dataset[(K, V)], numPartitions: Option[Int], reverse: Boolean, sortBy: Column => Column): GroupSortedDataset[K, V] = {
    val key = col(dataset.columns.head)
    val valueSort = {
      val sort = sortBy(col(dataset.columns.last))
      if (reverse) sort.desc else sort.asc
    }
    new GroupSortedDataset(numPartitions.map(dataset.repartition(_, key)).getOrElse(dataset.repartition(key)).sortWithinPartitions(key, valueSort))
  }
}

class GroupSortedDataset[K: Encoder, V] private (dataset: Dataset[(K, V)]) extends Serializable {
  def toDS: Dataset[(K, V)] = dataset

  def mapStreamByKey[W: Encoder, C](c: () => C)(f: (C, Iterator[V]) => TraversableOnce[W]): Dataset[(K, W)] = {
    implicit val kwEncoder: Encoder[(K, W)] = ExpressionEncoder.tuple(encoderFor[K], encoderFor[W])
    dataset.mapPartitions(mapStreamIteratorWithContext(_)(c, f))
  }

  def mapStreamByKey[W: Encoder](f: Iterator[V] => TraversableOnce[W]): Dataset[(K, W)] = {
    implicit val kwEncoder: Encoder[(K, W)] = ExpressionEncoder.tuple(encoderFor[K], encoderFor[W])
    dataset.mapPartitions(mapStreamIterator(_)(f))
  }

  def foldLeftByKey[W: ClassTag: Encoder](w: W)(f: (W, V) => W): Dataset[(K, W)] = {
    val wCreate = newWCreate(w)
    mapStreamByKey(iter => Iterator(iter.foldLeft(wCreate())(f)))
  }

  def reduceLeftByKey[W >: V: Encoder](f: (W, V) => W): Dataset[(K, W)] =
    mapStreamByKey(iter => Iterator(iter.reduceLeft(f)))

  def scanLeftByKey[W: ClassTag: Encoder](w: W)(f: (W, V) => W): Dataset[(K, W)] = {
    val wCreate = newWCreate(w)
    mapStreamByKey(_.scanLeft(wCreate())(f))
  }
} 
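The companion's apply is private[sql], so a GroupSortedDataset is normally obtained through spark-sorted's public entry point, which is not shown in this excerpt. A hedged sketch of the per-key operations, assuming such an instance (gs) is already in hand:

import com.tresata.spark.sorted.sql.GroupSortedDataset
import org.apache.spark.sql.{ Dataset, SparkSession }

object GroupSortedSketch {
  // `gs` stands for a GroupSortedDataset[String, Int] produced by the library's entry point.
  def perKeySums(spark: SparkSession, gs: GroupSortedDataset[String, Int]): Dataset[(String, Int)] = {
    import spark.implicits._
    // Fold the key-grouped, value-sorted stream into a running sum per key.
    gs.foldLeftByKey(0)(_ + _)
  }
}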
Example 27
Source File: ReduceAggregator.scala    From XSQL   with Apache License 2.0
package org.apache.spark.sql.expressions

import org.apache.spark.sql.Encoder
import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder


private[sql] class ReduceAggregator[T: Encoder](func: (T, T) => T)
  extends Aggregator[T, (Boolean, T), T] {

  @transient private val encoder = implicitly[Encoder[T]]

  override def zero: (Boolean, T) = (false, null.asInstanceOf[T])

  override def bufferEncoder: Encoder[(Boolean, T)] =
    ExpressionEncoder.tuple(
      ExpressionEncoder[Boolean](),
      encoder.asInstanceOf[ExpressionEncoder[T]])

  override def outputEncoder: Encoder[T] = encoder

  override def reduce(b: (Boolean, T), a: T): (Boolean, T) = {
    if (b._1) {
      (true, func(b._2, a))
    } else {
      (true, a)
    }
  }

  override def merge(b1: (Boolean, T), b2: (Boolean, T)): (Boolean, T) = {
    if (!b1._1) {
      b2
    } else if (!b2._1) {
      b1
    } else {
      (true, func(b1._2, b2._2))
    }
  }

  override def finish(reduction: (Boolean, T)): T = {
    if (!reduction._1) {
      throw new IllegalStateException("ReduceAggregator requires at least one input row")
    }
    reduction._2
  }
} 
Example 28
Source File: Aggregator.scala    From XSQL   with Apache License 2.0
package org.apache.spark.sql.expressions

import org.apache.spark.annotation.{Experimental, InterfaceStability}
import org.apache.spark.sql.{Dataset, Encoder, TypedColumn}
import org.apache.spark.sql.catalyst.encoders.encoderFor
import org.apache.spark.sql.catalyst.expressions.aggregate.{AggregateExpression, Complete}
import org.apache.spark.sql.execution.aggregate.TypedAggregateExpression


@Experimental
@InterfaceStability.Evolving
abstract class Aggregator[-IN, BUF, OUT] extends Serializable {

  // The Aggregator contract: an empty buffer, a per-element update, a buffer merge,
  // a finishing step, and encoders for the intermediate buffer and the output type.
  def zero: BUF
  def reduce(b: BUF, a: IN): BUF
  def merge(b1: BUF, b2: BUF): BUF
  def finish(reduction: BUF): OUT
  def bufferEncoder: Encoder[BUF]
  def outputEncoder: Encoder[OUT]

  def toColumn: TypedColumn[IN, OUT] = {
    implicit val bEncoder = bufferEncoder
    implicit val cEncoder = outputEncoder

    val expr =
      AggregateExpression(
        TypedAggregateExpression(this),
        Complete,
        isDistinct = false)

    new TypedColumn[IN, OUT](expr, encoderFor[OUT])
  }
} 
Example 29
Source File: typedaggregators.scala    From XSQL   with Apache License 2.0
package org.apache.spark.sql.execution.aggregate

import org.apache.spark.api.java.function.MapFunction
import org.apache.spark.sql.{Encoder, TypedColumn}
import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
import org.apache.spark.sql.expressions.Aggregator

////////////////////////////////////////////////////////////////////////////////////////////////////
// This file defines internal implementations for aggregators.
////////////////////////////////////////////////////////////////////////////////////////////////////


class TypedSumDouble[IN](val f: IN => Double) extends Aggregator[IN, Double, Double] {
  override def zero: Double = 0.0
  override def reduce(b: Double, a: IN): Double = b + f(a)
  override def merge(b1: Double, b2: Double): Double = b1 + b2
  override def finish(reduction: Double): Double = reduction

  override def bufferEncoder: Encoder[Double] = ExpressionEncoder[Double]()
  override def outputEncoder: Encoder[Double] = ExpressionEncoder[Double]()

  // Java api support
  def this(f: MapFunction[IN, java.lang.Double]) = this((x: IN) => f.call(x).asInstanceOf[Double])

  def toColumnJava: TypedColumn[IN, java.lang.Double] = {
    toColumn.asInstanceOf[TypedColumn[IN, java.lang.Double]]
  }
}


class TypedSumLong[IN](val f: IN => Long) extends Aggregator[IN, Long, Long] {
  override def zero: Long = 0L
  override def reduce(b: Long, a: IN): Long = b + f(a)
  override def merge(b1: Long, b2: Long): Long = b1 + b2
  override def finish(reduction: Long): Long = reduction

  override def bufferEncoder: Encoder[Long] = ExpressionEncoder[Long]()
  override def outputEncoder: Encoder[Long] = ExpressionEncoder[Long]()

  // Java api support
  def this(f: MapFunction[IN, java.lang.Long]) = this((x: IN) => f.call(x).asInstanceOf[Long])

  def toColumnJava: TypedColumn[IN, java.lang.Long] = {
    toColumn.asInstanceOf[TypedColumn[IN, java.lang.Long]]
  }
}


class TypedCount[IN](val f: IN => Any) extends Aggregator[IN, Long, Long] {
  override def zero: Long = 0
  override def reduce(b: Long, a: IN): Long = {
    if (f(a) == null) b else b + 1
  }
  override def merge(b1: Long, b2: Long): Long = b1 + b2
  override def finish(reduction: Long): Long = reduction

  override def bufferEncoder: Encoder[Long] = ExpressionEncoder[Long]()
  override def outputEncoder: Encoder[Long] = ExpressionEncoder[Long]()

  // Java api support
  def this(f: MapFunction[IN, Object]) = this((x: IN) => f.call(x).asInstanceOf[Any])
  def toColumnJava: TypedColumn[IN, java.lang.Long] = {
    toColumn.asInstanceOf[TypedColumn[IN, java.lang.Long]]
  }
}


class TypedAverage[IN](val f: IN => Double) extends Aggregator[IN, (Double, Long), Double] {
  override def zero: (Double, Long) = (0.0, 0L)
  override def reduce(b: (Double, Long), a: IN): (Double, Long) = (f(a) + b._1, 1 + b._2)
  override def finish(reduction: (Double, Long)): Double = reduction._1 / reduction._2
  override def merge(b1: (Double, Long), b2: (Double, Long)): (Double, Long) = {
    (b1._1 + b2._1, b1._2 + b2._2)
  }

  override def bufferEncoder: Encoder[(Double, Long)] = ExpressionEncoder[(Double, Long)]()
  override def outputEncoder: Encoder[Double] = ExpressionEncoder[Double]()

  // Java api support
  def this(f: MapFunction[IN, java.lang.Double]) = this((x: IN) => f.call(x).asInstanceOf[Double])
  def toColumnJava: TypedColumn[IN, java.lang.Double] = {
    toColumn.asInstanceOf[TypedColumn[IN, java.lang.Double]]
  }
} 
Example 30
Source File: cogroup.scala    From spark-tools   with Apache License 2.0
package io.univalence.plumbus

import org.apache.spark.Partitioner
import org.apache.spark.rdd.{ CoGroupedRDD, RDD }
import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema
import org.apache.spark.sql.types.{ ArrayType, StructField }
import org.apache.spark.sql.{ types, DataFrame, Dataset, Encoder, KeyValueGroupedDataset, Row }

import scala.reflect.ClassTag
import scala.util.Try

object cogroup {

  
  implicit class KVGD[K, A](val kvgd: KeyValueGroupedDataset[K, A]) {
    def cogroup[B](right: KeyValueGroupedDataset[K, B]): Dataset[(K, Seq[A], Seq[B])] =
      //Use SparkAddOn ?
      ???
  }

  def apply[A, B, K](left: Dataset[A], right: Dataset[B])(keyLeft: A => K, keyRight: B => K)(
    implicit encA: Encoder[A],
    encB: Encoder[B],
    encC: Encoder[K],
    enc: Encoder[(K, Seq[A], Seq[B])],
    ca: ClassTag[A],
    ck: ClassTag[K],
    cb: ClassTag[B]
  ): Dataset[(K, Seq[A], Seq[B])] =
    left.sparkSession.implicits
      .rddToDatasetHolder(
        RDD
          .rddToPairRDDFunctions(left.rdd.keyBy(keyLeft))
          .cogroup(right.rdd.keyBy(keyRight))
          .map({ case (k, (ia, ib)) => (k, ia.toSeq, ib.toSeq) })
      )
      .toDS

  def cogroupDf(group: DataFrame, namedSubGroup: (String, DataFrame)*)(
    byKey: String,
    partitioner: Partitioner = Partitioner.defaultPartitioner(group.rdd, namedSubGroup.map(_._2.rdd): _*)
  ): Try[DataFrame] =
    Try {
      val subGroup: Seq[DataFrame]  = namedSubGroup.map(_._2)
      val allFrames: Seq[DataFrame] = group +: subGroup
      val allFramesKeyed: Seq[RDD[(String, Row)]] =
        allFrames.map(df => {
          val idx = df.columns.indexOf(byKey)
          df.rdd.keyBy(_.get(idx).toString)
        })

      val cogroupRdd: CoGroupedRDD[String] = new CoGroupedRDD[String](allFramesKeyed, partitioner)

      val rowRdd: RDD[Row] =
        cogroupRdd.map(x => {
          val rows: Array[Seq[Row]] = x._2.asInstanceOf[Array[Iterable[Row]]].map(_.toSeq)
          val seq                   = rows.head.head.toSeq ++ rows.tail

          new GenericRowWithSchema(seq.toArray, null).asInstanceOf[Row]
        })

      val schema =
        types.StructType(
          group.schema.fields
            ++ namedSubGroup.map { case (name, df) => StructField(name, ArrayType(df.schema)) }
        )

      group.sparkSession.createDataFrame(rowRdd, schema)
    }

} 
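A hedged sketch of the typed cogroup entry point above; the Customer and Order case classes are hypothetical.

import io.univalence.plumbus.cogroup
import org.apache.spark.sql.{ Dataset, SparkSession }

// Hypothetical record types for illustration.
case class Customer(id: String, name: String)
case class Order(customerId: String, amount: Double)

object CogroupSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("CogroupSketch").getOrCreate()
    import spark.implicits._

    val customers: Dataset[Customer] = spark.createDataset(Seq(Customer("c1", "Ada"), Customer("c2", "Bob")))
    val orders: Dataset[Order]       = spark.createDataset(Seq(Order("c1", 10.0), Order("c1", 2.5)))

    // For each key: the customers and the orders sharing that key, gathered into Seqs.
    val grouped: Dataset[(String, Seq[Customer], Seq[Order])] =
      cogroup(customers, orders)(_.id, _.customerId)

    grouped.show(truncate = false)
    spark.stop()
  }
}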
Example 31
Source File: CassandraStorage.scala    From graphsense-transformation   with MIT License
package at.ac.ait.storage

import com.datastax.spark.connector.rdd.ValidRDDType
import com.datastax.spark.connector.rdd.reader.RowReaderFactory
import com.datastax.spark.connector.writer.{RowWriterFactory}
import java.time.LocalDateTime
import java.time.format.DateTimeFormatter
import org.apache.spark.sql.{Dataset, Encoder, SparkSession}
import scala.reflect.ClassTag

import at.ac.ait.Util._

class CassandraStorage(spark: SparkSession) {

  import spark.implicits._
  import com.datastax.spark.connector._

  def load[T <: Product: ClassTag: RowReaderFactory: ValidRDDType: Encoder](
      keyspace: String,
      tableName: String,
      columns: ColumnRef*
  ) = {
    spark.sparkContext.setJobDescription(s"Loading table ${tableName}")
    val table = spark.sparkContext.cassandraTable[T](keyspace, tableName)
    if (columns.isEmpty)
      table.toDS().as[T]
    else
      table.select(columns: _*).toDS().as[T]
  }

  def store[T <: Product: RowWriterFactory](
      keyspace: String,
      tableName: String,
      df: Dataset[T]
  ) = {

    spark.sparkContext.setJobDescription(s"Writing table ${tableName}")
    val dtf = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss")
    val timestamp = LocalDateTime.now().format(dtf)
    println(s"[$timestamp] Writing table ${tableName}")
    time { df.rdd.saveToCassandra(keyspace, tableName) }
  }
} 
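A hedged usage sketch; the keyspace, table names, Transaction case class, and the Cassandra connection setting are illustrative assumptions.

import com.datastax.spark.connector._
import org.apache.spark.sql.SparkSession
import at.ac.ait.storage.CassandraStorage

// Illustrative record type matching a hypothetical Cassandra table.
case class Transaction(txHash: String, height: Int)

object CassandraStorageSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder()
      .master("local[*]")
      .appName("CassandraStorageSketch")
      .config("spark.cassandra.connection.host", "127.0.0.1")
      .getOrCreate()
    import spark.implicits._

    val storage = new CassandraStorage(spark)

    // Round-trip: read a table into a typed Dataset, then write it to another table.
    val txs = storage.load[Transaction]("my_keyspace", "transaction")
    storage.store("my_keyspace", "transaction_copy", txs)

    spark.stop()
  }
}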
Example 32
Source File: package.scala    From sparksql-scalapb   with Apache License 2.0
package scalapb

import org.apache.spark.sql.{DataFrame, Encoder, SQLContext, SparkSession}

import scala.reflect.ClassTag

package object spark {
  implicit class ProtoSQLContext(val sqlContext: SQLContext) extends AnyVal {
    def protoToDataFrame[T <: GeneratedMessage: Encoder](
        protoRdd: org.apache.spark.rdd.RDD[T]
    ) = {
      ProtoSQL.protoToDataFrame(sqlContext, protoRdd)
    }
  }

  implicit class ProtoRDD[T <: GeneratedMessage](
      val protoRdd: org.apache.spark.rdd.RDD[T]
  ) extends AnyVal {
    def toDataFrame(
        sqlContext: SQLContext
    )(implicit encoder: Encoder[T]): DataFrame = {
      ProtoSQL.protoToDataFrame(sqlContext, protoRdd)
    }

    def toDataFrame(
        sparkSession: SparkSession
    )(implicit encoder: Encoder[T]): DataFrame = {
      ProtoSQL.protoToDataFrame(sparkSession, protoRdd)
    }
  }
} 
Example 33
Source File: Encoders.scala    From magellan   with Apache License 2.0 5 votes vote down vote up
package magellan.encoders

import magellan._
import org.apache.spark.sql.Encoder
import org.apache.spark.sql.catalyst.analysis.GetColumnByOrdinal
import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.types._

import scala.reflect._

object Encoders {

  implicit def encoderForPoint: Encoder[Point] = {
    val sqlType = new PointUDT().sqlType
    ExpressionEncoder[Point](
      schema = sqlType,
      flat = true,
      serializer = Seq(
        MagellanSerializer(
          BoundReference(0, ObjectType(classOf[Point]), nullable = true), sqlType)),
      deserializer =
        MagellanDeserializer(
          GetColumnByOrdinal(0, sqlType), classOf[Point]),
      clsTag = classTag[Point])
  }

  implicit def encoderForPolygon: Encoder[Polygon] = {
    val sqlType = new PolygonUDT().sqlType
    ExpressionEncoder[Polygon](
      schema = sqlType,
      flat = true,
      serializer = Seq(
        MagellanSerializer(
          BoundReference(0, ObjectType(classOf[Polygon]), nullable = true), sqlType)),
      deserializer =
        MagellanDeserializer(
          GetColumnByOrdinal(0, sqlType), classOf[Polygon]),
      clsTag = classTag[Polygon])
  }

  implicit def encoderForPolyLine: Encoder[PolyLine] = {
    val sqlType = new PolyLineUDT().sqlType
    ExpressionEncoder[PolyLine](
      schema = sqlType,
      flat = true,
      serializer = Seq(
        MagellanSerializer(
          BoundReference(0, ObjectType(classOf[PolyLine]), nullable = true), sqlType)),
      deserializer =
        MagellanDeserializer(
          GetColumnByOrdinal(0, sqlType), classOf[PolyLine]),
      clsTag = classTag[PolyLine])
  }

} 
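A short sketch of building a typed Dataset with these encoders in implicit scope, e.g. in spark-shell. It assumes magellan's `Point(x, y)` factory; the coordinates are arbitrary:

import magellan.Point
import magellan.encoders.Encoders._
import org.apache.spark.sql.{Dataset, SparkSession}

val spark = SparkSession.builder().master("local[*]").appName("magellan-encoders").getOrCreate()

// encoderForPoint is picked up implicitly by createDataset
val points: Dataset[Point] = spark.createDataset(Seq(Point(-122.41, 37.77), Point(2.35, 48.86)))
points.show()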
Example 34
Source File: CustomSinkSuite.scala    From spark-structured-streaming-ml   with Apache License 2.0 5 votes vote down vote up
package com.highperformancespark.examples.structuredstreaming

import com.holdenkarau.spark.testing.DataFrameSuiteBase

import scala.collection.mutable.ListBuffer

import org.scalatest.FunSuite

import org.apache.spark._
import org.apache.spark.sql.{Dataset, DataFrame, Encoder, SQLContext}
import org.apache.spark.sql.execution.streaming.MemoryStream

class CustomSinkSuite extends FunSuite with DataFrameSuiteBase {

  test("really simple test of the custom sink") {
    import spark.implicits._
    val input = MemoryStream[String]
    val doubled = input.toDS().map(x => x + " " + x)
    val formatName = ("com.highperformancespark.examples" +
      "structuredstreaming.CustomSinkCollectorProvider")
    val query = doubled.writeStream
      .queryName("testCustomSinkBasic")
      .format(formatName)
      .start()
    val inputData = List("hi", "holden", "bye", "pandas")
    input.addData(inputData)
    assert(query.isActive === true)
    query.processAllAvailable()
    assert(query.exception === None)
    assert(Pandas.results(0) === inputData.map(x => x + " " + x))
  }
}

object Pandas{
  val results = new ListBuffer[Seq[String]]()
}

class CustomSinkCollectorProvider extends ForeachDatasetSinkProvider {
  override def func(df: DataFrame): Unit = {
    val spark = df.sparkSession
    import spark.implicits._
    Pandas.results += df.as[String].rdd.collect()
  }
} 

Example 35
Source File: DatasetGenerator.scala    From spark-testing-base   with Apache License 2.0 5 votes vote down vote up
package com.holdenkarau.spark.testing

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{Dataset, Encoder, SQLContext}
import org.scalacheck.{Arbitrary, Gen}

import scala.reflect.ClassTag

object DatasetGenerator {

  
  def arbitrarySizedDataset[T: ClassTag : Encoder]
    (sqlCtx: SQLContext, minPartitions: Int = 1)
    (generator: Int => Gen[T]): Arbitrary[Dataset[T]] = {

    val rddGen: Gen[RDD[T]] =
      RDDGenerator.genSizedRDD[T](sqlCtx.sparkContext, minPartitions)(generator)
    val datasetGen: Gen[Dataset[T]] =
      rddGen.map(rdd => sqlCtx.createDataset(rdd))

    Arbitrary {
      datasetGen
    }
  }
} 
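A sketch of how arbitrarySizedDataset might drive a ScalaCheck property; the property itself is illustrative, and an SQLContext is assumed to come from the surrounding test fixture:

import com.holdenkarau.spark.testing.DatasetGenerator
import org.apache.spark.sql.{Dataset, SQLContext}
import org.scalacheck.{Arbitrary, Gen, Prop}

def nonNegativeDatasets(sqlCtx: SQLContext): Prop = {
  import sqlCtx.implicits._

  // Generator for Datasets of Ints bounded by ScalaCheck's size parameter
  val arb: Arbitrary[Dataset[Int]] =
    DatasetGenerator.arbitrarySizedDataset[Int](sqlCtx) { size => Gen.choose(0, size) }

  Prop.forAll(arb.arbitrary) { ds => ds.collect().forall(_ >= 0) }
}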
Example 36
Source File: ReduceAggregator.scala    From sparkoscope   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.expressions

import org.apache.spark.sql.Encoder
import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder


private[sql] class ReduceAggregator[T: Encoder](func: (T, T) => T)
  extends Aggregator[T, (Boolean, T), T] {

  private val encoder = implicitly[Encoder[T]]

  override def zero: (Boolean, T) = (false, null.asInstanceOf[T])

  override def bufferEncoder: Encoder[(Boolean, T)] =
    ExpressionEncoder.tuple(
      ExpressionEncoder[Boolean](),
      encoder.asInstanceOf[ExpressionEncoder[T]])

  override def outputEncoder: Encoder[T] = encoder

  override def reduce(b: (Boolean, T), a: T): (Boolean, T) = {
    if (b._1) {
      (true, func(b._2, a))
    } else {
      (true, a)
    }
  }

  override def merge(b1: (Boolean, T), b2: (Boolean, T)): (Boolean, T) = {
    if (!b1._1) {
      b2
    } else if (!b2._1) {
      b1
    } else {
      (true, func(b1._2, b2._2))
    }
  }

  override def finish(reduction: (Boolean, T)): T = {
    if (!reduction._1) {
      throw new IllegalStateException("ReduceAggregator requires at least one input row")
    }
    reduction._2
  }
} 
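ReduceAggregator is not used directly; it is the internal aggregator behind KeyValueGroupedDataset.reduceGroups. A tiny sketch of the user-facing behaviour, assuming an existing SparkSession named `spark`:

import spark.implicits._

val nums = Seq(1, 2, 3, 4, 5).toDS()

// reduceGroups delegates to ReduceAggregator; groups are never empty, so the
// "at least one input row" check in finish() always holds.
nums.groupByKey(_ % 2).reduceGroups(_ + _).show()   // sums per parity group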
Example 37
Source File: Aggregator.scala    From sparkoscope   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.expressions

import org.apache.spark.annotation.{Experimental, InterfaceStability}
import org.apache.spark.sql.{Dataset, Encoder, TypedColumn}
import org.apache.spark.sql.catalyst.encoders.encoderFor
import org.apache.spark.sql.catalyst.expressions.aggregate.{AggregateExpression, Complete}
import org.apache.spark.sql.execution.aggregate.TypedAggregateExpression


@Experimental
@InterfaceStability.Evolving
abstract class Aggregator[-IN, BUF, OUT] extends Serializable {

  def zero: BUF
  def reduce(b: BUF, a: IN): BUF
  def merge(b1: BUF, b2: BUF): BUF
  def finish(reduction: BUF): OUT
  def bufferEncoder: Encoder[BUF]
  def outputEncoder: Encoder[OUT]

  def toColumn: TypedColumn[IN, OUT] = {
    implicit val bEncoder = bufferEncoder
    implicit val cEncoder = outputEncoder

    val expr =
      AggregateExpression(
        TypedAggregateExpression(this),
        Complete,
        isDistinct = false)

    new TypedColumn[IN, OUT](expr, encoderFor[OUT])
  }
} 
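To show how toColumn is typically consumed, here is a small illustrative Aggregator (the case class, names, and data are made up) applied through Dataset.select, e.g. in spark-shell:

import org.apache.spark.sql.{Encoder, Encoders, SparkSession, TypedColumn}
import org.apache.spark.sql.expressions.Aggregator

case class Reading(sensor: String, value: Double)

object MeanValue extends Aggregator[Reading, (Double, Long), Double] {
  def zero: (Double, Long) = (0.0, 0L)
  def reduce(b: (Double, Long), r: Reading): (Double, Long) = (b._1 + r.value, b._2 + 1)
  def merge(b1: (Double, Long), b2: (Double, Long)): (Double, Long) = (b1._1 + b2._1, b1._2 + b2._2)
  def finish(b: (Double, Long)): Double = if (b._2 == 0) 0.0 else b._1 / b._2
  def bufferEncoder: Encoder[(Double, Long)] = Encoders.tuple(Encoders.scalaDouble, Encoders.scalaLong)
  def outputEncoder: Encoder[Double] = Encoders.scalaDouble
}

val spark = SparkSession.builder().master("local[*]").appName("aggregator-sketch").getOrCreate()
import spark.implicits._

val readings = Seq(Reading("a", 1.0), Reading("a", 3.0), Reading("b", 2.0)).toDS()

// toColumn turns the Aggregator into a TypedColumn usable with select/agg
val mean: TypedColumn[Reading, Double] = MeanValue.toColumn
readings.select(mean).show()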
Example 38
Source File: typedaggregators.scala    From sparkoscope   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.execution.aggregate

import org.apache.spark.api.java.function.MapFunction
import org.apache.spark.sql.{Encoder, TypedColumn}
import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
import org.apache.spark.sql.expressions.Aggregator

////////////////////////////////////////////////////////////////////////////////////////////////////
// This file defines internal implementations for aggregators.
////////////////////////////////////////////////////////////////////////////////////////////////////


class TypedSumDouble[IN](val f: IN => Double) extends Aggregator[IN, Double, Double] {
  override def zero: Double = 0.0
  override def reduce(b: Double, a: IN): Double = b + f(a)
  override def merge(b1: Double, b2: Double): Double = b1 + b2
  override def finish(reduction: Double): Double = reduction

  override def bufferEncoder: Encoder[Double] = ExpressionEncoder[Double]()
  override def outputEncoder: Encoder[Double] = ExpressionEncoder[Double]()

  // Java api support
  def this(f: MapFunction[IN, java.lang.Double]) = this(x => f.call(x).asInstanceOf[Double])

  def toColumnJava: TypedColumn[IN, java.lang.Double] = {
    toColumn.asInstanceOf[TypedColumn[IN, java.lang.Double]]
  }
}


class TypedSumLong[IN](val f: IN => Long) extends Aggregator[IN, Long, Long] {
  override def zero: Long = 0L
  override def reduce(b: Long, a: IN): Long = b + f(a)
  override def merge(b1: Long, b2: Long): Long = b1 + b2
  override def finish(reduction: Long): Long = reduction

  override def bufferEncoder: Encoder[Long] = ExpressionEncoder[Long]()
  override def outputEncoder: Encoder[Long] = ExpressionEncoder[Long]()

  // Java api support
  def this(f: MapFunction[IN, java.lang.Long]) = this(x => f.call(x).asInstanceOf[Long])

  def toColumnJava: TypedColumn[IN, java.lang.Long] = {
    toColumn.asInstanceOf[TypedColumn[IN, java.lang.Long]]
  }
}


class TypedCount[IN](val f: IN => Any) extends Aggregator[IN, Long, Long] {
  override def zero: Long = 0
  override def reduce(b: Long, a: IN): Long = {
    if (f(a) == null) b else b + 1
  }
  override def merge(b1: Long, b2: Long): Long = b1 + b2
  override def finish(reduction: Long): Long = reduction

  override def bufferEncoder: Encoder[Long] = ExpressionEncoder[Long]()
  override def outputEncoder: Encoder[Long] = ExpressionEncoder[Long]()

  // Java api support
  def this(f: MapFunction[IN, Object]) = this(x => f.call(x))
  def toColumnJava: TypedColumn[IN, java.lang.Long] = {
    toColumn.asInstanceOf[TypedColumn[IN, java.lang.Long]]
  }
}


class TypedAverage[IN](val f: IN => Double) extends Aggregator[IN, (Double, Long), Double] {
  override def zero: (Double, Long) = (0.0, 0L)
  override def reduce(b: (Double, Long), a: IN): (Double, Long) = (f(a) + b._1, 1 + b._2)
  override def finish(reduction: (Double, Long)): Double = reduction._1 / reduction._2
  override def merge(b1: (Double, Long), b2: (Double, Long)): (Double, Long) = {
    (b1._1 + b2._1, b1._2 + b2._2)
  }

  override def bufferEncoder: Encoder[(Double, Long)] = ExpressionEncoder[(Double, Long)]()
  override def outputEncoder: Encoder[Double] = ExpressionEncoder[Double]()

  // Java api support
  def this(f: MapFunction[IN, java.lang.Double]) = this(x => f.call(x).asInstanceOf[Double])
  def toColumnJava: TypedColumn[IN, java.lang.Double] = {
    toColumn.asInstanceOf[TypedColumn[IN, java.lang.Double]]
  }
}
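These classes are internal to Spark; user code normally reaches them through the org.apache.spark.sql.expressions.scalalang.typed helpers available in Spark 2.x, roughly as sketched below (the case class and data are illustrative):

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.expressions.scalalang.typed

case class Sale(shop: String, amount: Double)

val spark = SparkSession.builder().master("local[*]").appName("typed-agg-sketch").getOrCreate()
import spark.implicits._

val sales = Seq(Sale("a", 10.0), Sale("a", 2.5), Sale("b", 4.0)).toDS()

// typed.sum, typed.avg and typed.count wrap TypedSumDouble, TypedAverage and TypedCount
sales.groupByKey(_.shop)
  .agg(typed.sum[Sale](_.amount), typed.avg[Sale](_.amount), typed.count[Sale](_.shop))
  .show()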