org.apache.spark.sql.types Scala Examples

The following examples show how to use org.apache.spark.sql.types from Scala. Each example is taken from an open-source project; the source file, project name, and license are listed above the code.
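Before the project examples, here is a minimal, self-contained sketch (not taken from any of the projects below) of the package itself: building a schema programmatically with StructType, StructField, and the built-in data types, then applying it to a DataFrame. The object name and the column names are invented for illustration.

import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.sql.types.{ArrayType, IntegerType, StringType, StructField, StructType}

object TypesQuickstart {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("types-quickstart").getOrCreate()

    // Build a schema explicitly instead of relying on inference.
    val schema = StructType(Seq(
      StructField("id", IntegerType, nullable = false),
      StructField("name", StringType),
      StructField("tags", ArrayType(StringType))
    ))

    val rows = Seq(Row(1, "alice", Seq("a", "b")), Row(2, "bob", Seq.empty[String]))
    val df   = spark.createDataFrame(spark.sparkContext.parallelize(rows), schema)
    df.printSchema()
    spark.stop()
  }
}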
Example 1
Source File: cogroup.scala    From spark-tools    with Apache License 2.0
package io.univalence.plumbus

import org.apache.spark.Partitioner
import org.apache.spark.rdd.{ CoGroupedRDD, RDD }
import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema
import org.apache.spark.sql.types.{ ArrayType, StructField }
import org.apache.spark.sql.{ types, DataFrame, Dataset, Encoder, KeyValueGroupedDataset, Row }

import scala.reflect.ClassTag
import scala.util.Try

object cogroup {

  
  implicit class KVGD[K, A](val kvgd: KeyValueGroupedDataset[K, A]) {
    def cogroup[B](right: KeyValueGroupedDataset[K, B]): Dataset[(K, Seq[A], Seq[B])] =
      //Use SparkAddOn ?
      ???
  }

  def apply[A, B, K](left: Dataset[A], right: Dataset[B])(keyLeft: A => K, keyRight: B => K)(
    implicit encA: Encoder[A],
    encB: Encoder[B],
    encC: Encoder[K],
    enc: Encoder[(K, Seq[A], Seq[B])],
    ca: ClassTag[A],
    ck: ClassTag[K],
    cb: ClassTag[B]
  ): Dataset[(K, Seq[A], Seq[B])] =
    left.sparkSession.implicits
      .rddToDatasetHolder(
        RDD
          .rddToPairRDDFunctions(left.rdd.keyBy(keyLeft))
          .cogroup(right.rdd.keyBy(keyRight))
          .map({ case (k, (ia, ib)) => (k, ia.toSeq, ib.toSeq) })
      )
      .toDS

  def cogroupDf(group: DataFrame, namedSubGroup: (String, DataFrame)*)(
    byKey: String,
    partitioner: Partitioner = Partitioner.defaultPartitioner(group.rdd, namedSubGroup.map(_._2.rdd): _*)
  ): Try[DataFrame] =
    Try {
      val subGroup: Seq[DataFrame]  = namedSubGroup.map(_._2)
      val allFrames: Seq[DataFrame] = group +: subGroup
      val allFramesKeyed: Seq[RDD[(String, Row)]] =
        allFrames.map(df => {
          val idx = df.columns.indexOf(byKey)
          df.rdd.keyBy(_.get(idx).toString)
        })

      val cogroupRdd: CoGroupedRDD[String] = new CoGroupedRDD[String](allFramesKeyed, partitioner)

      val rowRdd: RDD[Row] =
        cogroupRdd.map(x => {
          val rows: Array[Seq[Row]] = x._2.asInstanceOf[Array[Iterable[Row]]].map(_.toSeq)
          val seq                   = rows.head.head.toSeq ++ rows.tail

          new GenericRowWithSchema(seq.toArray, null).asInstanceOf[Row]
        })

      val schema =
        types.StructType(
          group.schema.fields
            ++ namedSubGroup.map { case (name, df) => StructField(name, ArrayType(df.schema)) }
        )

      group.sparkSession.createDataFrame(rowRdd, schema)
    }

} 
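One possible way to call the apply method above, assuming the io.univalence.plumbus artifact is on the classpath: the Order and Payment case classes and their sample values are invented for illustration, and the keys are plain strings so the implicit encoders come from spark.implicits.

import io.univalence.plumbus.cogroup
import org.apache.spark.sql.SparkSession

object CogroupUsage {
  final case class Order(customerId: String, amount: Double)
  final case class Payment(customerId: String, paid: Double)

  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("cogroup-usage").getOrCreate()
    import spark.implicits._

    val orders   = Seq(Order("c1", 10.0), Order("c1", 5.0), Order("c2", 7.5)).toDS()
    val payments = Seq(Payment("c1", 15.0)).toDS()

    // Group both datasets by customerId; each output row carries the key and
    // the full sequences of matching orders and payments.
    val grouped = cogroup(orders, payments)(_.customerId, _.customerId)
    grouped.show(truncate = false)
  }
}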
Example 2
Source File: TimeType.scala    From flint    with Apache License 2.0
package com.twosigma.flint.timeseries.time.types

import com.twosigma.flint.FlintConf
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.{ SQLContext, SparkSession, types }

trait TimeType {
  // Convert between the time column's internal representation and nanoseconds since epoch.
  def internalToNanos(value: Long): Long
  def nanosToInternal(nanos: Long): Long
  // Round down to the precision supported by the internal representation.
  def roundDownPrecision(nanosSinceEpoch: Long): Long
}

object TimeType {
  case object LongType extends TimeType {
    override def internalToNanos(value: Long): Long = value
    override def nanosToInternal(nanos: Long): Long = nanos
    override def roundDownPrecision(nanos: Long): Long = nanos
  }

  // Spark sql represents timestamp as microseconds internally
  case object TimestampType extends TimeType {
    override def internalToNanos(value: Long): Long = value * 1000
    override def nanosToInternal(nanos: Long): Long = nanos / 1000
    override def roundDownPrecision(nanos: Long): Long = nanos - nanos % 1000
  }

  def apply(timeType: String): TimeType = {
    timeType match {
      case "long" => LongType
      case "timestamp" => TimestampType
      case _ => throw new IllegalArgumentException(s"Unsupported time type: ${timeType}. " +
        s"Only `long` and `timestamp` are supported.")
    }
  }

  def apply(sqlType: types.DataType): TimeType = {
    sqlType match {
      case types.LongType => LongType
      case types.TimestampType => TimestampType
      case _ => throw new IllegalArgumentException(s"Unsupported time type: ${sqlType}")
    }
  }

  def get(sparkSession: SparkSession): TimeType = {
    TimeType(sparkSession.conf.get(
      FlintConf.TIME_TYPE_CONF, FlintConf.TIME_TYPE_DEFAULT
    ))
  }
} 
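A short usage sketch, assuming the flint library is available: it resolves a TimeType from a Spark SQL data type and exercises the conversions. The nanosecond value is arbitrary.

import com.twosigma.flint.timeseries.time.types.TimeType
import org.apache.spark.sql.types

object TimeTypeUsage {
  def main(args: Array[String]): Unit = {
    // Resolve the TimeType from a Spark SQL data type.
    val longTime      = TimeType(types.LongType)
    val timestampTime = TimeType(types.TimestampType)

    // Spark stores timestamps as microseconds internally, so converting to
    // nanoseconds multiplies by 1000, and converting back divides by 1000.
    val nanos     = 1554076800123456789L
    val internal  = timestampTime.nanosToInternal(nanos)   // microseconds
    val roundTrip = timestampTime.internalToNanos(internal)
    assert(roundTrip == timestampTime.roundDownPrecision(nanos))

    // The long-based representation keeps nanosecond precision unchanged.
    assert(longTime.internalToNanos(nanos) == nanos)
  }
}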
Example 3
Source File: SparkLeapFrame.scala    From mleap    with Apache License 2.0
package ml.combust.mleap.spark

import ml.combust.mleap.core.types.{StructField, StructType}
import ml.combust.mleap.runtime.frame.{FrameBuilder, Row, RowUtil}
import ml.combust.mleap.runtime.function.{Selector, UserDefinedFunction}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql
import org.apache.spark.sql.mleap.TypeConverters
import org.apache.spark.sql.{DataFrame, SQLContext, types}

import scala.util.Try


case class SparkLeapFrame(schema: StructType,
                          dataset: RDD[Row],
                          sqlContext: SQLContext) extends FrameBuilder[SparkLeapFrame] {
  override def withColumn(output: String, inputs: Selector *)
                         (udf: UserDefinedFunction): Try[SparkLeapFrame] = {
    RowUtil.createRowSelectors(schema, inputs: _*)(udf).flatMap {
      rowSelectors =>
        val field = StructField(output, udf.outputTypes.head)

        schema.withField(field).map {
          schema2 =>
            val dataset2 = dataset.map {
              row => row.withValue(rowSelectors: _*)(udf)
            }
            copy(schema = schema2, dataset = dataset2)
        }
    }
  }

  override def withColumns(outputs: Seq[String], inputs: Selector*)
                          (udf: UserDefinedFunction): Try[SparkLeapFrame] = {
    RowUtil.createRowSelectors(schema, inputs: _*)(udf).flatMap {
      rowSelectors =>
        val fields = outputs.zip(udf.outputTypes).map {
          case (name, dt) => StructField(name, dt)
        }

        schema.withFields(fields).map {
          schema2 =>
            val dataset2 = dataset.map {
              row => row.withValues(rowSelectors: _*)(udf)
            }
            copy(schema = schema2, dataset = dataset2)
        }
    }
  }

  override def select(fieldNames: String *): Try[SparkLeapFrame] = {
    for(indices <- schema.indicesOf(fieldNames: _*);
      schema2 <- schema.selectIndices(indices: _*)) yield {
      val dataset2 = dataset.map(row => row.selectIndices(indices: _*))

      copy(schema = schema2, dataset = dataset2)
    }
  }

  override def drop(names: String*): Try[SparkLeapFrame] = {
    for(indices <- schema.indicesOf(names: _*);
        schema2 <- schema.dropIndices(indices: _*)) yield {
      val dataset2 = dataset.map(row => row.dropIndices(indices: _*))

      copy(schema = schema2, dataset = dataset2)
    }
  }

  override def filter(selectors: Selector*)
                     (udf: UserDefinedFunction): Try[SparkLeapFrame] = {
    RowUtil.createRowSelectors(schema, selectors: _*)(udf).map {
      rowSelectors =>
        val dataset2 = dataset.filter(row => row.shouldFilter(rowSelectors: _*)(udf))
        copy(schema = schema, dataset = dataset2)
    }
  }

  def toSpark: DataFrame = {
    val spec = schema.fields.map(TypeConverters.mleapToSparkConverter)
    val fields = spec.map(_._1)
    val converters = spec.map(_._2)
    val sparkSchema = new types.StructType(fields.toArray)
    val data = dataset.map {
      r =>
        val values = r.zip(converters).map {
          case (v, c) => c(v)
        }
        sql.Row(values.toSeq: _*)
    }

    sqlContext.createDataFrame(data, sparkSchema)
  }
} 
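The toSpark method above boils down to one org.apache.spark.sql.types pattern: assemble a types.StructType from plain StructFields, pair it with an RDD[Row] of matching values, and call createDataFrame. A standalone sketch of that pattern, with invented field names and values, might look like this:

import org.apache.spark.sql.{Row, SparkSession, types}

object RowRddToDataFrame {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("rdd-to-df").getOrCreate()

    // Same construction as SparkLeapFrame.toSpark: a schema built field by field.
    val sparkSchema = new types.StructType(Array(
      types.StructField("feature", types.DoubleType),
      types.StructField("prediction", types.DoubleType)
    ))

    val data = spark.sparkContext.parallelize(Seq(Row(1.0, 0.0), Row(2.5, 1.0)))
    val df   = spark.createDataFrame(data, sparkSchema)
    df.show()
    spark.stop()
  }
}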
Example 4
Source File: LeapFrameToSpark.scala    From mleap    with Apache License 2.0
package org.apache.spark.ml.mleap.converter

import com.truecar.mleap.core.linalg.Vector
import com.truecar.mleap.runtime.types.StructType
import com.truecar.mleap.spark.{SparkLeapFrame, MleapSparkSupport}
import org.apache.spark.sql.{types, Row, DataFrame, SQLContext}
import MleapSparkSupport._


trait LeapFrameToSpark[T] {
  def toSpark(t: T)(implicit sqlContext: SQLContext): DataFrame
}

case class LeapFrameToSparkWrapper[T: LeapFrameToSpark](t: T) {
  def toSpark(implicit sqlContext: SQLContext): DataFrame = {
    implicitly[LeapFrameToSpark[T]].toSpark(t)
  }
}

object LeapFrameToSpark {
  implicit object SparkLeapFrameToSpark extends LeapFrameToSpark[SparkLeapFrame] {
    override def toSpark(t: SparkLeapFrame)
                        (implicit sqlContext: SQLContext): DataFrame = {
      val outputNames = t.schema.fields.map(_.name).toSet -- t.sparkSchema.fields.map(_.name).toSet
      val outputs = outputNames.map {
        name => (t.schema(name), t.schema.indexOf(name))
      }.toArray.sortBy(_._2)
      val (outputFields, outputIndices) = outputs.unzip
      val outputMleapSchema = StructTypeToSpark(StructType(outputFields)).toSpark
      val outputSchema = types.StructType(t.sparkSchema.fields ++ outputMleapSchema.fields)

      val rows = t.dataset.rdd.map {
        case (mleapRow, sparkValues) =>
          val mleapData = outputIndices.map {
            index =>
              mleapRow.get(index) match {
                case value: Vector => value.toSpark
                case value => value
              }
          }

          Row(sparkValues ++ mleapData: _*)
      }

      sqlContext.createDataFrame(rows, outputSchema)
    }
  }
}
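The step in this converter that touches org.apache.spark.sql.types is the schema merge: the existing Spark-side fields and the converted mleap output fields are concatenated into a single types.StructType. A minimal standalone sketch of that merge, with invented field names, follows.

import org.apache.spark.sql.types

object SchemaMerge {
  def main(args: Array[String]): Unit = {
    val sparkSide = types.StructType(Seq(
      types.StructField("id", types.LongType, nullable = false),
      types.StructField("text", types.StringType)
    ))
    val mleapOutputs = types.StructType(Seq(
      types.StructField("prediction", types.DoubleType)
    ))

    // Same construction as the converter above: append the converted output
    // fields to the existing Spark schema to form the final output schema.
    val outputSchema = types.StructType(sparkSide.fields ++ mleapOutputs.fields)
    outputSchema.printTreeString()
  }
}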