org.apache.spark.sql.catalyst.util.DateTimeUtils Scala Examples

The following examples show how to use org.apache.spark.sql.catalyst.util.DateTimeUtils. Each example comes from an open source project; the source file, originating project, and license are noted in the heading above it.
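Before the project examples, here is a minimal self-contained sketch of the round-trip conversions most of them rely on. It assumes a Spark 2.x-era spark-catalyst dependency on the classpath; DateTimeUtils is an internal API, so names and signatures can shift between Spark releases.

import java.sql.{Date, Timestamp}

import org.apache.spark.sql.catalyst.util.DateTimeUtils

object DateTimeUtilsTour {
  def main(args: Array[String]): Unit = {
    // DateType values are stored as an Int: days since the Unix epoch.
    val days: Int = DateTimeUtils.fromJavaDate(Date.valueOf("2019-06-01"))
    assert(DateTimeUtils.toJavaDate(days) == Date.valueOf("2019-06-01"))

    // TimestampType values are stored as a Long: microseconds since the epoch.
    val micros: Long =
      DateTimeUtils.fromJavaTimestamp(Timestamp.valueOf("2019-06-01 12:34:56.789"))
    assert(DateTimeUtils.toJavaTimestamp(micros) ==
      Timestamp.valueOf("2019-06-01 12:34:56.789"))
  }
}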
Example 1
Source File: KustoCsvSerializationUtils.scala    From azure-kusto-spark   with Apache License 2.0
package com.microsoft.kusto.spark.datasink

import java.util.TimeZone

import com.microsoft.kusto.spark.utils.DataTypeMapping
import org.apache.commons.lang3.time.FastDateFormat
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.util.DateTimeUtils
import org.apache.spark.sql.types.DataTypes._
import org.apache.spark.sql.types.StructType

private[kusto] class KustoCsvSerializationUtils (val schema: StructType, timeZone: String){
  private[kusto] val dateFormat = FastDateFormat.getInstance("yyyy-MM-dd'T'HH:mm:ss.SSSXXX", TimeZone.getTimeZone(timeZone))

  private[kusto] def convertRow(row: InternalRow) = {
    val values = new Array[String](row.numFields)
    for (i <- 0 until row.numFields if !row.isNullAt(i))
    {
      val dataType = schema.fields(i).dataType
      values(i) = dataType match {
          case DateType => DateTimeUtils.toJavaDate(row.getInt(i)).toString
          case TimestampType => dateFormat.format(DateTimeUtils.toJavaTimestamp(row.getLong(i)))
          case _ => row.get(i, dataType).toString
        }
    }

    values
  }
}

private[kusto] object KustoCsvMapper {
    import org.apache.spark.sql.types.StructType
    import org.json

    def createCsvMapping(schema: StructType): String = {
      val csvMapping = new json.JSONArray()

      for (i <- 0 until schema.length)
      {
        val field = schema.apply(i)
        val dataType = field.dataType
        val mapping = new json.JSONObject()
        mapping.put("Name", field.name)
        mapping.put("Ordinal", i)
        mapping.put("DataType", DataTypeMapping.sparkTypeToKustoTypeMap.getOrElse(dataType, StringType))

        csvMapping.put(mapping)
      }

      csvMapping.toString
    }
  } 
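A hedged usage sketch of the serializer above. The names are taken from the example, but the members are private[kusto], so this is declared in the same package to compile; the printed strings assume the JVM default time zone is UTC.

package com.microsoft.kusto.spark.datasink

import java.sql.{Date, Timestamp}

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.util.DateTimeUtils
import org.apache.spark.sql.types._

object ConvertRowSketch {
  def main(args: Array[String]): Unit = {
    val schema = StructType(Seq(
      StructField("d", DateType), StructField("ts", TimestampType)))
    val utils = new KustoCsvSerializationUtils(schema, "UTC")

    // Dates travel as Int day counts, timestamps as Long microsecond counts.
    val row = InternalRow(
      DateTimeUtils.fromJavaDate(Date.valueOf("2020-01-02")),
      DateTimeUtils.fromJavaTimestamp(Timestamp.valueOf("2020-01-02 03:04:05.678")))

    // e.g. "2020-01-02, 2020-01-02T03:04:05.678Z" with a UTC default zone.
    println(utils.convertRow(row).mkString(", "))
  }
}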
Example 2
Source File: ComputeCurrentTimeSuite.scala    From multi-tenancy-spark   with Apache License 2.0
package org.apache.spark.sql.catalyst.optimizer

import org.apache.spark.sql.catalyst.dsl.plans._
import org.apache.spark.sql.catalyst.expressions.{Alias, CurrentDate, CurrentTimestamp, Literal}
import org.apache.spark.sql.catalyst.plans.PlanTest
import org.apache.spark.sql.catalyst.plans.logical.{LocalRelation, LogicalPlan, Project}
import org.apache.spark.sql.catalyst.rules.RuleExecutor
import org.apache.spark.sql.catalyst.util.DateTimeUtils

class ComputeCurrentTimeSuite extends PlanTest {
  object Optimize extends RuleExecutor[LogicalPlan] {
    val batches = Seq(Batch("ComputeCurrentTime", Once, ComputeCurrentTime))
  }

  test("analyzer should replace current_timestamp with literals") {
    val in = Project(Seq(Alias(CurrentTimestamp(), "a")(), Alias(CurrentTimestamp(), "b")()),
      LocalRelation())

    val min = System.currentTimeMillis() * 1000
    val plan = Optimize.execute(in.analyze).asInstanceOf[Project]
    val max = (System.currentTimeMillis() + 1) * 1000

    val lits = new scala.collection.mutable.ArrayBuffer[Long]
    plan.transformAllExpressions { case e: Literal =>
      lits += e.value.asInstanceOf[Long]
      e
    }
    assert(lits.size == 2)
    assert(lits(0) >= min && lits(0) <= max)
    assert(lits(1) >= min && lits(1) <= max)
    assert(lits(0) == lits(1))
  }

  test("analyzer should replace current_date with literals") {
    val in = Project(Seq(Alias(CurrentDate(), "a")(), Alias(CurrentDate(), "b")()), LocalRelation())

    val min = DateTimeUtils.millisToDays(System.currentTimeMillis())
    val plan = Optimize.execute(in.analyze).asInstanceOf[Project]
    val max = DateTimeUtils.millisToDays(System.currentTimeMillis())

    val lits = new scala.collection.mutable.ArrayBuffer[Int]
    plan.transformAllExpressions { case e: Literal =>
      lits += e.value.asInstanceOf[Int]
      e
    }
    assert(lits.size == 2)
    assert(lits(0) >= min && lits(0) <= max)
    assert(lits(1) >= min && lits(1) <= max)
    assert(lits(0) == lits(1))
  }
} 
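The bounds in this suite reflect catalyst's internal encodings: a CurrentTimestamp literal is a Long of microseconds since the epoch, hence the `* 1000` on System.currentTimeMillis(), while a CurrentDate literal is an Int of days, which millisToDays computes in the default time zone. The final equality assertion is the real point of the test: ComputeCurrentTime must read the clock once per query, so both aliases see the same instant.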
Example 3
Source File: TimeColumnBuffer.scala    From spark-vector   with Apache License 2.0
package com.actian.spark_vector.colbuffer.time

import java.nio.ByteBuffer
import java.sql.Timestamp
import java.util.{ Calendar, TimeZone }

import org.apache.spark.sql.catalyst.util.DateTimeUtils

import com.actian.spark_vector.ComposePartial
import com.actian.spark_vector.colbuffer._
import com.actian.spark_vector.colbuffer.util._
import com.actian.spark_vector.vector.VectorDataType

private case class TimeColumnBufferParams(cbParams: ColumnBufferBuildParams,
  converter: TimeConversion.TimeConverter,
  adjustToUTC: Boolean = false)

private[colbuffer] abstract class TimeColumnBuffer(p: TimeColumnBufferParams, valueWidth: Int)
    extends ColumnBuffer[Timestamp, Long](p.cbParams.name, p.cbParams.maxValueCount, valueWidth, valueWidth, p.cbParams.nullable) {
  private val ts = new Timestamp(System.currentTimeMillis())
  private val cal = Calendar.getInstance

  override def put(source: Timestamp, buffer: ByteBuffer): Unit = {
    if (p.adjustToUTC) {
      TimeConversion.convertLocalTimestampToUTC(source, cal)
    }
    val convertedSource = p.converter.convert(TimeConversion.normalizeTime(source), p.cbParams.scale)
    putConverted(convertedSource, buffer)
  }

  protected def putConverted(converted: Long, buffer: ByteBuffer): Unit

  override def get(buffer: ByteBuffer): Long = {
    val deconvertedSource = p.converter.deconvert(getConverted(buffer), p.cbParams.scale)
    ts.setTime(TimeConversion.scaleNanos(deconvertedSource, MillisecondsScale))
    ts.setNanos((deconvertedSource % PowersOfTen(NanosecondsScale)).toInt)
    if (p.adjustToUTC) {
      TimeConversion.convertUTCToLocalTimestamp(ts, cal)
    }
    DateTimeUtils.fromJavaTimestamp(ts)
  }

  protected def getConverted(buffer: ByteBuffer): Long
}

private class TimeIntColumnBuffer(p: TimeColumnBufferParams) extends TimeColumnBuffer(p, IntSize) {
  override protected def putConverted(converted: Long, buffer: ByteBuffer): Unit = buffer.putInt(converted.toInt)

  override protected def getConverted(buffer: ByteBuffer): Long = buffer.getInt()
}

private class TimeLongColumnBuffer(p: TimeColumnBufferParams) extends TimeColumnBuffer(p, LongSize) {
  override protected def putConverted(converted: Long, buffer: ByteBuffer): Unit = buffer.putLong(converted)

  override protected def getConverted(buffer: ByteBuffer): Long = buffer.getLong()
}

private class TimeNZLZConverter extends TimeConversion.TimeConverter {
  override def convert(unscaledNanos: Long, scale: Int): Long = TimeConversion.scaleNanos(unscaledNanos, scale)

  override def deconvert(scaledNanos: Long, scale: Int): Long = TimeConversion.unscaleNanos(scaledNanos, scale)
}

private class TimeTZConverter extends TimeConversion.TimeConverter {
  override def convert(unscaledNanos: Long, scale: Int): Long =
    (TimeConversion.scaleNanos(unscaledNanos, scale) << TimeMaskSize)

  override def deconvert(scaledNanos: Long, scale: Int): Long =
    TimeConversion.unscaleNanos(scaledNanos >> TimeMaskSize, scale)
}


private[colbuffer] object TimeColumnBuffer extends ColumnBufferBuilder {
  private final val (nzlzIntScaleBounds, nzlzLongScaleBounds) = ((0, 4), (5, 9))
  private final val (tzIntScaleBounds, tzLongScaleBounds) = ((0, 1), (2, 9))
  private val calIsNotUTC = Calendar.getInstance.getTimeZone != TimeZone.getTimeZone("UTC")

  private val buildNZPartial: PartialFunction[ColumnBufferBuildParams, TimeColumnBufferParams] =
    ofDataType(VectorDataType.TimeType) andThen { TimeColumnBufferParams(_, new TimeNZLZConverter(), calIsNotUTC) }

  private val buildLZPartial: PartialFunction[ColumnBufferBuildParams, TimeColumnBufferParams] =
    ofDataType(VectorDataType.TimeLTZType) andThen { TimeColumnBufferParams(_, new TimeNZLZConverter()) }

  private val buildNZLZ: PartialFunction[ColumnBufferBuildParams, ColumnBuffer[_, _]] = (buildNZPartial orElse buildLZPartial) andThenPartial {
    case nzlz if isInBounds(nzlz.cbParams.scale, nzlzIntScaleBounds) => new TimeIntColumnBuffer(nzlz)
    case nzlz if isInBounds(nzlz.cbParams.scale, nzlzLongScaleBounds) => new TimeLongColumnBuffer(nzlz)
  }

  private val buildTZPartial: PartialFunction[ColumnBufferBuildParams, TimeColumnBufferParams] =
    ofDataType(VectorDataType.TimeTZType) andThen { TimeColumnBufferParams(_, new TimeTZConverter()) }

  private val buildTZ: PartialFunction[ColumnBufferBuildParams, ColumnBuffer[_, _]] = buildTZPartial andThenPartial {
    case tz if isInBounds(tz.cbParams.scale, tzIntScaleBounds) => new TimeIntColumnBuffer(tz)
    case tz if isInBounds(tz.cbParams.scale, tzLongScaleBounds) => new TimeLongColumnBuffer(tz)
  }

  override private[colbuffer] val build: PartialFunction[ColumnBufferBuildParams, ColumnBuffer[_, _]] = buildNZLZ orElse buildTZ
} 
Example 4
Source File: Neo4jUtils.scala    From neo4j-spark-connector   with Apache License 2.0
package org.neo4j.spark.utils
import java.sql.Timestamp
import java.time._
import java.util.concurrent.Callable
import java.util.function

import io.github.resilience4j.retry.{Retry, RetryConfig}
import org.apache.spark.sql.catalyst.util.DateTimeUtils
import org.neo4j.driver.exceptions.{ServiceUnavailableException, SessionExpiredException, TransientException}
import org.neo4j.driver.{Driver, Result, Session, Transaction}
import org.neo4j.spark.Neo4jConfig
import org.slf4j.LoggerFactory

class Neo4jUtils

object Neo4jUtils {

  private val logger = LoggerFactory.getLogger(classOf[Neo4jUtils])

  def close(driver: Driver, session: Session): Unit = {
    try {
      if (session != null && session.isOpen) {
        closeSafety(session)
      }
    } finally {
      if (driver != null) {
        closeSafety(driver)
      }
    }
  }

  private def closeSafety(closable: AutoCloseable): Unit = {
    try {
      closable.close()
    } catch {
      case e: Throwable => {
        logger.error("Exception while trying to close an AutoCloseable, because of the following exception", e)
      }
    }
  }

  private val retryConfig = RetryConfig.custom.retryExceptions(
      classOf[SessionExpiredException], classOf[ServiceUnavailableException] // retry on the same exceptions the driver does [1]
    )
    .retryOnException(new function.Predicate[Throwable] {
      override def test(exception: Throwable): Boolean = exception match {
        case t: TransientException => {
          val code = t.code()
          !("Neo.TransientError.Transaction.Terminated" == code) && !("Neo.TransientError.Transaction.LockClientStopped" == code)
        }
        case _ => false
      }
    })
    .maxAttempts(3)
    .build

  def executeTxWithRetries[T](neo4jConfig: Neo4jConfig,
                              query: String,
                              params: java.util.Map[String, AnyRef],
                              write: Boolean): (Driver, Session, Transaction, Result) = {
    val driver: Driver = neo4jConfig.driver()
    val session: Session = driver.session(neo4jConfig.sessionConfig(write))
    Retry.decorateCallable(
        Retry.of("neo4jTransactionRetryPool", retryConfig),
        new Callable[(Driver, Session, Transaction, Result)] {
          override def call(): (Driver, Session, Transaction, Result) = {
            val transaction = session.beginTransaction()
            val result = transaction.run(query, params)
            (driver, session, transaction, result)
          }
        }
      )
      .call()
  }

  def convert(value: AnyRef): AnyRef = value match {
    case m: ZonedDateTime => new Timestamp(DateTimeUtils.fromUTCTime(m.toInstant.toEpochMilli, m.getZone.getId))
    case m: LocalDateTime => new Timestamp(DateTimeUtils.fromUTCTime(m.toInstant(ZoneOffset.UTC).toEpochMilli, "UTC"))
    case m: LocalDate => java.sql.Date.valueOf(m)
    case m: OffsetTime => new Timestamp(m.atDate(LocalDate.ofEpochDay(0)).toInstant.toEpochMilli)
    case _ => value
  }

} 
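A hedged usage sketch of the convert helper above: the Neo4j driver's java.time values are normalized to java.sql types that Spark SQL can encode.

import java.time.{LocalDate, ZonedDateTime}

import org.neo4j.spark.utils.Neo4jUtils

object Neo4jConvertSketch {
  def main(args: Array[String]): Unit = {
    // LocalDate maps directly to java.sql.Date.
    println(Neo4jUtils.convert(LocalDate.of(2020, 1, 2))) // 2020-01-02
    // Zoned values come back as java.sql.Timestamp via DateTimeUtils.fromUTCTime.
    println(Neo4jUtils.convert(ZonedDateTime.now()).getClass.getSimpleName) // Timestamp
  }
}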
Example 5
Source File: Executor.scala    From neo4j-spark-connector   with Apache License 2.0
package org.neo4j.spark

import java.time.{LocalDate, LocalDateTime, OffsetTime, ZoneOffset, ZonedDateTime}
import java.util
import java.sql.Timestamp

import org.apache.spark.SparkContext
import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema
import org.apache.spark.sql.catalyst.util.DateTimeUtils
import org.apache.spark.sql.types.StructType
import org.neo4j.spark.dataframe.CypherTypes
import org.neo4j.spark.utils.{Neo4jSessionAwareIterator, Neo4jUtils}

import scala.collection.JavaConverters._


object Executor {

  def convert(value: AnyRef): Any = value match {
    case it: util.Collection[_] => it.toArray()
    case m: java.util.Map[_,_] => m.asScala
    case _ => Neo4jUtils.convert(value)
  }

  def toJava(parameters: Map[String, Any]): java.util.Map[String, Object] = {
    parameters.mapValues(toJava).asJava
  }

  private def toJava(x: Any): AnyRef = x match {
    case y: Seq[_] => y.asJava
    case _ => x.asInstanceOf[AnyRef]
  }

  val EMPTY = Array.empty[Any]

  val EMPTY_RESULT = new CypherResult(new StructType(), Iterator.empty)

  class CypherResult(val schema: StructType, val rows: Iterator[Array[Any]]) {
    def sparkRows: Iterator[Row] = rows.map(row => new GenericRowWithSchema(row, schema))

    def fields = schema.fieldNames
  }

  def execute(sc: SparkContext, query: String, parameters: Map[String, AnyRef]): CypherResult = {
    execute(Neo4jConfig(sc.getConf), query, parameters)
  }

  // Counts the remaining records; each iteration must consume one element,
  // otherwise hasNext never changes and the loop spins forever.
  private def rows(result: Iterator[_]) = {
    var i = 0
    while (result.hasNext) {
      result.next()
      i = i + 1
    }
    i
  }

  def execute(config: Neo4jConfig, query: String, parameters: Map[String, Any], write: Boolean = false): CypherResult = {
    val result = new Neo4jSessionAwareIterator(config, query, toJava(parameters), write)
    if (!result.hasNext) {
      return EMPTY_RESULT
    }
    val peek = result.peek()
    val keyCount = peek.size()
    if (keyCount == 0) {
      return new CypherResult(new StructType(), Array.fill[Array[Any]](rows(result))(EMPTY).toIterator)
    }
    val keys = peek.keys().asScala
    val fields = keys.map(k => (k, peek.get(k).`type`())).map(keyType => CypherTypes.field(keyType))
    val schema = StructType(fields)
    val it = result.map(record => {
      val row = new Array[Any](keyCount)
      var i = 0
      while (i < keyCount) {
        val value = convert(record.get(i).asObject())
        row.update(i, value)
        i = i + 1
      }
      row
    })
    new CypherResult(schema, it)
  }
} 
Example 6
Source File: DataFramePrettyPrinter.scala    From lighthouse   with Apache License 2.0
package be.dataminded.lighthouse.testing

import java.sql.Date

import org.apache.commons.lang3.StringUtils
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.catalyst.util.DateTimeUtils

private[testing] object DataFramePrettyPrinter {

  def prettyPrintDataFrame(df: DataFrame, number: Int, truncate: Int = 20): String = {
    val numRows     = number.max(0)
    val takeResult  = df.take(numRows + 1)
    val hasMoreData = takeResult.length > numRows
    val data        = takeResult.take(numRows)

    val header = df.schema.fieldNames.toSeq

    def asReadableRows = {
      data.map { row =>
        row.toSeq.map { cell =>
          val str = cell match {
            case null                => "null"
            case binary: Array[Byte] => binary.map("%02X".format(_)).mkString("[", " ", "]")
            case array: Array[_]     => array.mkString("[", ", ", "]")
            case seq: Seq[_]         => seq.mkString("[", ", ", "]")
            case d: Date =>
              DateTimeUtils.dateToString(DateTimeUtils.fromJavaDate(d))
            case _ => cell.toString
          }
          if (truncate > 0 && str.length > truncate) {
            // do not show ellipses for strings shorter than 4 characters.
            if (truncate < 4) str.substring(0, truncate)
            else str.substring(0, truncate - 3) + "..."
          } else {
            str
          }
        }: Seq[String]
      }
    }

    // For array values, replace Seq and Array with square brackets
    // For cells that are beyond `truncate` characters, replace it with the
    // first `truncate-3` and "..."
    val rows: Seq[Seq[String]] = header +: asReadableRows

    val sb = new StringBuilder

    // Initialise the width of each column to a minimum value of '3'
    val colWidths = Array.fill(header.length)(3)

    // Compute the width of each column
    for (row <- rows) {
      for ((cell, i) <- row.zipWithIndex) {
        colWidths(i) = math.max(colWidths(i), cell.length)
      }
    }

    // Create SeparateLine
    val sep: String = colWidths.map("-" * _).addString(sb, "+", "+", "+\n").toString()

    // column names
    rows.head.zipWithIndex
      .map {
        case (cell, i) =>
          if (truncate > 0) {
            StringUtils.leftPad(cell, colWidths(i))
          } else {
            StringUtils.rightPad(cell, colWidths(i))
          }
      }
      .addString(sb, "|", "|", "|\n")

    sb.append(sep)

    // data
    rows.tail.map {
      _.zipWithIndex
        .map {
          case (cell, i) =>
            if (truncate > 0) {
              StringUtils.leftPad(cell.toString, colWidths(i))
            } else {
              StringUtils.rightPad(cell.toString, colWidths(i))
            }
        }
        .addString(sb, "|", "|", "|\n")
    }

    sb.append(sep)

    // For Data that has more than "numRows" records
    if (hasMoreData) {
      val rowsString = if (numRows == 1) "row" else "rows"
      sb.append(s"only showing top $numRows $rowsString\n")
    }

    sb.toString()
  }
} 
Example 7
Source File: JacksonGenerator.scala    From BigDatalog   with Apache License 2.0
package org.apache.spark.sql.execution.datasources.json

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.util.{MapData, ArrayData, DateTimeUtils}

import scala.collection.Map

import com.fasterxml.jackson.core._

import org.apache.spark.sql.Row
import org.apache.spark.sql.types._

private[sql] object JacksonGenerator {
  
  def apply(rowSchema: StructType, gen: JsonGenerator)(row: InternalRow): Unit = {
    def valWriter: (DataType, Any) => Unit = {
      case (_, null) | (NullType, _) => gen.writeNull()
      case (StringType, v) => gen.writeString(v.toString)
      case (TimestampType, v: Long) => gen.writeString(DateTimeUtils.toJavaTimestamp(v).toString)
      case (IntegerType, v: Int) => gen.writeNumber(v)
      case (ShortType, v: Short) => gen.writeNumber(v)
      case (FloatType, v: Float) => gen.writeNumber(v)
      case (DoubleType, v: Double) => gen.writeNumber(v)
      case (LongType, v: Long) => gen.writeNumber(v)
      case (DecimalType(), v: Decimal) => gen.writeNumber(v.toJavaBigDecimal)
      case (ByteType, v: Byte) => gen.writeNumber(v.toInt)
      case (BinaryType, v: Array[Byte]) => gen.writeBinary(v)
      case (BooleanType, v: Boolean) => gen.writeBoolean(v)
      case (DateType, v: Int) => gen.writeString(DateTimeUtils.toJavaDate(v).toString)
      // For UDT values, they should be in the SQL type's corresponding value type.
      // We should not see values in the user-defined class at here.
      // For example, VectorUDT's SQL type is an array of double. So, we should expect that v is
      // an ArrayData at here, instead of a Vector.
      case (udt: UserDefinedType[_], v) => valWriter(udt.sqlType, v)

      case (ArrayType(ty, _), v: ArrayData) =>
        gen.writeStartArray()
        v.foreach(ty, (_, value) => valWriter(ty, value))
        gen.writeEndArray()

      case (MapType(kt, vt, _), v: MapData) =>
        gen.writeStartObject()
        v.foreach(kt, vt, { (k, v) =>
          gen.writeFieldName(k.toString)
          valWriter(vt, v)
        })
        gen.writeEndObject()

      case (StructType(ty), v: InternalRow) =>
        gen.writeStartObject()
        var i = 0
        while (i < ty.length) {
          val field = ty(i)
          val value = v.get(i, field.dataType)
          if (value != null) {
            gen.writeFieldName(field.name)
            valWriter(field.dataType, value)
          }
          i += 1
        }
        gen.writeEndObject()

      case (dt, v) =>
        sys.error(
          s"Failed to convert value $v (class of ${v.getClass}}) with the type of $dt to JSON.")
    }

    valWriter(rowSchema, row)
  }
} 
Example 8
Source File: KinesisRecordToUnsafeRowConverter.scala    From kinesis-sql   with Apache License 2.0
package org.apache.spark.sql.kinesis

import com.amazonaws.services.kinesis.model.Record

import org.apache.spark.sql.catalyst.expressions.UnsafeRow
import org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter
import org.apache.spark.sql.catalyst.util.DateTimeUtils
import org.apache.spark.unsafe.types.UTF8String

private[kinesis] class KinesisRecordToUnsafeRowConverter {
  private val rowWriter = new UnsafeRowWriter(5)

  def toUnsafeRow(record: Record, streamName: String): UnsafeRow = {
    rowWriter.reset()
    rowWriter.write(0, record.getData.array())
    rowWriter.write(1, UTF8String.fromString(streamName))
    rowWriter.write(2, UTF8String.fromString(record.getPartitionKey))
    rowWriter.write(3, UTF8String.fromString(record.getSequenceNumber))
    rowWriter.write(4, DateTimeUtils.fromJavaTimestamp(
      new java.sql.Timestamp(record.getApproximateArrivalTimestamp.getTime)))
    rowWriter.getRow
  }
} 
Example 9
Source File: UnivocityGenerator.scala    From mimir   with Apache License 2.0
package org.apache.spark.sql.execution.datasources.ubodin.csv

import java.io.Writer

import com.univocity.parsers.csv.CsvWriter

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.util.DateTimeUtils
import org.apache.spark.sql.types._

private[csv] class UnivocityGenerator(
    schema: StructType,
    writer: Writer,
    options: CSVOptions) {
  private val writerSettings = options.asWriterSettings
  writerSettings.setHeaders(schema.fieldNames: _*)
  private val gen = new CsvWriter(writer, writerSettings)
  private var printHeader = options.headerFlag

  // A `ValueConverter` is responsible for converting a value of an `InternalRow` to `String`.
  // When the value is null, this converter should not be called.
  private type ValueConverter = (InternalRow, Int) => String

  // `ValueConverter`s for all values in the fields of the schema
  private val valueConverters: Array[ValueConverter] =
    schema.map(_.dataType).map(makeConverter).toArray

  private def makeConverter(dataType: DataType): ValueConverter = dataType match {
    case DateType =>
      (row: InternalRow, ordinal: Int) =>
        options.dateFormat.format(DateTimeUtils.toJavaDate(row.getInt(ordinal)))

    case TimestampType =>
      (row: InternalRow, ordinal: Int) =>
        options.timestampFormat.format(DateTimeUtils.toJavaTimestamp(row.getLong(ordinal)))

    case udt: UserDefinedType[_] => makeConverter(udt.sqlType)

    case dt: DataType =>
      (row: InternalRow, ordinal: Int) =>
        row.get(ordinal, dt).toString
  }

  private def convertRow(row: InternalRow): Seq[String] = {
    var i = 0
    val values = new Array[String](row.numFields)
    while (i < row.numFields) {
      if (!row.isNullAt(i)) {
        values(i) = valueConverters(i).apply(row, i)
      } else {
        values(i) = options.nullValue
      }
      i += 1
    }
    values
  }

  
  // Writes one InternalRow as a single CSV record, emitting the header
  // row first when the options request one.
  def write(row: InternalRow): Unit = {
    if (printHeader) {
      gen.writeHeaders()
    }
    gen.writeRow(convertRow(row): _*)
    printHeader = false
  }

  def close(): Unit = gen.close()

  def flush(): Unit = gen.flush()
} 
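A standalone sketch of the converter-per-field pattern used above, under the same Spark 2.x internals: each column gets a precomputed function up front, so the per-row loop does no type dispatch.

import java.sql.Date

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.util.DateTimeUtils
import org.apache.spark.sql.types._
import org.apache.spark.unsafe.types.UTF8String

object ConverterPerFieldSketch {
  type ValueConverter = (InternalRow, Int) => String

  def makeConverter(dataType: DataType): ValueConverter = dataType match {
    case DateType =>
      (row, i) => DateTimeUtils.toJavaDate(row.getInt(i)).toString
    case dt =>
      (row, i) => row.get(i, dt).toString
  }

  def main(args: Array[String]): Unit = {
    val schema = StructType(Seq(
      StructField("d", DateType), StructField("name", StringType)))
    val converters = schema.map(_.dataType).map(makeConverter).toArray

    val row = InternalRow(
      DateTimeUtils.fromJavaDate(Date.valueOf("2019-06-01")),
      UTF8String.fromString("x"))
    // prints: 2019-06-01,x
    println(converters.zipWithIndex.map { case (c, i) => c(row, i) }.mkString(","))
  }
}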
Example 10
Source File: ResolvedDataSourceSuite.scala    From Spark-2.3.1   with Apache License 2.0
package org.apache.spark.sql.sources

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.catalyst.util.DateTimeUtils
import org.apache.spark.sql.execution.datasources.DataSource
import org.apache.spark.sql.test.SharedSQLContext

class ResolvedDataSourceSuite extends SparkFunSuite with SharedSQLContext {
  private def getProvidingClass(name: String): Class[_] =
    DataSource(
      sparkSession = spark,
      className = name,
      options = Map(DateTimeUtils.TIMEZONE_OPTION -> DateTimeUtils.defaultTimeZone().getID)
    ).providingClass

  test("jdbc") {
    assert(
      getProvidingClass("jdbc") ===
      classOf[org.apache.spark.sql.execution.datasources.jdbc.JdbcRelationProvider])
    assert(
      getProvidingClass("org.apache.spark.sql.execution.datasources.jdbc") ===
      classOf[org.apache.spark.sql.execution.datasources.jdbc.JdbcRelationProvider])
    assert(
      getProvidingClass("org.apache.spark.sql.jdbc") ===
        classOf[org.apache.spark.sql.execution.datasources.jdbc.JdbcRelationProvider])
  }

  test("json") {
    assert(
      getProvidingClass("json") ===
      classOf[org.apache.spark.sql.execution.datasources.json.JsonFileFormat])
    assert(
      getProvidingClass("org.apache.spark.sql.execution.datasources.json") ===
        classOf[org.apache.spark.sql.execution.datasources.json.JsonFileFormat])
    assert(
      getProvidingClass("org.apache.spark.sql.json") ===
        classOf[org.apache.spark.sql.execution.datasources.json.JsonFileFormat])
  }

  test("parquet") {
    assert(
      getProvidingClass("parquet") ===
      classOf[org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat])
    assert(
      getProvidingClass("org.apache.spark.sql.execution.datasources.parquet") ===
        classOf[org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat])
    assert(
      getProvidingClass("org.apache.spark.sql.parquet") ===
        classOf[org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat])
  }

  test("csv") {
    assert(
      getProvidingClass("csv") ===
        classOf[org.apache.spark.sql.execution.datasources.csv.CSVFileFormat])
    assert(
      getProvidingClass("com.databricks.spark.csv") ===
        classOf[org.apache.spark.sql.execution.datasources.csv.CSVFileFormat])
  }

  test("error message for unknown data sources") {
    val error1 = intercept[AnalysisException] {
      getProvidingClass("avro")
    }
    assert(error1.getMessage.contains("Failed to find data source: avro."))

    val error2 = intercept[AnalysisException] {
      getProvidingClass("com.databricks.spark.avro")
    }
    assert(error2.getMessage.contains("Failed to find data source: com.databricks.spark.avro."))

    val error3 = intercept[ClassNotFoundException] {
      getProvidingClass("asfdwefasdfasdf")
    }
    assert(error3.getMessage.contains("Failed to find data source: asfdwefasdfasdf."))
  }
} 
Example 11
Source File: ArrowUtilsSuite.scala    From Spark-2.3.1   with Apache License 2.0
package org.apache.spark.sql.execution.arrow

import org.apache.arrow.vector.types.pojo.{ArrowType, Field, FieldType, Schema}

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.catalyst.util.DateTimeUtils
import org.apache.spark.sql.types._

class ArrowUtilsSuite extends SparkFunSuite {

  def roundtrip(dt: DataType): Unit = {
    dt match {
      case schema: StructType =>
        assert(ArrowUtils.fromArrowSchema(ArrowUtils.toArrowSchema(schema, null)) === schema)
      case _ =>
        roundtrip(new StructType().add("value", dt))
    }
  }

  test("simple") {
    roundtrip(BooleanType)
    roundtrip(ByteType)
    roundtrip(ShortType)
    roundtrip(IntegerType)
    roundtrip(LongType)
    roundtrip(FloatType)
    roundtrip(DoubleType)
    roundtrip(StringType)
    roundtrip(BinaryType)
    roundtrip(DecimalType.SYSTEM_DEFAULT)
    roundtrip(DateType)
    val tsExMsg = intercept[UnsupportedOperationException] {
      roundtrip(TimestampType)
    }
    assert(tsExMsg.getMessage.contains("timeZoneId"))
  }

  test("timestamp") {

    def roundtripWithTz(timeZoneId: String): Unit = {
      val schema = new StructType().add("value", TimestampType)
      val arrowSchema = ArrowUtils.toArrowSchema(schema, timeZoneId)
      val fieldType = arrowSchema.findField("value").getType.asInstanceOf[ArrowType.Timestamp]
      assert(fieldType.getTimezone() === timeZoneId)
      assert(ArrowUtils.fromArrowSchema(arrowSchema) === schema)
    }

    roundtripWithTz(DateTimeUtils.defaultTimeZone().getID)
    roundtripWithTz("Asia/Tokyo")
    roundtripWithTz("UTC")
    roundtripWithTz("America/Los_Angeles")
  }

  test("array") {
    roundtrip(ArrayType(IntegerType, containsNull = true))
    roundtrip(ArrayType(IntegerType, containsNull = false))
    roundtrip(ArrayType(ArrayType(IntegerType, containsNull = true), containsNull = true))
    roundtrip(ArrayType(ArrayType(IntegerType, containsNull = false), containsNull = true))
    roundtrip(ArrayType(ArrayType(IntegerType, containsNull = true), containsNull = false))
    roundtrip(ArrayType(ArrayType(IntegerType, containsNull = false), containsNull = false))
  }

  test("struct") {
    roundtrip(new StructType())
    roundtrip(new StructType().add("i", IntegerType))
    roundtrip(new StructType().add("arr", ArrayType(IntegerType)))
    roundtrip(new StructType().add("i", IntegerType).add("arr", ArrayType(IntegerType)))
    roundtrip(new StructType().add(
      "struct",
      new StructType().add("i", IntegerType).add("arr", ArrayType(IntegerType))))
  }
} 
Example 12
Source File: UnivocityGenerator.scala    From Spark-2.3.1   with Apache License 2.0
package org.apache.spark.sql.execution.datasources.csv

import java.io.Writer

import com.univocity.parsers.csv.CsvWriter

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.util.DateTimeUtils
import org.apache.spark.sql.types._

private[csv] class UnivocityGenerator(
    schema: StructType,
    writer: Writer,
    options: CSVOptions) {
  private val writerSettings = options.asWriterSettings
  writerSettings.setHeaders(schema.fieldNames: _*)
  private val gen = new CsvWriter(writer, writerSettings)
  private var printHeader = options.headerFlag

  // A `ValueConverter` is responsible for converting a value of an `InternalRow` to `String`.
  // When the value is null, this converter should not be called.
  private type ValueConverter = (InternalRow, Int) => String

  // `ValueConverter`s for all values in the fields of the schema
  private val valueConverters: Array[ValueConverter] =
    schema.map(_.dataType).map(makeConverter).toArray

  private def makeConverter(dataType: DataType): ValueConverter = dataType match {
    case DateType =>
      (row: InternalRow, ordinal: Int) =>
        options.dateFormat.format(DateTimeUtils.toJavaDate(row.getInt(ordinal)))

    case TimestampType =>
      (row: InternalRow, ordinal: Int) =>
        options.timestampFormat.format(DateTimeUtils.toJavaTimestamp(row.getLong(ordinal)))

    case udt: UserDefinedType[_] => makeConverter(udt.sqlType)

    case dt: DataType =>
      (row: InternalRow, ordinal: Int) =>
        row.get(ordinal, dt).toString
  }

  private def convertRow(row: InternalRow): Seq[String] = {
    var i = 0
    val values = new Array[String](row.numFields)
    while (i < row.numFields) {
      if (!row.isNullAt(i)) {
        values(i) = valueConverters(i).apply(row, i)
      } else {
        values(i) = options.nullValue
      }
      i += 1
    }
    values
  }

  
  // Writes one InternalRow as a single CSV record, emitting the header
  // row first when the options request one.
  def write(row: InternalRow): Unit = {
    if (printHeader) {
      gen.writeHeaders()
    }
    gen.writeRow(convertRow(row): _*)
    printHeader = false
  }

  def close(): Unit = gen.close()

  def flush(): Unit = gen.flush()
} 
Example 13
Source File: ComputeCurrentTimeSuite.scala    From Spark-2.3.1   with Apache License 2.0
package org.apache.spark.sql.catalyst.optimizer

import org.apache.spark.sql.catalyst.dsl.plans._
import org.apache.spark.sql.catalyst.expressions.{Alias, CurrentDate, CurrentTimestamp, Literal}
import org.apache.spark.sql.catalyst.plans.PlanTest
import org.apache.spark.sql.catalyst.plans.logical.{LocalRelation, LogicalPlan, Project}
import org.apache.spark.sql.catalyst.rules.RuleExecutor
import org.apache.spark.sql.catalyst.util.DateTimeUtils

class ComputeCurrentTimeSuite extends PlanTest {
  object Optimize extends RuleExecutor[LogicalPlan] {
    val batches = Seq(Batch("ComputeCurrentTime", Once, ComputeCurrentTime))
  }

  test("analyzer should replace current_timestamp with literals") {
    val in = Project(Seq(Alias(CurrentTimestamp(), "a")(), Alias(CurrentTimestamp(), "b")()),
      LocalRelation())

    val min = System.currentTimeMillis() * 1000
    val plan = Optimize.execute(in.analyze).asInstanceOf[Project]
    val max = (System.currentTimeMillis() + 1) * 1000

    val lits = new scala.collection.mutable.ArrayBuffer[Long]
    plan.transformAllExpressions { case e: Literal =>
      lits += e.value.asInstanceOf[Long]
      e
    }
    assert(lits.size == 2)
    assert(lits(0) >= min && lits(0) <= max)
    assert(lits(1) >= min && lits(1) <= max)
    assert(lits(0) == lits(1))
  }

  test("analyzer should replace current_date with literals") {
    val in = Project(Seq(Alias(CurrentDate(), "a")(), Alias(CurrentDate(), "b")()), LocalRelation())

    val min = DateTimeUtils.millisToDays(System.currentTimeMillis())
    val plan = Optimize.execute(in.analyze).asInstanceOf[Project]
    val max = DateTimeUtils.millisToDays(System.currentTimeMillis())

    val lits = new scala.collection.mutable.ArrayBuffer[Int]
    plan.transformAllExpressions { case e: Literal =>
      lits += e.value.asInstanceOf[Int]
      e
    }
    assert(lits.size == 2)
    assert(lits(0) >= min && lits(0) <= max)
    assert(lits(1) >= min && lits(1) <= max)
    assert(lits(0) == lits(1))
  }
} 
Example 14
Source File: JacksonGeneratorSuite.scala    From Spark-2.3.1   with Apache License 2.0
package org.apache.spark.sql.catalyst.json

import java.io.CharArrayWriter

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.util.{ArrayBasedMapData, DateTimeUtils, GenericArrayData}
import org.apache.spark.sql.types._

class JacksonGeneratorSuite extends SparkFunSuite {

  val gmtId = DateTimeUtils.TimeZoneGMT.getID
  val option = new JSONOptions(Map.empty, gmtId)

  test("initial with StructType and write out a row") {
    val dataType = StructType(StructField("a", IntegerType) :: Nil)
    val input = InternalRow(1)
    val writer = new CharArrayWriter()
    val gen = new JacksonGenerator(dataType, writer, option)
    gen.write(input)
    gen.flush()
    assert(writer.toString === """{"a":1}""")
  }

  test("initial with StructType and write out rows") {
    val dataType = StructType(StructField("a", IntegerType) :: Nil)
    val input = new GenericArrayData(InternalRow(1) :: InternalRow(2) :: Nil)
    val writer = new CharArrayWriter()
    val gen = new JacksonGenerator(dataType, writer, option)
    gen.write(input)
    gen.flush()
    assert(writer.toString === """[{"a":1},{"a":2}]""")
  }

  test("initial with StructType and write out an array with single empty row") {
    val dataType = StructType(StructField("a", IntegerType) :: Nil)
    val input = new GenericArrayData(InternalRow(null) :: Nil)
    val writer = new CharArrayWriter()
    val gen = new JacksonGenerator(dataType, writer, option)
    gen.write(input)
    gen.flush()
    assert(writer.toString === """[{}]""")
  }

  test("initial with StructType and write out an empty array") {
    val dataType = StructType(StructField("a", IntegerType) :: Nil)
    val input = new GenericArrayData(Nil)
    val writer = new CharArrayWriter()
    val gen = new JacksonGenerator(dataType, writer, option)
    gen.write(input)
    gen.flush()
    assert(writer.toString === """[]""")
  }

  test("initial with Map and write out a map data") {
    val dataType = MapType(StringType, IntegerType)
    val input = ArrayBasedMapData(Map("a" -> 1))
    val writer = new CharArrayWriter()
    val gen = new JacksonGenerator(dataType, writer, option)
    gen.write(input)
    gen.flush()
    assert(writer.toString === """{"a":1}""")
  }

  test("initial with Map and write out an array of maps") {
    val dataType = MapType(StringType, IntegerType)
    val input = new GenericArrayData(
      ArrayBasedMapData(Map("a" -> 1)) :: ArrayBasedMapData(Map("b" -> 2)) :: Nil)
    val writer = new CharArrayWriter()
    val gen = new JacksonGenerator(dataType, writer, option)
    gen.write(input)
    gen.flush()
    assert(writer.toString === """[{"a":1},{"b":2}]""")
  }

  test("error handling: initial with StructType but error calling write a map") {
    val dataType = StructType(StructField("a", IntegerType) :: Nil)
    val input = ArrayBasedMapData(Map("a" -> 1))
    val writer = new CharArrayWriter()
    val gen = new JacksonGenerator(dataType, writer, option)
    intercept[UnsupportedOperationException] {
      gen.write(input)
    }
  }

  test("error handling: initial with MapType and write out a row") {
    val dataType = MapType(StringType, IntegerType)
    val input = InternalRow(1)
    val writer = new CharArrayWriter()
    val gen = new JacksonGenerator(dataType, writer, option)
    intercept[UnsupportedOperationException] {
      gen.write(input)
    }
  }

} 
Example 15
Source File: LitTests.scala    From frameless   with Apache License 2.0
package frameless

import frameless.functions.lit

import org.scalacheck.Prop
import org.scalacheck.Prop._

class LitTests extends TypedDatasetSuite {
  def prop[A: TypedEncoder](value: A): Prop = {
    val df: TypedDataset[Int] = TypedDataset.create(1 :: Nil)

    // filter forces whole codegen
    val elems = df.deserialized.filter((_:Int) => true).select(lit(value))
      .collect()
      .run()
      .toVector

    // otherwise it uses local relation
    val localElems = df.select(lit(value))
      .collect()
      .run()
      .toVector


    (localElems ?= Vector(value)) && (elems ?= Vector(value))
  }

  test("select(lit(...))") {
    check(prop[Int] _)
    check(prop[Long] _)
    check(prop[String] _)
    check(prop[SQLDate] _)

    check(prop[Option[Int]] _)
    check(prop[Option[String]] _)

    check(prop[Vector[Long]] _)
    check(prop[Vector[X1[Long]]] _)

    check(prop[Vector[String]] _)
    check(prop[Vector[X1[String]]] _)

    check(prop[X1[Int]] _)
    check(prop[X1[X1[Int]]] _)

    check(prop[Food] _)

    // doesn't work, object has to be serializable
    // check(prop[frameless.LocalDateTime] _)
  }

  test("#205: comparing literals encoded using Injection") {
    import org.apache.spark.sql.catalyst.util.DateTimeUtils
    implicit val dateAsInt: Injection[java.sql.Date, Int] =
      Injection(DateTimeUtils.fromJavaDate, DateTimeUtils.toJavaDate)

    val today = new java.sql.Date(System.currentTimeMillis)
    val data = Vector(P(42, today))
    val tds = TypedDataset.create(data)

    tds.filter(tds('d) === today).collect().run()
  }
}

final case class P(i: Int, d: java.sql.Date) 
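The Injection in test #205 simply reuses catalyst's own date encoding: DateTimeUtils.fromJavaDate maps a java.sql.Date to its Int day count and toJavaDate inverts it, so the === comparison inside the filter is performed on plain Ints rather than on Date objects.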
Example 16
Source File: ComputeCurrentTimeSuite.scala    From drizzle-spark   with Apache License 2.0
package org.apache.spark.sql.catalyst.optimizer

import org.apache.spark.sql.catalyst.dsl.plans._
import org.apache.spark.sql.catalyst.expressions.{Alias, CurrentDate, CurrentTimestamp, Literal}
import org.apache.spark.sql.catalyst.plans.PlanTest
import org.apache.spark.sql.catalyst.plans.logical.{LocalRelation, LogicalPlan, Project}
import org.apache.spark.sql.catalyst.rules.RuleExecutor
import org.apache.spark.sql.catalyst.util.DateTimeUtils

class ComputeCurrentTimeSuite extends PlanTest {
  object Optimize extends RuleExecutor[LogicalPlan] {
    val batches = Seq(Batch("ComputeCurrentTime", Once, ComputeCurrentTime))
  }

  test("analyzer should replace current_timestamp with literals") {
    val in = Project(Seq(Alias(CurrentTimestamp(), "a")(), Alias(CurrentTimestamp(), "b")()),
      LocalRelation())

    val min = System.currentTimeMillis() * 1000
    val plan = Optimize.execute(in.analyze).asInstanceOf[Project]
    val max = (System.currentTimeMillis() + 1) * 1000

    val lits = new scala.collection.mutable.ArrayBuffer[Long]
    plan.transformAllExpressions { case e: Literal =>
      lits += e.value.asInstanceOf[Long]
      e
    }
    assert(lits.size == 2)
    assert(lits(0) >= min && lits(0) <= max)
    assert(lits(1) >= min && lits(1) <= max)
    assert(lits(0) == lits(1))
  }

  test("analyzer should replace current_date with literals") {
    val in = Project(Seq(Alias(CurrentDate(), "a")(), Alias(CurrentDate(), "b")()), LocalRelation())

    val min = DateTimeUtils.millisToDays(System.currentTimeMillis())
    val plan = Optimize.execute(in.analyze).asInstanceOf[Project]
    val max = DateTimeUtils.millisToDays(System.currentTimeMillis())

    val lits = new scala.collection.mutable.ArrayBuffer[Int]
    plan.transformAllExpressions { case e: Literal =>
      lits += e.value.asInstanceOf[Int]
      e
    }
    assert(lits.size == 2)
    assert(lits(0) >= min && lits(0) <= max)
    assert(lits(1) >= min && lits(1) <= max)
    assert(lits(0) == lits(1))
  }
} 
Example 17
Source File: MergeProjection.scala    From carbondata   with Apache License 2.0
package org.apache.spark.sql.execution.command.mutation.merge

import java.sql.{Date, Timestamp}

import org.apache.spark.sql.{CarbonDatasourceHadoopRelation, Dataset, Row, SparkSession}
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, GenericInternalRow, GenericRowWithSchema, InterpretedMutableProjection, Projection}
import org.apache.spark.sql.catalyst.util.DateTimeUtils


case class MergeProjection(
    @transient tableCols: Seq[String],
    @transient statusCol : String,
    @transient ds: Dataset[Row],
    @transient rltn: CarbonDatasourceHadoopRelation,
    @transient sparkSession: SparkSession,
    @transient mergeAction: MergeAction) {

  private val cutOffDate = Integer.MAX_VALUE >> 1

  val isUpdate = mergeAction.isInstanceOf[UpdateAction]
  val isDelete = mergeAction.isInstanceOf[DeleteAction]

  def apply(row: GenericRowWithSchema): InternalRow = {
    // TODO we can avoid these multiple conversions if this is added as a SparkPlan node.
    val values = row.values.map {
      case s: String => org.apache.spark.unsafe.types.UTF8String.fromString(s)
      case d: java.math.BigDecimal => org.apache.spark.sql.types.Decimal.apply(d)
      case b: Array[Byte] => org.apache.spark.unsafe.types.UTF8String.fromBytes(b)
      case d: Date => DateTimeUtils.fromJavaDate(d)
      case t: Timestamp => DateTimeUtils.fromJavaTimestamp(t)
      case value => value
    }

    projection(new GenericInternalRow(values)).asInstanceOf[GenericInternalRow]
  }

  val (projection, output) = generateProjection

  private def generateProjection: (Projection, Array[Expression]) = {
    val existingDsOutput = rltn.carbonRelation.schema.toAttributes
    val colsMap = mergeAction match {
      case UpdateAction(updateMap) => updateMap
      case InsertAction(insertMap) => insertMap
      case _ => null
    }
    if (colsMap != null) {
      val output = new Array[Expression](tableCols.length)
      val expecOutput = new Array[Expression](tableCols.length)
      colsMap.foreach { case (k, v) =>
        val tableIndex = tableCols.indexOf(k.toString().toLowerCase)
        if (tableIndex < 0) {
          throw new CarbonMergeDataSetException(s"Mapping is wrong $colsMap")
        }
        output(tableIndex) = v.expr.transform {
          case a: Attribute if !a.resolved =>
            ds.queryExecution.analyzed.resolveQuoted(a.name,
              sparkSession.sessionState.analyzer.resolver).get
        }
        expecOutput(tableIndex) =
          existingDsOutput.find(_.name.equalsIgnoreCase(tableCols(tableIndex))).get
      }
      if (output.contains(null)) {
        throw new CarbonMergeDataSetException(s"Not all columns are mapped")
      }
      (new InterpretedMutableProjection(output++Seq(
        ds.queryExecution.analyzed.resolveQuoted(statusCol,
        sparkSession.sessionState.analyzer.resolver).get),
        ds.queryExecution.analyzed.output), expecOutput)
    } else {
      (null, null)
    }
  }
} 
Example 18
Source File: ComputeCurrentTimeSuite.scala    From sparkoscope   with Apache License 2.0
package org.apache.spark.sql.catalyst.optimizer

import org.apache.spark.sql.catalyst.dsl.plans._
import org.apache.spark.sql.catalyst.expressions.{Alias, CurrentDate, CurrentTimestamp, Literal}
import org.apache.spark.sql.catalyst.plans.PlanTest
import org.apache.spark.sql.catalyst.plans.logical.{LocalRelation, LogicalPlan, Project}
import org.apache.spark.sql.catalyst.rules.RuleExecutor
import org.apache.spark.sql.catalyst.util.DateTimeUtils

class ComputeCurrentTimeSuite extends PlanTest {
  object Optimize extends RuleExecutor[LogicalPlan] {
    val batches = Seq(Batch("ComputeCurrentTime", Once, ComputeCurrentTime))
  }

  test("analyzer should replace current_timestamp with literals") {
    val in = Project(Seq(Alias(CurrentTimestamp(), "a")(), Alias(CurrentTimestamp(), "b")()),
      LocalRelation())

    val min = System.currentTimeMillis() * 1000
    val plan = Optimize.execute(in.analyze).asInstanceOf[Project]
    val max = (System.currentTimeMillis() + 1) * 1000

    val lits = new scala.collection.mutable.ArrayBuffer[Long]
    plan.transformAllExpressions { case e: Literal =>
      lits += e.value.asInstanceOf[Long]
      e
    }
    assert(lits.size == 2)
    assert(lits(0) >= min && lits(0) <= max)
    assert(lits(1) >= min && lits(1) <= max)
    assert(lits(0) == lits(1))
  }

  test("analyzer should replace current_date with literals") {
    val in = Project(Seq(Alias(CurrentDate(), "a")(), Alias(CurrentDate(), "b")()), LocalRelation())

    val min = DateTimeUtils.millisToDays(System.currentTimeMillis())
    val plan = Optimize.execute(in.analyze).asInstanceOf[Project]
    val max = DateTimeUtils.millisToDays(System.currentTimeMillis())

    val lits = new scala.collection.mutable.ArrayBuffer[Int]
    plan.transformAllExpressions { case e: Literal =>
      lits += e.value.asInstanceOf[Int]
      e
    }
    assert(lits.size == 2)
    assert(lits(0) >= min && lits(0) <= max)
    assert(lits(1) >= min && lits(1) <= max)
    assert(lits(0) == lits(1))
  }
} 
Example 19
Source File: ColumnarLiterals.scala    From OAP   with Apache License 2.0
package com.intel.sparkColumnarPlugin.expression

import com.google.common.collect.Lists
import org.apache.arrow.gandiva.evaluator._
import org.apache.arrow.gandiva.exceptions.GandivaException
import org.apache.arrow.gandiva.expression._
import org.apache.arrow.vector.types.pojo.ArrowType
import org.apache.arrow.vector.types.pojo.Field
import org.apache.arrow.vector.types.DateUnit

import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.catalyst.util.DateTimeUtils
import org.apache.spark.sql.types._

import scala.collection.mutable.ListBuffer

class ColumnarLiteral(lit: Literal)
    extends Literal(lit.value, lit.dataType)
    with ColumnarExpression {

  override def doColumnarCodeGen(args: java.lang.Object): (TreeNode, ArrowType) = {
    val resultType = CodeGeneration.getResultType(dataType)
    dataType match {
      case t: StringType =>
        (TreeBuilder.makeStringLiteral(value.toString().asInstanceOf[String]), resultType)
      case t: IntegerType =>
        (TreeBuilder.makeLiteral(value.asInstanceOf[Integer]), resultType)
      case t: LongType =>
        (TreeBuilder.makeLiteral(value.asInstanceOf[java.lang.Long]), resultType)
      case t: DoubleType =>
        (TreeBuilder.makeLiteral(value.asInstanceOf[java.lang.Double]), resultType)
      case d: DecimalType =>
        val v = value.asInstanceOf[Decimal]
        (TreeBuilder.makeDecimalLiteral(v.toString, v.precision, v.scale), resultType)
      case d: DateType =>
        val origIntNode = TreeBuilder.makeLiteral(value.asInstanceOf[Integer])
        val dateNode = TreeBuilder.makeFunction("castDATE", Lists.newArrayList(origIntNode), new ArrowType.Date(DateUnit.DAY))
        (dateNode, new ArrowType.Date(DateUnit.DAY))
      case b: BooleanType =>
        (TreeBuilder.makeLiteral(value.asInstanceOf[java.lang.Boolean]), resultType)
    }
  }
} 
Example 20
Source File: TimestampExpressionSuite.scala    From HANAVora-Extensions   with Apache License 2.0
package org.apache.spark.sql.catalyst.expressions

import java.sql.Timestamp

import org.apache.spark.sql.catalyst.util.DateTimeUtils
import org.apache.spark.sql.types.{DateType, IntegerType}
import org.scalatest.FunSuite

class TimestampExpressionSuite extends FunSuite with ExpressionEvalHelper {

  test("add_seconds") {
    // scalastyle:off magic.number
    checkEvaluation(AddSeconds(Literal(Timestamp.valueOf("2015-01-01 00:11:33")), Literal(28)),
      DateTimeUtils.fromJavaTimestamp(Timestamp.valueOf("2015-01-01 00:12:01")))
    checkEvaluation(AddSeconds(Literal(Timestamp.valueOf("2015-01-02 00:00:00")), Literal(-1)),
      DateTimeUtils.fromJavaTimestamp(Timestamp.valueOf("2015-01-01 23:59:59")))
    checkEvaluation(AddSeconds(Literal(Timestamp.valueOf("2015-01-01 00:00:00")), Literal(-1)),
      DateTimeUtils.fromJavaTimestamp(Timestamp.valueOf("2014-12-31 23:59:59")))
    checkEvaluation(AddSeconds(Literal(Timestamp.valueOf("2015-01-02 00:00:00")),
      Literal.create(null, IntegerType)), null)
    checkEvaluation(AddSeconds(Literal.create(null, DateType), Literal(1)), null)
    checkEvaluation(AddSeconds(Literal.create(null, DateType), Literal.create(null, IntegerType)),
      null)
  }
} 
Example 21
Source File: AddSeconds.scala    From HANAVora-Extensions   with Apache License 2.0
package org.apache.spark.sql.catalyst.expressions

import org.apache.spark.sql.catalyst.expressions.codegen.CodegenFallback
import org.apache.spark.sql.types._
import org.apache.spark.sql.catalyst.util.DateTimeUtils


case class AddSeconds(timestamp: Expression, seconds: Expression)
  extends BinaryExpression
  with ImplicitCastInputTypes
  with CodegenFallback {

  override def inputTypes: Seq[AbstractDataType] = Seq(TimestampType, IntegerType)

  override def nullSafeEval(microseconds: Any, seconds: Any): Any = {
    microseconds.asInstanceOf[DateTimeUtils.SQLTimestamp] +
      (seconds.asInstanceOf[Int] * DateTimeUtils.MICROS_PER_SECOND)
  }

  override def left: Expression = timestamp
  override def right: Expression = seconds
  override def dataType: DataType = TimestampType
} 
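A minimal way to exercise AddSeconds outside the test harness, assuming the HANAVora-Extensions classes above are on the classpath. Both children are literals, so evaluation needs no meaningful input row; the result is a Long in microseconds, which is why adding seconds is a MICROS_PER_SECOND shift.

import java.sql.Timestamp

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{AddSeconds, Literal}
import org.apache.spark.sql.catalyst.util.DateTimeUtils

object AddSecondsSketch {
  def main(args: Array[String]): Unit = {
    // Literal.apply encodes the java.sql.Timestamp as microseconds (TimestampType).
    val expr = AddSeconds(Literal(Timestamp.valueOf("2015-01-01 00:11:33")), Literal(28))
    val micros = expr.eval(InternalRow.empty).asInstanceOf[Long]
    println(DateTimeUtils.toJavaTimestamp(micros)) // 2015-01-01 00:12:01.0
  }
}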
Example 22
Source File: dateExpressions.scala    From HANAVora-Extensions   with Apache License 2.0
package org.apache.spark.sql.catalyst.expressions

import org.apache.spark.sql.catalyst.expressions.codegen.CodegenFallback
import org.apache.spark.sql.catalyst.util.DateTimeUtils
import org.apache.spark.sql.types._


case class AddYears(date: Expression, years: Expression)
  extends BinaryExpression
  with ImplicitCastInputTypes with CodegenFallback {

  override def inputTypes: Seq[AbstractDataType] = Seq(DateType, IntegerType)

  override def nullSafeEval(d: Any, y: Any): Any = {
    DateTimeUtils.dateAddMonths(
      d.asInstanceOf[DateTimeUtils.SQLDate], y.asInstanceOf[Int] * 12
    )
  }

  override def left: Expression = date
  override def right: Expression = years
  override def dataType: DataType = DateType
} 
Example 23
Source File: ResolvedDataSourceSuite.scala    From XSQL   with Apache License 2.0
package org.apache.spark.sql.sources

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.catalyst.util.DateTimeUtils
import org.apache.spark.sql.execution.datasources.DataSource
import org.apache.spark.sql.test.SharedSQLContext

class ResolvedDataSourceSuite extends SparkFunSuite with SharedSQLContext {
  private def getProvidingClass(name: String): Class[_] =
    DataSource(
      sparkSession = spark,
      className = name,
      options = Map(DateTimeUtils.TIMEZONE_OPTION -> DateTimeUtils.defaultTimeZone().getID)
    ).providingClass

  test("jdbc") {
    assert(
      getProvidingClass("jdbc") ===
      classOf[org.apache.spark.sql.execution.datasources.jdbc.JdbcRelationProvider])
    assert(
      getProvidingClass("org.apache.spark.sql.execution.datasources.jdbc") ===
      classOf[org.apache.spark.sql.execution.datasources.jdbc.JdbcRelationProvider])
    assert(
      getProvidingClass("org.apache.spark.sql.jdbc") ===
        classOf[org.apache.spark.sql.execution.datasources.jdbc.JdbcRelationProvider])
  }

  test("json") {
    assert(
      getProvidingClass("json") ===
      classOf[org.apache.spark.sql.execution.datasources.json.JsonFileFormat])
    assert(
      getProvidingClass("org.apache.spark.sql.execution.datasources.json") ===
        classOf[org.apache.spark.sql.execution.datasources.json.JsonFileFormat])
    assert(
      getProvidingClass("org.apache.spark.sql.json") ===
        classOf[org.apache.spark.sql.execution.datasources.json.JsonFileFormat])
  }

  test("parquet") {
    assert(
      getProvidingClass("parquet") ===
      classOf[org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat])
    assert(
      getProvidingClass("org.apache.spark.sql.execution.datasources.parquet") ===
        classOf[org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat])
    assert(
      getProvidingClass("org.apache.spark.sql.parquet") ===
        classOf[org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat])
  }

  test("csv") {
    assert(
      getProvidingClass("csv") ===
        classOf[org.apache.spark.sql.execution.datasources.csv.CSVFileFormat])
    assert(
      getProvidingClass("com.databricks.spark.csv") ===
        classOf[org.apache.spark.sql.execution.datasources.csv.CSVFileFormat])
  }

  test("avro: show deploy guide for loading the external avro module") {
    Seq("avro", "org.apache.spark.sql.avro").foreach { provider =>
      val message = intercept[AnalysisException] {
        getProvidingClass(provider)
      }.getMessage
      assert(message.contains(s"Failed to find data source: $provider"))
      assert(message.contains("Please deploy the application as per the deployment section of"))
    }
  }

  test("kafka: show deploy guide for loading the external kafka module") {
    val message = intercept[AnalysisException] {
      getProvidingClass("kafka")
    }.getMessage
    assert(message.contains("Failed to find data source: kafka"))
    assert(message.contains("Please deploy the application as per the deployment section of"))
  }

  test("error message for unknown data sources") {
    val error = intercept[ClassNotFoundException] {
      getProvidingClass("asfdwefasdfasdf")
    }
    assert(error.getMessage.contains("Failed to find data source: asfdwefasdfasdf."))
  }
} 
Example 24
Source File: ArrowUtilsSuite.scala    From XSQL   with Apache License 2.0
package org.apache.spark.sql.execution.arrow

import org.apache.arrow.vector.types.pojo.{ArrowType, Field, FieldType, Schema}

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.catalyst.util.DateTimeUtils
import org.apache.spark.sql.types._

class ArrowUtilsSuite extends SparkFunSuite {

  def roundtrip(dt: DataType): Unit = {
    dt match {
      case schema: StructType =>
        assert(ArrowUtils.fromArrowSchema(ArrowUtils.toArrowSchema(schema, null)) === schema)
      case _ =>
        roundtrip(new StructType().add("value", dt))
    }
  }

  test("simple") {
    roundtrip(BooleanType)
    roundtrip(ByteType)
    roundtrip(ShortType)
    roundtrip(IntegerType)
    roundtrip(LongType)
    roundtrip(FloatType)
    roundtrip(DoubleType)
    roundtrip(StringType)
    roundtrip(BinaryType)
    roundtrip(DecimalType.SYSTEM_DEFAULT)
    roundtrip(DateType)
    val tsExMsg = intercept[UnsupportedOperationException] {
      roundtrip(TimestampType)
    }
    assert(tsExMsg.getMessage.contains("timeZoneId"))
  }

  test("timestamp") {

    def roundtripWithTz(timeZoneId: String): Unit = {
      val schema = new StructType().add("value", TimestampType)
      val arrowSchema = ArrowUtils.toArrowSchema(schema, timeZoneId)
      val fieldType = arrowSchema.findField("value").getType.asInstanceOf[ArrowType.Timestamp]
      assert(fieldType.getTimezone() === timeZoneId)
      assert(ArrowUtils.fromArrowSchema(arrowSchema) === schema)
    }

    roundtripWithTz(DateTimeUtils.defaultTimeZone().getID)
    roundtripWithTz("Asia/Tokyo")
    roundtripWithTz("UTC")
    roundtripWithTz("America/Los_Angeles")
  }

  test("array") {
    roundtrip(ArrayType(IntegerType, containsNull = true))
    roundtrip(ArrayType(IntegerType, containsNull = false))
    roundtrip(ArrayType(ArrayType(IntegerType, containsNull = true), containsNull = true))
    roundtrip(ArrayType(ArrayType(IntegerType, containsNull = false), containsNull = true))
    roundtrip(ArrayType(ArrayType(IntegerType, containsNull = true), containsNull = false))
    roundtrip(ArrayType(ArrayType(IntegerType, containsNull = false), containsNull = false))
  }

  test("struct") {
    roundtrip(new StructType())
    roundtrip(new StructType().add("i", IntegerType))
    roundtrip(new StructType().add("arr", ArrayType(IntegerType)))
    roundtrip(new StructType().add("i", IntegerType).add("arr", ArrayType(IntegerType)))
    roundtrip(new StructType().add(
      "struct",
      new StructType().add("i", IntegerType).add("arr", ArrayType(IntegerType))))
  }
} 
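Outside of a test, the same schema round trip can be exercised directly. A minimal sketch, assuming the same package-private access to ArrowUtils that the suite above relies on; note that TimestampType requires an explicit time zone id, which DateTimeUtils can supply:

package org.apache.spark.sql.execution.arrow

import org.apache.spark.sql.catalyst.util.DateTimeUtils
import org.apache.spark.sql.types._

object ArrowSchemaRoundtripDemo {
  def main(args: Array[String]): Unit = {
    val schema = new StructType().add("id", LongType).add("ts", TimestampType)
    // Timestamp fields reject a null timeZoneId, hence the JVM default zone here.
    val arrowSchema = ArrowUtils.toArrowSchema(schema, DateTimeUtils.defaultTimeZone().getID)
    assert(ArrowUtils.fromArrowSchema(arrowSchema) == schema)
    println(arrowSchema)
  }
}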
Example 25
Source File: MetricsReporter.scala    From XSQL   with Apache License 2.0
package org.apache.spark.sql.execution.streaming

import java.text.SimpleDateFormat

import com.codahale.metrics.{Gauge, MetricRegistry}

import org.apache.spark.internal.Logging
import org.apache.spark.metrics.source.{Source => CodahaleSource}
import org.apache.spark.sql.catalyst.util.DateTimeUtils
import org.apache.spark.sql.streaming.StreamingQueryProgress


class MetricsReporter(
    stream: StreamExecution,
    override val sourceName: String) extends CodahaleSource with Logging {

  override val metricRegistry: MetricRegistry = new MetricRegistry

  // Metric names should not have . in them, so that all the metrics of a query are identified
  // together in Ganglia as a single metric group
  registerGauge("inputRate-total", _.inputRowsPerSecond, 0.0)
  registerGauge("processingRate-total", _.processedRowsPerSecond, 0.0)
  registerGauge("latency", _.durationMs.get("triggerExecution").longValue(), 0L)

  private val timestampFormat = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSS'Z'") // ISO8601
  timestampFormat.setTimeZone(DateTimeUtils.getTimeZone("UTC"))

  registerGauge("eventTime-watermark",
    progress => convertStringDateToMillis(progress.eventTime.get("watermark")), 0L)

  registerGauge("states-rowsTotal", _.stateOperators.map(_.numRowsTotal).sum, 0L)
  registerGauge("states-usedBytes", _.stateOperators.map(_.memoryUsedBytes).sum, 0L)

  private def convertStringDateToMillis(isoUtcDateStr: String) = {
    if (isoUtcDateStr != null) {
      timestampFormat.parse(isoUtcDateStr).getTime
    } else {
      0L
    }
  }

  private def registerGauge[T](
      name: String,
      f: StreamingQueryProgress => T,
      default: T): Unit = {
    synchronized {
      metricRegistry.register(name, new Gauge[T] {
        override def getValue: T = Option(stream.lastProgress).map(f).getOrElse(default)
      })
    }
  }
} 
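The registerGauge pattern above is worth isolating: each Gauge re-evaluates its extractor against the latest progress snapshot on every read, falling back to a default while no progress exists yet. A standalone sketch of that pattern, with a hypothetical Progress type and field names standing in for StreamingQueryProgress:

import com.codahale.metrics.{Gauge, MetricRegistry}

object GaugeSketch {
  final case class Progress(inputRowsPerSecond: Double) // hypothetical stand-in
  @volatile var lastProgress: Progress = null

  def main(args: Array[String]): Unit = {
    val registry = new MetricRegistry
    // Every read of the gauge re-applies the extractor to the latest snapshot.
    registry.register("inputRate-total", new Gauge[Double] {
      override def getValue: Double =
        Option(lastProgress).map(_.inputRowsPerSecond).getOrElse(0.0)
    })
    lastProgress = Progress(42.0)
    println(registry.getGauges.get("inputRate-total").getValue) // 42.0
  }
}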
Example 26
Source File: UnivocityGenerator.scala    From XSQL   with Apache License 2.0
package org.apache.spark.sql.execution.datasources.csv

import java.io.Writer

import com.univocity.parsers.csv.CsvWriter

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.util.DateTimeUtils
import org.apache.spark.sql.types._

private[csv] class UnivocityGenerator(
    schema: StructType,
    writer: Writer,
    options: CSVOptions) {
  private val writerSettings = options.asWriterSettings
  writerSettings.setHeaders(schema.fieldNames: _*)
  private val gen = new CsvWriter(writer, writerSettings)
  private var printHeader = options.headerFlag

  // A `ValueConverter` is responsible for converting a value of an `InternalRow` to `String`.
  // When the value is null, this converter should not be called.
  private type ValueConverter = (InternalRow, Int) => String

  // `ValueConverter`s for all values in the fields of the schema
  private val valueConverters: Array[ValueConverter] =
    schema.map(_.dataType).map(makeConverter).toArray

  private def makeConverter(dataType: DataType): ValueConverter = dataType match {
    case DateType =>
      (row: InternalRow, ordinal: Int) =>
        options.dateFormat.format(DateTimeUtils.toJavaDate(row.getInt(ordinal)))

    case TimestampType =>
      (row: InternalRow, ordinal: Int) =>
        options.timestampFormat.format(DateTimeUtils.toJavaTimestamp(row.getLong(ordinal)))

    case udt: UserDefinedType[_] => makeConverter(udt.sqlType)

    case dt: DataType =>
      (row: InternalRow, ordinal: Int) =>
        row.get(ordinal, dt).toString
  }

  private def convertRow(row: InternalRow): Seq[String] = {
    var i = 0
    val values = new Array[String](row.numFields)
    while (i < row.numFields) {
      if (!row.isNullAt(i)) {
        values(i) = valueConverters(i).apply(row, i)
      } else {
        values(i) = options.nullValue
      }
      i += 1
    }
    values
  }

  // Writes a single InternalRow as one CSV record, emitting the header
  // line before the first record when the header option is enabled.
  def write(row: InternalRow): Unit = {
    if (printHeader) {
      gen.writeHeaders()
    }
    gen.writeRow(convertRow(row): _*)
    printHeader = false
  }

  def close(): Unit = gen.close()

  def flush(): Unit = gen.flush()
} 
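A minimal sketch of driving the generator by hand, assuming the Spark 2.x layout shown above (the demo object and its values are hypothetical). It must live in the same package because the class is private[csv]:

package org.apache.spark.sql.execution.datasources.csv

import java.io.StringWriter

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.types._
import org.apache.spark.unsafe.types.UTF8String

object UnivocityGeneratorDemo {
  def main(args: Array[String]): Unit = {
    val schema = new StructType().add("name", StringType).add("age", IntegerType)
    val out = new StringWriter()
    // "header" -> "true" makes the first write() emit the header line once.
    val options = new CSVOptions(Map("header" -> "true"), "UTC")
    val gen = new UnivocityGenerator(schema, out, options)
    gen.write(InternalRow(UTF8String.fromString("alice"), 29))
    gen.flush()
    println(out.toString) // name,age  then  alice,29
  }
}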
Example 27
Source File: ComputeCurrentTimeSuite.scala    From XSQL   with Apache License 2.0
package org.apache.spark.sql.catalyst.optimizer

import org.apache.spark.sql.catalyst.dsl.plans._
import org.apache.spark.sql.catalyst.expressions.{Alias, CurrentDate, CurrentTimestamp, Literal}
import org.apache.spark.sql.catalyst.plans.PlanTest
import org.apache.spark.sql.catalyst.plans.logical.{LocalRelation, LogicalPlan, Project}
import org.apache.spark.sql.catalyst.rules.RuleExecutor
import org.apache.spark.sql.catalyst.util.DateTimeUtils

class ComputeCurrentTimeSuite extends PlanTest {
  object Optimize extends RuleExecutor[LogicalPlan] {
    val batches = Seq(Batch("ComputeCurrentTime", Once, ComputeCurrentTime))
  }

  test("analyzer should replace current_timestamp with literals") {
    val in = Project(Seq(Alias(CurrentTimestamp(), "a")(), Alias(CurrentTimestamp(), "b")()),
      LocalRelation())

    val min = System.currentTimeMillis() * 1000
    val plan = Optimize.execute(in.analyze).asInstanceOf[Project]
    val max = (System.currentTimeMillis() + 1) * 1000

    val lits = new scala.collection.mutable.ArrayBuffer[Long]
    plan.transformAllExpressions { case e: Literal =>
      lits += e.value.asInstanceOf[Long]
      e
    }
    assert(lits.size == 2)
    assert(lits(0) >= min && lits(0) <= max)
    assert(lits(1) >= min && lits(1) <= max)
    assert(lits(0) == lits(1))
  }

  test("analyzer should replace current_date with literals") {
    val in = Project(Seq(Alias(CurrentDate(), "a")(), Alias(CurrentDate(), "b")()), LocalRelation())

    val min = DateTimeUtils.millisToDays(System.currentTimeMillis())
    val plan = Optimize.execute(in.analyze).asInstanceOf[Project]
    val max = DateTimeUtils.millisToDays(System.currentTimeMillis())

    val lits = new scala.collection.mutable.ArrayBuffer[Int]
    plan.transformAllExpressions { case e: Literal =>
      lits += e.value.asInstanceOf[Int]
      e
    }
    assert(lits.size == 2)
    assert(lits(0) >= min && lits(0) <= max)
    assert(lits(1) >= min && lits(1) <= max)
    assert(lits(0) == lits(1))
  }
} 
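The suite leans on DateTimeUtils.millisToDays to bracket the expected literal: it is the same helper Spark 2.x uses to turn an epoch timestamp into the Int day count that backs DateType. A short sketch of the conversion and its inverse:

import org.apache.spark.sql.catalyst.util.DateTimeUtils

object DaysSinceEpochDemo {
  def main(args: Array[String]): Unit = {
    val nowMs = System.currentTimeMillis()
    // DateType values are stored as days since 1970-01-01 (Spark 2.x API;
    // millisToDays interprets the millis in the default time zone).
    val days: Int = DateTimeUtils.millisToDays(nowMs)
    println(days)
    println(DateTimeUtils.toJavaDate(days)) // today's date as java.sql.Date
  }
}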
Example 28
Source File: JacksonGeneratorSuite.scala    From XSQL   with Apache License 2.0
package org.apache.spark.sql.catalyst.json

import java.io.CharArrayWriter

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.util.{ArrayBasedMapData, DateTimeUtils, GenericArrayData}
import org.apache.spark.sql.types._

class JacksonGeneratorSuite extends SparkFunSuite {

  val gmtId = DateTimeUtils.TimeZoneGMT.getID
  val option = new JSONOptions(Map.empty, gmtId)

  test("initial with StructType and write out a row") {
    val dataType = StructType(StructField("a", IntegerType) :: Nil)
    val input = InternalRow(1)
    val writer = new CharArrayWriter()
    val gen = new JacksonGenerator(dataType, writer, option)
    gen.write(input)
    gen.flush()
    assert(writer.toString === """{"a":1}""")
  }

  test("initial with StructType and write out rows") {
    val dataType = StructType(StructField("a", IntegerType) :: Nil)
    val input = new GenericArrayData(InternalRow(1) :: InternalRow(2) :: Nil)
    val writer = new CharArrayWriter()
    val gen = new JacksonGenerator(dataType, writer, option)
    gen.write(input)
    gen.flush()
    assert(writer.toString === """[{"a":1},{"a":2}]""")
  }

  test("initial with StructType and write out an array with single empty row") {
    val dataType = StructType(StructField("a", IntegerType) :: Nil)
    val input = new GenericArrayData(InternalRow(null) :: Nil)
    val writer = new CharArrayWriter()
    val gen = new JacksonGenerator(dataType, writer, option)
    gen.write(input)
    gen.flush()
    assert(writer.toString === """[{}]""")
  }

  test("initial with StructType and write out an empty array") {
    val dataType = StructType(StructField("a", IntegerType) :: Nil)
    val input = new GenericArrayData(Nil)
    val writer = new CharArrayWriter()
    val gen = new JacksonGenerator(dataType, writer, option)
    gen.write(input)
    gen.flush()
    assert(writer.toString === """[]""")
  }

  test("initial with Map and write out a map data") {
    val dataType = MapType(StringType, IntegerType)
    val input = ArrayBasedMapData(Map("a" -> 1))
    val writer = new CharArrayWriter()
    val gen = new JacksonGenerator(dataType, writer, option)
    gen.write(input)
    gen.flush()
    assert(writer.toString === """{"a":1}""")
  }

  test("initial with Map and write out an array of maps") {
    val dataType = MapType(StringType, IntegerType)
    val input = new GenericArrayData(
      ArrayBasedMapData(Map("a" -> 1)) :: ArrayBasedMapData(Map("b" -> 2)) :: Nil)
    val writer = new CharArrayWriter()
    val gen = new JacksonGenerator(dataType, writer, option)
    gen.write(input)
    gen.flush()
    assert(writer.toString === """[{"a":1},{"b":2}]""")
  }

  test("error handling: initial with StructType but error calling write a map") {
    val dataType = StructType(StructField("a", IntegerType) :: Nil)
    val input = ArrayBasedMapData(Map("a" -> 1))
    val writer = new CharArrayWriter()
    val gen = new JacksonGenerator(dataType, writer, option)
    intercept[UnsupportedOperationException] {
      gen.write(input)
    }
  }

  test("error handling: initial with MapType and write out a row") {
    val dataType = MapType(StringType, IntegerType)
    val input = InternalRow(1)
    val writer = new CharArrayWriter()
    val gen = new JacksonGenerator(dataType, writer, option)
    intercept[UnsupportedOperationException] {
      gen.write(input)
    }
  }

}
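As a complement to the row and map cases above, a sketch showing how the generator renders a TimestampType column, where the internal Long is microseconds since the epoch. The demo object is hypothetical, and the expected output assumes the default timestamp format:

package org.apache.spark.sql.catalyst.json

import java.io.CharArrayWriter

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.util.DateTimeUtils
import org.apache.spark.sql.types._

object JacksonGeneratorTimestampDemo {
  def main(args: Array[String]): Unit = {
    val dataType = StructType(StructField("ts", TimestampType) :: Nil)
    val option = new JSONOptions(Map.empty, DateTimeUtils.TimeZoneGMT.getID)
    val writer = new CharArrayWriter()
    val gen = new JacksonGenerator(dataType, writer, option)
    gen.write(InternalRow(0L)) // 0 microseconds == the epoch
    gen.flush()
    println(writer.toString) // {"ts":"1970-01-01T00:00:00.000Z"} with the default format
  }
}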