org.apache.spark.sql.types._ Scala Examples

The following examples show how to use org.apache.spark.sql.types._. Each example is taken from an open-source project; the source file, project name, and license are noted above each example.
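As a minimal standalone sketch (not taken from any of the projects below), the types in this package are most commonly used to declare a DataFrame schema explicitly:

import org.apache.spark.sql.types._

// A two-column schema: a non-nullable numeric id and a nullable name.
val schema = StructType(Seq(
  StructField("id", LongType, nullable = false),
  StructField("name", StringType, nullable = true)
))

Such a schema can then be handed to a reader instead of relying on inference, for example spark.read.schema(schema).csv(path) with an existing SparkSession named spark (both spark and path are assumed here).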
Example 1
Source File: MiscStatement.scala    From spark-snowflake   with Apache License 2.0
package net.snowflake.spark.snowflake.pushdowns.querygeneration

import net.snowflake.spark.snowflake.{
  ConstantString,
  EmptySnowflakeSQLStatement,
  IntVariable,
  SnowflakeSQLStatement
}
import org.apache.spark.sql.catalyst.expressions.{
  Alias,
  Ascending,
  Attribute,
  Cast,
  DenseRank,
  Descending,
  Expression,
  If,
  In,
  InSet,
  Literal,
  MakeDecimal,
  PercentRank,
  Rank,
  ScalarSubquery,
  ShiftLeft,
  ShiftRight,
  SortOrder,
  UnscaledValue,
  WindowExpression,
  WindowSpecDefinition
}
import org.apache.spark.sql.types.{Decimal, _}


private[querygeneration] object MiscStatement {

  /** Maps a Spark SQL DataType to the Snowflake type name used in a CAST, or None if unsupported. */
  private[querygeneration] final def getCastType(t: DataType): Option[String] =
    Option(t match {
      case StringType => "VARCHAR"
      case BinaryType => "BINARY"
      case DateType => "DATE"
      case TimestampType => "TIMESTAMP"
      case d: DecimalType =>
        "DECIMAL(" + d.precision + ", " + d.scale + ")"
      case IntegerType | LongType => "NUMBER"
      case FloatType => "FLOAT"
      case DoubleType => "DOUBLE"
      case _ => null
    })
} 
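For orientation (this call sketch is not part of the original file), the mapping above turns Spark SQL types into Snowflake cast targets and yields None for anything it does not handle:

getCastType(DecimalType(10, 2))    // Some("DECIMAL(10, 2)")
getCastType(IntegerType)           // Some("NUMBER")
getCastType(ArrayType(StringType)) // None, since the match falls through to null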
Example 2
Source File: KustoResponseDeserializer.scala    From azure-kusto-spark   with Apache License 2.0
package com.microsoft.kusto.spark.datasource

import java.sql.Timestamp
import java.util

import com.microsoft.azure.kusto.data.{KustoResultColumn, KustoResultSetTable, Results}
import com.microsoft.kusto.spark.utils.DataTypeMapping
import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema
import org.apache.spark.sql.types.{StructType, _}
import org.joda.time.DateTime

import scala.collection.JavaConverters._
import scala.collection.mutable
import scala.collection.mutable.ArrayBuffer

object KustoResponseDeserializer {
  def apply(kustoResult: KustoResultSetTable): KustoResponseDeserializer = new KustoResponseDeserializer(kustoResult)
}

// Timespan columns are cast to strings on the Kusto side. A simple test that translated them into a Duration string
// in timespan format performed worse: one approach used a new expression extending UnaryExpression, another used a
// UDF, and both were slower than keeping the string representation.
case class KustoSchema(sparkSchema: StructType, toStringCastedColumns: Set[String])

class KustoResponseDeserializer(val kustoResult: KustoResultSetTable) {
  val schema: KustoSchema = getSchemaFromKustoResult

  private def getValueTransformer(valueType: String): Any => Any = {

    valueType.toLowerCase() match {
      case "string" => value: Any => value
      case "int64" => value: Any => value
      case "datetime" => value: Any => new Timestamp(new DateTime(value).getMillis)
      case "timespan" => value: Any => value
      case "sbyte" => value: Any => value
      case "long" => value: Any => value match {
        case i: Int => i.toLong
        case _ => value.asInstanceOf[Long]
      }
      case "double" => value: Any => value
      case "decimal" => value: Any => BigDecimal(value.asInstanceOf[String])
      case "int" => value: Any => value
      case "int32" => value: Any => value
      case "bool" => value: Any => value
      case "real" => value: Any => value
      case _ => value: Any => value.toString
    }
  }

  private def getSchemaFromKustoResult: KustoSchema = {
    if (kustoResult.getColumns.isEmpty) {
      KustoSchema(StructType(List()), Set())
    } else {
      val columns = kustoResult.getColumns

      KustoSchema(StructType(columns.map(col => StructField(col.getColumnName,
            DataTypeMapping.kustoTypeToSparkTypeMap.getOrElse(col.getColumnType.toLowerCase, StringType)))),
        columns.filter(c => c.getColumnType.equalsIgnoreCase("TimeSpan")).map(c => c.getColumnName).toSet)
    }
  }

  def getSchema: KustoSchema = { schema }

  def toRows: java.util.List[Row] = {
    val columnInOrder = kustoResult.getColumns
    val value: util.ArrayList[Row] = new util.ArrayList[Row](kustoResult.count())

    // Compute the per-column transformer functions once, to be applied to each row's values by column order.
    val valueTransformers: mutable.Seq[Any => Any] = columnInOrder.map(col => getValueTransformer(col.getColumnType))
    kustoResult.getData.asScala.foreach(row => {
      val genericRow = row.toArray().zipWithIndex.map(
        column => {
          if (column._1 == null) null else valueTransformers(column._2)(column._1)
        })
      value.add(new GenericRowWithSchema(genericRow, schema.sparkSchema))
    })

    value
  }

//  private def getOrderedColumnName = {
//    val columnInOrder = ArrayBuffer.fill(kustoResult.getColumnNameToIndex.size()){ "" }
//    kustoResult.getColumns.foreach((columnIndexPair: KustoResultColumn) => columnInOrder(columnIndexPair.) = columnIndexPair._1)
//    columnInOrder
//  }
} 
Example 3
Source File: XGBoostBigModel.scala    From uberdata   with Apache License 2.0
package org.apache.spark.ml


import com.cloudera.sparkts.models.UberXGBoostModel
import eleflow.uberdata.IUberdataForecastUtil
import eleflow.uberdata.core.data.DataTransformer
import eleflow.uberdata.enums.SupportedAlgorithm
import ml.dmlc.xgboost4j.scala.spark.XGBoostModel
import ml.dmlc.xgboost4j.LabeledPoint
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.ml.linalg.{VectorUDT, Vector => SparkVector}
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.feature.{LabeledPoint => SparkLabeledPoint}
import org.apache.spark.ml.param.shared.{HasIdCol, HasLabelCol}
import org.apache.spark.sql.{DataFrame, Row}
import org.apache.spark.sql.Dataset
import org.apache.spark.sql.types.{StructField, _}


class XGBoostBigModel[I](val uid: String, val models: Seq[(ParamMap, XGBoostModel)])
    extends ForecastBaseModel[XGBoostBigModel[I]]
    with HasLabelCol
    with HasIdCol {

  def setLabelcol(label: String): this.type = set(labelCol, label)

  def setIdcol(id: String): this.type = set(idCol, id)

  override def copy(extra: ParamMap): XGBoostBigModel[I] = new XGBoostBigModel[I](uid, models)

  override def transform(dataSet: Dataset[_]): DataFrame = {
    val prediction = predict(dataSet)
    val rows = dataSet.rdd
      .map {
        case (row: Row) =>
          (DataTransformer.toFloat(row.getAs($(idCol))),
            row.getAs[SparkVector](IUberdataForecastUtil.FEATURES_COL_NAME)
            )
      }
      .join(prediction)
      .map {
        case (id, (features, predictValue)) =>
          Row(id, features, SupportedAlgorithm.XGBoostAlgorithm.toString, predictValue)
      }
    dataSet.sqlContext.createDataFrame(rows, transformSchema(dataSet.schema))
  }

  protected def predict(dataSet: Dataset[_]) = {
    val features = dataSet.rdd.map { case (row: Row) =>
      val features = row.getAs[SparkVector](IUberdataForecastUtil.FEATURES_COL_NAME)
      val id = row.getAs[I]($(idCol))
      SparkLabeledPoint(DataTransformer.toFloat(id), features)
    }.cache
    val (_, model) = models.head
    UberXGBoostModel.labelPredict(features.map(_.features.toDense), booster = model)
  }

  @DeveloperApi
  override def transformSchema(schema: StructType): StructType =
    StructType(getPredictionSchema)

  protected def getPredictionSchema: Array[StructField] = {
    Array(
      StructField($(idCol), FloatType),
      StructField(IUberdataForecastUtil.FEATURES_COL_NAME, new VectorUDT),
      StructField(IUberdataForecastUtil.ALGORITHM, StringType),
      StructField("prediction", FloatType)
    )
  }
} 
Example 4
Source File: XGBoostBigModelTimeSeries.scala    From uberdata   with Apache License 2.0
package org.apache.spark.ml

import java.sql.Timestamp

import eleflow.uberdata.IUberdataForecastUtil
import eleflow.uberdata.core.data.DataTransformer
import eleflow.uberdata.enums.SupportedAlgorithm
import ml.dmlc.xgboost4j.scala.spark.XGBoostModel
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.ml.linalg.{VectorUDT, Vector => SparkVector}
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.param.shared.HasTimeCol
import org.apache.spark.sql.{DataFrame, Row}
import org.apache.spark.sql.Dataset
import org.apache.spark.sql.types.{StructField, _}


class XGBoostBigModelTimeSeries[I](override val uid: String,
                                   override val models: Seq[(ParamMap, XGBoostModel)])
                                  extends XGBoostBigModel[I](uid, models) with HasTimeCol {

  def setTimecol(time: String): this.type = set(timeCol, Some(time))

  override def transform(dataSet: Dataset[_]): DataFrame = {
    val prediction = predict(dataSet)
    val rows = dataSet.rdd
      .map {
        case (row: Row) =>
          (DataTransformer.toFloat(row.getAs($(idCol))),
            (row.getAs[SparkVector](IUberdataForecastUtil.FEATURES_COL_NAME),
              row.getAs[java.sql.Timestamp]($(timeCol).get)))
      }
      .join(prediction)
      .map {
        case (id, ((features, time), predictValue)) =>
          Row(id, features, time, SupportedAlgorithm.XGBoostAlgorithm.toString, predictValue)
      }
    dataSet.sqlContext.createDataFrame(rows, transformSchema(dataSet.schema))
  }


  @DeveloperApi
  override def transformSchema(schema: StructType): StructType =
    StructType(Array(
      StructField($(idCol), FloatType),
      StructField(IUberdataForecastUtil.FEATURES_COL_NAME, new VectorUDT),
      StructField($(timeCol).get, TimestampType),
      StructField(IUberdataForecastUtil.ALGORITHM, StringType),
      StructField("prediction", FloatType)
    ))
} 
Example 5
Source File: DruidPlannerHelper.scala    From spark-druid-olap   with Apache License 2.0
package org.apache.spark.sql.sources.druid

import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.catalyst.plans.logical.{Aggregate, LogicalPlan}
import org.apache.spark.sql.types.{DecimalType, _}
import org.sparklinedata.druid.DruidOperatorAttribute

trait DruidPlannerHelper {

  def unalias(e: Expression, agg: Aggregate): Option[Expression] = {

    agg.aggregateExpressions.find { aE =>
      (aE, e) match {
        case _ if aE == e => true
        case (_, e:AttributeReference) if e.exprId == aE.exprId => true
        case (Alias(child, _), e) if child == e => true
        case _ => false
      }
    }.map {
      case Alias(child, _) => child
      case x => x
    }

  }

  def findAttribute(e: Expression): Option[AttributeReference] = {
    e.find(_.isInstanceOf[AttributeReference]).map(_.asInstanceOf[AttributeReference])
  }

  def positionOfAttribute(e: Expression,
                          plan: LogicalPlan): Option[(Expression, (AttributeReference, Int))] = {
    for (aR <- findAttribute(e);
         attr <- plan.output.zipWithIndex.find(t => t._1.exprId == aR.exprId))
      yield (e, (aR, attr._2))
  }

  def exprIdToAttribute(e: Expression,
                          plan: LogicalPlan): Option[(ExprId, Int)] = {
    for (aR <- findAttribute(e);
         attr <- plan.output.zipWithIndex.find(t => t._1.exprId == aR.exprId))
      yield (aR.exprId, attr._2)
  }

  
  case class GroupingInfo(gEs: Seq[Expression],
                          expandOpGExps : Seq[Expression],
                          aEs: Seq[NamedExpression],
                          expandOpProjection : Seq[Expression],
                          aEExprIdToPos : Map[ExprId, Int],
                          aEToLiteralExpr: Map[Expression, Expression] = Map())

  def isNumericType(dt : DataType) : Boolean = NumericType.acceptsType(dt)

} 
Example 6
Source File: RawCsvRDDToDataframe.scala    From seahorse   with Apache License 2.0
package org.apache.spark.sql.execution.datasources.csv

import org.apache.spark.rdd.RDD
import org.apache.spark.sql._
import org.apache.spark.sql.execution.LogicalRDD
import org.apache.spark.sql.types.{StructType, _}


object RawCsvRDDToDataframe {

  def parse(
      rdd: RDD[String],
      sparkSession: SparkSession,
      options: Map[String, String]): DataFrame = {
    val csvOptions = MapToCsvOptions(options, sparkSession.sessionState.conf)
    val csvReader = SparkCsvReader.create(csvOptions)
    val firstLine = findFirstLine(csvOptions, rdd)
    val firstRow = csvReader.parseLine(firstLine)
    val header = if (csvOptions.headerFlag) {
      firstRow.zipWithIndex.map { case (value, index) =>
        if (value == null || value.isEmpty || value == csvOptions.nullValue) s"_c$index" else value
      }
    } else {
      firstRow.zipWithIndex.map { case (value, index) => s"_c$index" }
    }

    val parsedRdd = tokenRdd(rdd, header, csvOptions)

    // TODO Migrate to Spark's schema inferencer eventually
    // val schema = CSVInferSchema.infer(parsedRdd, header, csvOptions)
    val schema = {
      val schemaFields = header.map { fieldName =>
        StructField(fieldName.toString, StringType, nullable = true)
      }
      StructType(schemaFields)
    }

    val ignoreMalformedRows = 0
    val internalRows = parsedRdd.flatMap { row =>
      val parser = CSVRelation.csvParser(schema, header, csvOptions)
      parser(row, ignoreMalformedRows)
    }

    Dataset.ofRows(
      sparkSession,
      LogicalRDD(
        schema.toAttributes,
        internalRows)(sparkSession))
  }

  private def tokenRdd(
      rdd: RDD[String],
      header: Array[String],
      options: CSVOptions): RDD[Array[String]] = {
    // Make sure firstLine is materialized before sending to executors
    val firstLine = if (options.headerFlag) findFirstLine(options, rdd) else null
    SparkCsvReader.univocityTokenizer(rdd, header, firstLine, options)
  }

  private def findFirstLine(options: CSVOptions, rdd: RDD[String]): String = {
    if (options.isCommentSet) {
      val comment = options.comment.toString
      rdd.filter { line =>
        line.trim.nonEmpty && !line.startsWith(comment)
      }.first()
    } else {
      rdd.filter { line =>
        line.trim.nonEmpty
      }.first()
    }
  }

} 
Example 7
Source File: RawCsvRDDToDataframe.scala    From seahorse   with Apache License 2.0
package org.apache.spark.sql.execution.datasources.csv

import scala.util.Try

import com.univocity.parsers.csv.CsvParser
import org.apache.spark.rdd.RDD
import org.apache.spark.sql._
import org.apache.spark.sql.execution.LogicalRDD
import org.apache.spark.sql.types.{StructType, _}



object RawCsvRDDToDataframe {

  def parse(
      rdd: RDD[String],
      sparkSession: SparkSession,
      options: Map[String, String]): DataFrame = {
    val csvOptions = new CSVOptions(options, sparkSession.sessionState.conf.sessionLocalTimeZone)
    val csvReader = new CsvParser(csvOptions.asParserSettings)
    val firstLine = findFirstLine(csvOptions, rdd)
    val firstRow = csvReader.parseLine(firstLine)
    val header = if (csvOptions.headerFlag) {
      firstRow.zipWithIndex.map { case (value, index) =>
        if (value == null || value.isEmpty || value == csvOptions.nullValue) s"_c$index" else value
      }
    } else {
      firstRow.zipWithIndex.map { case (value, index) => s"_c$index" }
    }

    // TODO Migrate to Spark's schema inferencer eventually
    // val schema = CSVInferSchema.infer(parsedRdd, header, csvOptions)
    val schema = {
      val schemaFields = header.map { fieldName =>
        StructField(fieldName.toString, StringType, nullable = true)
      }
      StructType(schemaFields)
    }

    val withoutHeader = if (csvOptions.headerFlag) {
      rdd.zipWithIndex()
        .filter { case (row, index) => index != 0 }
        .map { case (row, index) => row }
    }
    else {
      rdd
    }


    val internalRows = withoutHeader
      .filter(row => row.trim.nonEmpty)
      .flatMap { row =>
        val univocityParser = new UnivocityParser(schema, csvOptions)
        Try(univocityParser.parse(row)).toOption
      }

    Dataset.ofRows(
      sparkSession,
      LogicalRDD(
        schema.toAttributes,
        internalRows)(sparkSession))
  }

  private def findFirstLine(options: CSVOptions, rdd: RDD[String]): String = {
    if (options.isCommentSet) {
      val comment = options.comment.toString
      rdd.filter { line =>
        line.trim.nonEmpty && !line.startsWith(comment)
      }.first()
    } else {
      rdd.filter { line =>
        line.trim.nonEmpty
      }.first()
    }
  }

} 
Example 8
Source File: CustomSchemaTest.scala    From spark-sftp   with Apache License 2.0
package com.springml.spark.sftp

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.types.{IntegerType, LongType, StringType, StructField, _}
import org.scalatest.{BeforeAndAfterEach, FunSuite}


class CustomSchemaTest extends FunSuite with BeforeAndAfterEach {
  var ss: SparkSession = _

  val csvTypesMap = Map("ProposalId" -> IntegerType,
    "OpportunityId" -> StringType,
    "Clicks" -> LongType,
    "Impressions" -> LongType
  )

  val jsonTypesMap = Map("name" -> StringType,
    "age" -> IntegerType
  )

  override def beforeEach() {
    ss = SparkSession.builder().master("local").appName("Custom Schema Test").getOrCreate()
  }

  private def validateTypes(field : StructField, typeMap : Map[String, DataType]) = {
    val expectedType = typeMap(field.name)
    assert(expectedType == field.dataType)
  }

  private def columnArray(typeMap: Map[String, DataType]): Array[StructField] = {
    // Build one nullable StructField per (name, type) entry; copying into a pre-sized empty
    // Array[StructField] with copyToArray would copy nothing and leave the expected schema empty.
    typeMap.map { case (name, dataType) => StructField(name, dataType, nullable = true) }.toArray
  }

  test ("Read CSV with custom schema") {
    val columnStruct = columnArray(csvTypesMap)
    val expectedSchema = StructType(columnStruct)

    val fileLocation = getClass.getResource("/sample.csv").getPath
    val dsr = DatasetRelation(fileLocation, "csv", "false", "true", ",", "\"", "\\", "false", null, expectedSchema, ss.sqlContext)
    val rdd = dsr.buildScan()

    assert(dsr.schema.fields.length == columnStruct.length)
    dsr.schema.fields.foreach(s => validateTypes(s, csvTypesMap))
  }

  test ("Read Json with custom schema") {
    val columnStruct = columnArray(jsonTypesMap)
    val expectedSchema = StructType(columnStruct)

    val fileLocation = getClass.getResource("/people.json").getPath
    val dsr = DatasetRelation(fileLocation, "json", "false", "true", ",", "\"", "\\", "false", null, expectedSchema, ss.sqlContext)
    val rdd = dsr.buildScan()

    assert(dsr.schema.fields.length == columnStruct.length)
    dsr.schema.fields.foreach(s => validateTypes(s, jsonTypesMap))
  }

} 
Example 9
Source File: KafkaSink.scala    From Spark-Structured-Streaming-Examples   with Apache License 2.0
package kafka

import org.apache.spark.sql.{DataFrame, Dataset}
import org.apache.spark.sql.functions.{struct, to_json, _}
import _root_.log.LazyLogger
import org.apache.spark.sql.streaming.StreamingQuery
import org.apache.spark.sql.types.{StringType, _}
import radio.{SimpleSongAggregation, SimpleSongAggregationKafka}
import spark.SparkHelper

object KafkaSink extends LazyLogger {
  private val spark = SparkHelper.getSparkSession()

  import spark.implicits._

  def writeStream(staticInputDS: Dataset[SimpleSongAggregation]) : StreamingQuery = {
    log.warn("Writing to Kafka")
    staticInputDS
      .select(to_json(struct($"*")).cast(StringType).alias("value"))
      .writeStream
      .outputMode("update")
      .format("kafka")
      .option("kafka.bootstrap.servers", KafkaService.bootstrapServers)
      .queryName("Kafka - Count number of broadcasts for a title/artist by radio")
      .option("topic", "test")
      .start()
  }

  
  def debugStream(staticKafkaInputDS: Dataset[SimpleSongAggregationKafka]) = {
    staticKafkaInputDS
      .writeStream
      .queryName("Debug Stream Kafka")
      .format("console")
      .start()
  }
} 
Example 10
Source File: KafkaSource.scala    From Spark-Structured-Streaming-Examples   with Apache License 2.0
package kafka

import org.apache.spark.sql.{DataFrame, Dataset}
import org.apache.spark.sql.functions.{struct, to_json, _}
import _root_.log.LazyLogger
import org.apache.spark.sql.types.{StringType, _}
import radio.{SimpleSongAggregation, SimpleSongAggregationKafka}
import spark.SparkHelper


object KafkaSource extends LazyLogger {
  private val spark = SparkHelper.getSparkSession()

  import spark.implicits._

  def read(startingOption: String = "startingOffsets", partitionsAndOffsets: String = "earliest") : Dataset[SimpleSongAggregationKafka] = {
      log.warn("Reading from Kafka")

      spark
      .readStream
      .format("kafka")
      .option("kafka.bootstrap.servers", "localhost:9092")
      .option("subscribe", KafkaService.topicName)
      .option("enable.auto.commit", false) // Cannot be set to true in Spark Strucutured Streaming https://spark.apache.org/docs/latest/structured-streaming-kafka-integration.html#kafka-specific-configurations
      .option("group.id", "Structured-Streaming-Examples")
      .option("failOnDataLoss", false) // when starting a fresh kafka (default location is temporary (/tmp) and cassandra is not (var/lib)), we have saved different offsets in Cassandra than real offsets in kafka (that contains nothing)
      .option(startingOption, partitionsAndOffsets) //this only applies when a new query is started and that resuming will always pick up from where the query left off
      .load()
      .withColumn(KafkaService.radioStructureName, // nested structure with our json
        from_json($"value".cast(StringType), KafkaService.schemaOutput) //From binary to JSON object
      ).as[SimpleSongAggregationKafka]
      .filter(_.radioCount != null) //TODO find a better way to filter bad json
  }
} 
Example 11
Source File: RawCsvRDDToDataframe.scala    From seahorse-workflow-executor   with Apache License 2.0
package org.apache.spark.sql.execution.datasources.csv

import org.apache.spark.rdd.RDD
import org.apache.spark.sql._
import org.apache.spark.sql.execution.LogicalRDD
import org.apache.spark.sql.types.{StructType, _}

import io.deepsense.sparkutils.SparkSQLSession


object RawCsvRDDToDataframe {

  def parse(
      rdd: RDD[String],
      sparkSQLSession: SparkSQLSession,
      options: Map[String, String]): DataFrame = {
    val csvOptions = new CSVOptions(options)
    val lineCsvReader = new LineCsvReader(csvOptions)
    val firstLine = findFirstLine(csvOptions, rdd)
    val firstRow = lineCsvReader.parseLine(firstLine)

    val header = if (csvOptions.headerFlag) {
      firstRow.zipWithIndex.map { case (value, index) =>
        if (value == null || value.isEmpty || value == csvOptions.nullValue) s"_c$index" else value
      }
    } else {
      firstRow.zipWithIndex.map { case (value, index) => s"_c$index" }
    }

    val parsedRdd = tokenRdd(rdd, csvOptions, header)

    // TODO Migrate to Spark's schema inferencer eventually
    // val schema = CSVInferSchema.infer(parsedRdd, header, csvOptions)
    val schema = {
      val schemaFields = header.map { fieldName =>
        StructField(fieldName.toString, StringType, nullable = true)
      }
      StructType(schemaFields)
    }

    val ignoreMalformedRows = 0
    val internalRows = parsedRdd.flatMap { row =>
      val parser = CSVRelation.csvParser(schema, header, csvOptions)
      parser(row, ignoreMalformedRows)
    }

    val sparkSession = sparkSQLSession.getSparkSession
    Dataset.ofRows(
      sparkSession,
      LogicalRDD(
        schema.toAttributes,
        internalRows)(sparkSession))
  }

  private def tokenRdd(
      rdd: RDD[String],
      options: CSVOptions,
      header: Array[String]): RDD[Array[String]] = {
    // Make sure firstLine is materialized before sending to executors
    val firstLine = if (options.headerFlag) findFirstLine(options, rdd) else null
    CSVRelation.univocityTokenizer(rdd, header, firstLine, options)
  }

  private def findFirstLine(options: CSVOptions, rdd: RDD[String]): String = {
    if (options.isCommentSet) {
      val comment = options.comment.toString
      rdd.filter { line =>
        line.trim.nonEmpty && !line.startsWith(comment)
      }.first()
    } else {
      rdd.filter { line =>
        line.trim.nonEmpty
      }.first()
    }
  }

}
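For reference, a hypothetical call site for the parser above might look like the following (the rawLines RDD, the input path, and the options map are assumptions, not part of the project):

val rawLines: RDD[String] = sparkSQLSession.getSparkSession.sparkContext.textFile("/tmp/input.csv")
val df: DataFrame = RawCsvRDDToDataframe.parse(rawLines, sparkSQLSession, Map("header" -> "true"))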