org.apache.spark.sql.catalyst.util.CaseInsensitiveMap Scala Examples

The following examples show how to use org.apache.spark.sql.catalyst.util.CaseInsensitiveMap. Each example notes the source file, the project it comes from, and its license, so you can locate the original code.
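Before the project examples, here is a minimal sketch (not taken from any project below) of what CaseInsensitiveMap does: it wraps an ordinary Map so that key lookups ignore case.

import org.apache.spark.sql.catalyst.util.CaseInsensitiveMap

val options = CaseInsensitiveMap(Map("brokerUrl" -> "tcp://localhost:1883"))
options.get("brokerurl")   // Some("tcp://localhost:1883") -- lookup ignores key case
options.get("BROKERURL")   // Some("tcp://localhost:1883")
options.get("missing")     // None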
Example 1
Source File: MQTTUtils.scala    From bahir   with Apache License 2.0
package org.apache.bahir.sql.streaming.mqtt

import java.util.Properties

import org.eclipse.paho.client.mqttv3.{MqttClient, MqttClientPersistence, MqttConnectOptions}
import org.eclipse.paho.client.mqttv3.persist.{MemoryPersistence, MqttDefaultFilePersistence}

import org.apache.spark.sql.catalyst.util.CaseInsensitiveMap

import org.apache.bahir.utils.Logging

object MQTTUtils extends Logging {
  // Since data source configuration properties are case-insensitive,
  // we have to introduce our own keys. Also, good for vendor independence.
  private[mqtt] val sslParamMapping = Map(
    "ssl.protocol" -> "com.ibm.ssl.protocol",
    "ssl.key.store" -> "com.ibm.ssl.keyStore",
    "ssl.key.store.password" -> "com.ibm.ssl.keyStorePassword",
    "ssl.key.store.type" -> "com.ibm.ssl.keyStoreType",
    "ssl.key.store.provider" -> "com.ibm.ssl.keyStoreProvider",
    "ssl.trust.store" -> "com.ibm.ssl.trustStore",
    "ssl.trust.store.password" -> "com.ibm.ssl.trustStorePassword",
    "ssl.trust.store.type" -> "com.ibm.ssl.trustStoreType",
    "ssl.trust.store.provider" -> "com.ibm.ssl.trustStoreProvider",
    "ssl.ciphers" -> "com.ibm.ssl.enabledCipherSuites",
    "ssl.key.manager" -> "com.ibm.ssl.keyManager",
    "ssl.trust.manager" -> "com.ibm.ssl.trustManager"
  )

  def parseConfigParams(config: Map[String, String]):
      (String, String, String, MqttClientPersistence, MqttConnectOptions, Int, Long, Long, Int) = {
    def e(s: String) = new IllegalArgumentException(s)
    val parameters = CaseInsensitiveMap(config)

    val brokerUrl: String = parameters.getOrElse("brokerUrl", parameters.getOrElse("path",
      throw e("Please provide a `brokerUrl` by specifying path or .options(\"brokerUrl\",...)")))

    val persistence: MqttClientPersistence = parameters.get("persistence") match {
      case Some("memory") => new MemoryPersistence()
      case _ => val localStorage: Option[String] = parameters.get("localStorage")
        localStorage match {
          case Some(x) => new MqttDefaultFilePersistence(x)
          case None => new MqttDefaultFilePersistence()
        }
    }

    // If the default were to subscribe to everything, it would pull in a lot of unwanted system messages.
    val topic: String = parameters.getOrElse("topic",
      throw e("Please specify a topic, by .options(\"topic\",...)"))

    val clientId: String = parameters.getOrElse("clientId", {
      log.warn("If `clientId` is not set, a random value is picked up." +
        "\nRecovering from failure is not supported in such a case.")
      MqttClient.generateClientId()})

    val username: Option[String] = parameters.get("username")
    val password: Option[String] = parameters.get("password")
    val connectionTimeout: Int = parameters.getOrElse("connectionTimeout",
      MqttConnectOptions.CONNECTION_TIMEOUT_DEFAULT.toString).toInt
    val keepAlive: Int = parameters.getOrElse("keepAlive", MqttConnectOptions
      .KEEP_ALIVE_INTERVAL_DEFAULT.toString).toInt
    val mqttVersion: Int = parameters.getOrElse("mqttVersion", MqttConnectOptions
      .MQTT_VERSION_DEFAULT.toString).toInt
    val cleanSession: Boolean = parameters.getOrElse("cleanSession", "false").toBoolean
    val qos: Int = parameters.getOrElse("QoS", "1").toInt
    val autoReconnect: Boolean = parameters.getOrElse("autoReconnect", "false").toBoolean
    val maxInflight: Int = parameters.getOrElse("maxInflight", "60").toInt

    val maxBatchMessageNum = parameters.getOrElse("maxBatchMessageNum", s"${Long.MaxValue}").toLong
    val maxBatchMessageSize = parameters.getOrElse("maxBatchMessageSize",
      s"${Long.MaxValue}").toLong
    val maxRetryNumber = parameters.getOrElse("maxRetryNum", "3").toInt

    val mqttConnectOptions: MqttConnectOptions = new MqttConnectOptions()
    mqttConnectOptions.setAutomaticReconnect(autoReconnect)
    mqttConnectOptions.setCleanSession(cleanSession)
    mqttConnectOptions.setConnectionTimeout(connectionTimeout)
    mqttConnectOptions.setKeepAliveInterval(keepAlive)
    mqttConnectOptions.setMqttVersion(mqttVersion)
    mqttConnectOptions.setMaxInflight(maxInflight)
    (username, password) match {
      case (Some(u: String), Some(p: String)) =>
        mqttConnectOptions.setUserName(u)
        mqttConnectOptions.setPassword(p.toCharArray)
      case _ =>
    }
    val sslProperties = new Properties()
    config.foreach(e => {
      if (e._1.startsWith("ssl.")) {
        sslProperties.setProperty(sslParamMapping(e._1), e._2)
      }
    })
    mqttConnectOptions.setSSLProperties(sslProperties)

    (brokerUrl, clientId, topic, persistence, mqttConnectOptions, qos,
      maxBatchMessageNum, maxBatchMessageSize, maxRetryNumber)
  }
} 
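As an illustration only (not part of the bahir source), a hedged sketch of calling parseConfigParams with a hypothetical option map; because the map is wrapped in CaseInsensitiveMap, "qos" matches the "QoS" key read above:

val (brokerUrl, clientId, topic, persistence, connectOptions, qos,
    maxBatchMessageNum, maxBatchMessageSize, maxRetryNumber) =
  MQTTUtils.parseConfigParams(Map(
    "brokerUrl" -> "tcp://localhost:1883",   // hypothetical broker address
    "topic" -> "sensors/temperature",        // hypothetical topic
    "qos" -> "2",                            // resolved case-insensitively against "QoS"
    "cleanSession" -> "true"))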
Example 2
Source File: HiveAcidSinkOptions.scala    From spark-acid   with Apache License 2.0
package com.qubole.spark.hiveacid.streaming

import java.util.concurrent.TimeUnit

import org.apache.spark.sql.catalyst.util.CaseInsensitiveMap

import scala.util.Try

class HiveAcidSinkOptions(parameters: CaseInsensitiveMap[String]) {

  import HiveAcidSinkOptions._

  def this(parameters: Map[String, String]) = this(CaseInsensitiveMap(parameters))

  val tableName = parameters.get("table").getOrElse{
    throw new IllegalArgumentException("Table Name is not specified")
  }

  val fileCleanupDelayMs = withLongParameter(CLEANUP_DELAY_KEY, DEFAULT_CLEANUP_DELAY)

  val isDeletingExpiredLog = withBooleanParameter(LOG_DELETION_KEY, DEFAULT_LOG_DELETION)

  val compactInterval = withIntParameter(COMPACT_INTERVAL_KEY, DEFAULT_COMPACT_INTERVAL)

  val minBatchesToRetain = withIntParameter(MIN_BATCHES_TO_RETAIN_KEY, DEFAULT_MIN_BATCHES_TO_RETAIN)

  val metadataDir = parameters.get(METADATA_DIR_KEY)

  private def withIntParameter(name: String, default: Int): Int = {
    parameters.get(name).map { str =>
      Try(str.toInt).toOption.filter(_ > 0).getOrElse {
        throw new IllegalArgumentException(
          s"Invalid value '$str' for option '$name', must be a positive integer")
      }
    }.getOrElse(default)
  }

  private def withLongParameter(name: String, default: Long): Long = {
    parameters.get(name).map { str =>
      Try(str.toLong).toOption.filter(_ >= 0).getOrElse {
        throw new IllegalArgumentException(
          s"Invalid value '$str' for option '$name', must be a positive integer")
      }
    }.getOrElse(default)
  }

  private def withBooleanParameter(name: String, default: Boolean): Boolean = {
    parameters.get(name).map { str =>
      try {
        str.toBoolean
      } catch {
        case _: IllegalArgumentException =>
          throw new IllegalArgumentException(
            s"Invalid value '$str' for option '$name', must be true or false")
      }
    }.getOrElse(default)
  }

}

object HiveAcidSinkOptions {

  val DEFAULT_CLEANUP_DELAY = TimeUnit.MINUTES.toMillis(10)
  val DEFAULT_LOG_DELETION = true
  val DEFAULT_COMPACT_INTERVAL = 10
  val DEFAULT_MIN_BATCHES_TO_RETAIN = 100

  val CLEANUP_DELAY_KEY = "spark.acid.streaming.log.cleanupDelayMs"
  val LOG_DELETION_KEY = "spark.acid.streaming.log.deletion"
  val COMPACT_INTERVAL_KEY = "spark.acid.streaming.log.compactInterval"
  val MIN_BATCHES_TO_RETAIN_KEY = "spark.acid.streaming.log.minBatchesToRetain"
  val METADATA_DIR_KEY = "spark.acid.streaming.log.metadataDir"

} 
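A hedged usage sketch (assumed, not part of the spark-acid source): the options can be built from a plain Map, and the mandatory "table" key is resolved regardless of casing.

val sinkOptions = new HiveAcidSinkOptions(Map(
  "TABLE" -> "events",                                 // resolved case-insensitively to "table"
  "spark.acid.streaming.log.compactInterval" -> "20")) // overrides DEFAULT_COMPACT_INTERVAL (10)
// sinkOptions.tableName == "events"
// sinkOptions.compactInterval == 20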
Example 3
Source File: FileStreamOptions.scala    From Spark-2.3.1   with Apache License 2.0
package org.apache.spark.sql.execution.streaming

import scala.util.Try

import org.apache.spark.internal.Logging
import org.apache.spark.sql.catalyst.util.CaseInsensitiveMap
import org.apache.spark.util.Utils


class FileStreamOptions(parameters: CaseInsensitiveMap[String]) extends Logging {

  def this(parameters: Map[String, String]) = this(CaseInsensitiveMap(parameters))

  // Other options defined in the original file (maxFilesPerTrigger, latestFirst,
  // maxFileAgeMs, ...) are elided in this excerpt.

  val fileNameOnly: Boolean = withBooleanParameter("fileNameOnly", false)

  private def withBooleanParameter(name: String, default: Boolean) = {
    parameters.get(name).map { str =>
      try {
        str.toBoolean
      } catch {
        case _: IllegalArgumentException =>
          throw new IllegalArgumentException(
            s"Invalid value '$str' for option '$name', must be 'true' or 'false'")
      }
    }.getOrElse(default)
  }
} 
Example 4
Source File: ParquetOptions.scala    From Spark-2.3.1   with Apache License 2.0
package org.apache.spark.sql.execution.datasources.parquet

import java.util.Locale

import org.apache.parquet.hadoop.ParquetOutputFormat
import org.apache.parquet.hadoop.metadata.CompressionCodecName

import org.apache.spark.sql.catalyst.util.CaseInsensitiveMap
import org.apache.spark.sql.internal.SQLConf


class ParquetOptions(
    @transient private val parameters: CaseInsensitiveMap[String],
    @transient private val sqlConf: SQLConf)
  extends Serializable {

  import ParquetOptions._

  def this(parameters: Map[String, String], sqlConf: SQLConf) =
    this(CaseInsensitiveMap(parameters), sqlConf)

  // The compression-codec option defined in the original file is elided in this excerpt.

  val mergeSchema: Boolean = parameters
    .get(MERGE_SCHEMA)
    .map(_.toBoolean)
    .getOrElse(sqlConf.isParquetSchemaMergingEnabled)
}


object ParquetOptions {
  val MERGE_SCHEMA = "mergeSchema"

  // The parquet compression short names
  private val shortParquetCompressionCodecNames = Map(
    "none" -> CompressionCodecName.UNCOMPRESSED,
    "uncompressed" -> CompressionCodecName.UNCOMPRESSED,
    "snappy" -> CompressionCodecName.SNAPPY,
    "gzip" -> CompressionCodecName.GZIP,
    "lzo" -> CompressionCodecName.LZO)

  def getParquetCompressionCodecName(name: String): String = {
    shortParquetCompressionCodecNames(name).name()
  }
} 
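A hedged sketch of how these options resolve, assuming a SQLConf instance named sqlConf is in scope:

new ParquetOptions(Map("MERGESCHEMA" -> "true"), sqlConf).mergeSchema   // true; key matched case-insensitively
ParquetOptions.getParquetCompressionCodecName("snappy")                 // "SNAPPY"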
Example 5
Source File: OrcOptions.scala    From Spark-2.3.1   with Apache License 2.0
package org.apache.spark.sql.execution.datasources.orc

import java.util.Locale

import org.apache.orc.OrcConf.COMPRESS

import org.apache.spark.sql.catalyst.util.CaseInsensitiveMap
import org.apache.spark.sql.internal.SQLConf


class OrcOptions(
    @transient private val parameters: CaseInsensitiveMap[String],
    @transient private val sqlConf: SQLConf)
  extends Serializable {

  import OrcOptions._

  def this(parameters: Map[String, String], sqlConf: SQLConf) =
    this(CaseInsensitiveMap(parameters), sqlConf)

  val compressionCodec: String = {
    // `compression`, `orc.compress`(i.e., OrcConf.COMPRESS), and `spark.sql.orc.compression.codec`
    // are in order of precedence from highest to lowest.
    val orcCompressionConf = parameters.get(COMPRESS.getAttribute)
    val codecName = parameters
      .get("compression")
      .orElse(orcCompressionConf)
      .getOrElse(sqlConf.orcCompressionCodec)
      .toLowerCase(Locale.ROOT)
    if (!shortOrcCompressionCodecNames.contains(codecName)) {
      val availableCodecs = shortOrcCompressionCodecNames.keys.map(_.toLowerCase(Locale.ROOT))
      throw new IllegalArgumentException(s"Codec [$codecName] " +
        s"is not available. Available codecs are ${availableCodecs.mkString(", ")}.")
    }
    shortOrcCompressionCodecNames(codecName)
  }
}

object OrcOptions {
  // The ORC compression short names
  private val shortOrcCompressionCodecNames = Map(
    "none" -> "NONE",
    "uncompressed" -> "NONE",
    "snappy" -> "SNAPPY",
    "zlib" -> "ZLIB",
    "lzo" -> "LZO")

  def getORCCompressionCodecName(name: String): String = shortOrcCompressionCodecNames(name)
} 
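A hedged sketch of the precedence described in the comment above, assuming a SQLConf instance named sqlConf is in scope:

// "compression" wins over "orc.compress"; both win over spark.sql.orc.compression.codec.
new OrcOptions(Map("compression" -> "ZLIB", "orc.compress" -> "SNAPPY"), sqlConf).compressionCodec
// == "ZLIB" (codec names are lower-cased before the table lookup, so casing does not matter)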
Example 6
Source File: LibSVMOptions.scala    From Spark-2.3.1   with Apache License 2.0
package org.apache.spark.ml.source.libsvm

import org.apache.spark.sql.catalyst.util.CaseInsensitiveMap


private[libsvm] class LibSVMOptions(@transient private val parameters: CaseInsensitiveMap[String])
  extends Serializable {

  import LibSVMOptions._

  def this(parameters: Map[String, String]) = this(CaseInsensitiveMap(parameters))

  // Number of features; kept only when it parses to a positive integer.
  val numFeatures = parameters.get(NUM_FEATURES).map(_.toInt).filter(_ > 0)

  val isSparse = parameters.getOrElse(VECTOR_TYPE, SPARSE_VECTOR_TYPE) match {
    case SPARSE_VECTOR_TYPE => true
    case DENSE_VECTOR_TYPE => false
    case o => throw new IllegalArgumentException(s"Invalid value `$o` for parameter " +
      s"`$VECTOR_TYPE`. Expected types are `sparse` and `dense`.")
  }
}

private[libsvm] object LibSVMOptions {
  val NUM_FEATURES = "numFeatures"
  val VECTOR_TYPE = "vectorType"
  val DENSE_VECTOR_TYPE = "dense"
  val SPARSE_VECTOR_TYPE = "sparse"
} 
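A hedged sketch of how these options resolve (LibSVMOptions is private[libsvm], so this would only compile inside that package):

val libsvmOptions = new LibSVMOptions(Map("NUMFEATURES" -> "780", "vectorType" -> "dense"))
// libsvmOptions.numFeatures == Some(780)  -- "NUMFEATURES" matches "numFeatures" case-insensitively
// libsvmOptions.isSparse    == false      -- "dense" selects DENSE_VECTOR_TYPE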
Example 7
Source File: FileStreamOptions.scala    From multi-tenancy-spark   with Apache License 2.0
package org.apache.spark.sql.execution.streaming

import scala.util.Try

import org.apache.spark.internal.Logging
import org.apache.spark.sql.catalyst.util.CaseInsensitiveMap
import org.apache.spark.util.Utils


class FileStreamOptions(parameters: CaseInsensitiveMap[String]) extends Logging {

  def this(parameters: Map[String, String]) = this(CaseInsensitiveMap(parameters))

  // Other options defined in the original file (maxFilesPerTrigger, maxFileAgeMs, ...)
  // are elided in this excerpt.

  // Whether to scan the latest files first; defaults to false.
  val latestFirst: Boolean = parameters.get("latestFirst").map { str =>
    try {
      str.toBoolean
    } catch {
      case _: IllegalArgumentException =>
        throw new IllegalArgumentException(
          s"Invalid value '$str' for option 'latestFirst', must be 'true' or 'false'")
    }
  }.getOrElse(false)
} 
Example 8
Source File: ParquetOptions.scala    From multi-tenancy-spark   with Apache License 2.0
package org.apache.spark.sql.execution.datasources.parquet

import org.apache.parquet.hadoop.metadata.CompressionCodecName

import org.apache.spark.sql.catalyst.util.CaseInsensitiveMap
import org.apache.spark.sql.internal.SQLConf


class ParquetOptions(
    @transient private val parameters: CaseInsensitiveMap[String],
    @transient private val sqlConf: SQLConf)
  extends Serializable {

  import ParquetOptions._

  def this(parameters: Map[String, String], sqlConf: SQLConf) =
    this(CaseInsensitiveMap(parameters), sqlConf)

  // The compression-codec option defined in the original file is elided in this excerpt.

  val mergeSchema: Boolean = parameters
    .get(MERGE_SCHEMA)
    .map(_.toBoolean)
    .getOrElse(sqlConf.isParquetSchemaMergingEnabled)
}


object ParquetOptions {
  val MERGE_SCHEMA = "mergeSchema"

  // The parquet compression short names
  private val shortParquetCompressionCodecNames = Map(
    "none" -> CompressionCodecName.UNCOMPRESSED,
    "uncompressed" -> CompressionCodecName.UNCOMPRESSED,
    "snappy" -> CompressionCodecName.SNAPPY,
    "gzip" -> CompressionCodecName.GZIP,
    "lzo" -> CompressionCodecName.LZO)
} 
Example 9
Source File: OrcOptions.scala    From multi-tenancy-spark   with Apache License 2.0
package org.apache.spark.sql.hive.orc

import org.apache.spark.sql.catalyst.util.CaseInsensitiveMap


private[orc] class OrcOptions(@transient private val parameters: CaseInsensitiveMap[String])
  extends Serializable {

  import OrcOptions._

  def this(parameters: Map[String, String]) = this(CaseInsensitiveMap(parameters))

  val compressionCodec: String = {
    // `orc.compress` is an ORC configuration, so we respect it as an option here, but
    // `compression` has higher precedence: if both are set, `compression` wins.
    val orcCompressionConf = parameters.get(OrcRelation.ORC_COMPRESSION)
    val codecName = parameters
      .get("compression")
      .orElse(orcCompressionConf)
      .getOrElse("snappy").toLowerCase
    if (!shortOrcCompressionCodecNames.contains(codecName)) {
      val availableCodecs = shortOrcCompressionCodecNames.keys.map(_.toLowerCase)
      throw new IllegalArgumentException(s"Codec [$codecName] " +
        s"is not available. Available codecs are ${availableCodecs.mkString(", ")}.")
    }
    shortOrcCompressionCodecNames(codecName)
  }
}

private[orc] object OrcOptions {
  // The ORC compression short names
  private val shortOrcCompressionCodecNames = Map(
    "none" -> "NONE",
    "uncompressed" -> "NONE",
    "snappy" -> "SNAPPY",
    "zlib" -> "ZLIB",
    "lzo" -> "LZO")
} 
Example 10
Source File: DeltaGenerateCommand.scala    From delta   with Apache License 2.0
package org.apache.spark.sql.delta.commands

import org.apache.spark.sql.delta.{DeltaErrors, DeltaLog, DeltaTableIdentifier}
import org.apache.spark.sql.delta.hooks.GenerateSymlinkManifest
import org.apache.hadoop.fs.Path

import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.sql.catalyst.TableIdentifier
import org.apache.spark.sql.catalyst.util.CaseInsensitiveMap
import org.apache.spark.sql.execution.command.RunnableCommand

case class DeltaGenerateCommand(modeName: String, tableId: TableIdentifier)
  extends RunnableCommand {

  import DeltaGenerateCommand._

  override def run(sparkSession: SparkSession): Seq[Row] = {
    if (!modeNameToGenerationFunc.contains(modeName)) {
      throw DeltaErrors.unsupportedGenerateModeException(modeName)
    }

    val tablePath = DeltaTableIdentifier(sparkSession, tableId) match {
      case Some(id) if id.path.isDefined =>
        new Path(id.path.get)
      case _ =>
        new Path(sparkSession.sessionState.catalog.getTableMetadata(tableId).location)
    }

    val deltaLog = DeltaLog.forTable(sparkSession, tablePath)
    if (deltaLog.snapshot.version < 0) {
      throw DeltaErrors.notADeltaTableException("GENERATE")
    }
    val generationFunc = modeNameToGenerationFunc(modeName)
    generationFunc(sparkSession, deltaLog)
    Seq.empty
  }
}

object DeltaGenerateCommand {
  val modeNameToGenerationFunc = CaseInsensitiveMap(
    Map[String, (SparkSession, DeltaLog) => Unit](
    "symlink_format_manifest" -> GenerateSymlinkManifest.generateFullManifest
  ))
} 
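Because the mode-name table is a CaseInsensitiveMap, the GENERATE mode can be written in any casing; a small illustrative check (not part of the delta source):

DeltaGenerateCommand.modeNameToGenerationFunc.contains("SYMLINK_FORMAT_MANIFEST")   // true
DeltaGenerateCommand.modeNameToGenerationFunc.contains("unknown_mode")              // false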
Example 11
Source File: OrcOptions.scala    From XSQL   with Apache License 2.0
package org.apache.spark.sql.execution.datasources.orc

import java.util.Locale

import org.apache.orc.OrcConf.COMPRESS

import org.apache.spark.sql.catalyst.util.CaseInsensitiveMap
import org.apache.spark.sql.internal.SQLConf


class OrcOptions(
    @transient private val parameters: CaseInsensitiveMap[String],
    @transient private val sqlConf: SQLConf)
  extends Serializable {

  import OrcOptions._

  def this(parameters: Map[String, String], sqlConf: SQLConf) =
    this(CaseInsensitiveMap(parameters), sqlConf)

  val compressionCodec: String = {
    // `compression`, `orc.compress`(i.e., OrcConf.COMPRESS), and `spark.sql.orc.compression.codec`
    // are in order of precedence from highest to lowest.
    val orcCompressionConf = parameters.get(COMPRESS.getAttribute)
    val codecName = parameters
      .get("compression")
      .orElse(orcCompressionConf)
      .getOrElse(sqlConf.orcCompressionCodec)
      .toLowerCase(Locale.ROOT)
    if (!shortOrcCompressionCodecNames.contains(codecName)) {
      val availableCodecs = shortOrcCompressionCodecNames.keys.map(_.toLowerCase(Locale.ROOT))
      throw new IllegalArgumentException(s"Codec [$codecName] " +
        s"is not available. Available codecs are ${availableCodecs.mkString(", ")}.")
    }
    shortOrcCompressionCodecNames(codecName)
  }
}

object OrcOptions {
  // The ORC compression short names
  private val shortOrcCompressionCodecNames = Map(
    "none" -> "NONE",
    "uncompressed" -> "NONE",
    "snappy" -> "SNAPPY",
    "zlib" -> "ZLIB",
    "lzo" -> "LZO")

  def getORCCompressionCodecName(name: String): String = shortOrcCompressionCodecNames(name)
} 
Example 12
Source File: DefaultSource.scala    From memsql-spark-connector   with Apache License 2.0
package com.memsql.spark

import org.apache.spark.TaskContext
import org.apache.spark.sql.catalyst.util.CaseInsensitiveMap
import org.apache.spark.metrics.source.MetricsHandler
import org.apache.spark.sql.sources.{
  BaseRelation,
  CreatableRelationProvider,
  DataSourceRegister,
  RelationProvider
}
import org.apache.spark.sql.{DataFrame, SQLContext, SaveMode}

object DefaultSource {
  val MEMSQL_SOURCE_NAME          = "com.memsql.spark"
  val MEMSQL_SOURCE_NAME_SHORT    = "memsql"
  val MEMSQL_GLOBAL_OPTION_PREFIX = "spark.datasource.memsql."
}

class DefaultSource
    extends RelationProvider
    with DataSourceRegister
    with CreatableRelationProvider
    with LazyLogging {

  override def shortName: String = DefaultSource.MEMSQL_SOURCE_NAME_SHORT

  private def includeGlobalParams(sqlContext: SQLContext,
                                  params: Map[String, String]): Map[String, String] =
    sqlContext.getAllConfs.foldLeft(params)({
      case (params, (k, v)) if k.startsWith(DefaultSource.MEMSQL_GLOBAL_OPTION_PREFIX) =>
        params + (k.stripPrefix(DefaultSource.MEMSQL_GLOBAL_OPTION_PREFIX) -> v)
      case (params, _) => params
    })

  override def createRelation(sqlContext: SQLContext,
                              parameters: Map[String, String]): BaseRelation = {
    val params  = CaseInsensitiveMap(includeGlobalParams(sqlContext, parameters))
    val options = MemsqlOptions(params)
    if (options.disablePushdown) {
      SQLPushdownRule.ensureRemoved(sqlContext.sparkSession)
      MemsqlReaderNoPushdown(MemsqlOptions.getQuery(params), options, sqlContext)
    } else {
      SQLPushdownRule.ensureInjected(sqlContext.sparkSession)
      MemsqlReader(MemsqlOptions.getQuery(params), Nil, options, sqlContext)
    }
  }

  override def createRelation(sqlContext: SQLContext,
                              mode: SaveMode,
                              parameters: Map[String, String],
                              data: DataFrame): BaseRelation = {
    val opts = CaseInsensitiveMap(includeGlobalParams(sqlContext, parameters))
    val conf = MemsqlOptions(opts)

    val table = MemsqlOptions
      .getTable(opts)
      .getOrElse(
        throw new IllegalArgumentException(
          s"To write a dataframe to MemSQL you must specify a table name via the '${MemsqlOptions.TABLE_NAME}' parameter"
        )
      )
    JdbcHelpers.prepareTableForWrite(conf, table, mode, data.schema)
    val isReferenceTable = JdbcHelpers.isReferenceTable(conf, table)
    val partitionWriterFactory =
      if (conf.onDuplicateKeySQL.isEmpty) {
        new LoadDataWriterFactory(table, conf)
      } else {
        new BatchInsertWriterFactory(table, conf)
      }

    val schema        = data.schema
    var totalRowCount = 0L
    data.foreachPartition(partition => {
      val writer = partitionWriterFactory.createDataWriter(schema,
                                                           TaskContext.getPartitionId(),
                                                           0,
                                                           isReferenceTable,
                                                           mode)
      try {
        partition.foreach(record => {
          writer.write(record)
          totalRowCount += 1
        })
        writer.commit()
        MetricsHandler.setRecordsWritten(totalRowCount)
      } catch {
        case e: Exception => {
          writer.abort()
          throw e
        }
      }
    })

    createRelation(sqlContext, parameters)
  }
} 
Example 13
Source File: FileStreamOptions.scala    From sparkoscope   with Apache License 2.0
package org.apache.spark.sql.execution.streaming

import scala.util.Try

import org.apache.spark.internal.Logging
import org.apache.spark.sql.catalyst.util.CaseInsensitiveMap
import org.apache.spark.util.Utils


class FileStreamOptions(parameters: CaseInsensitiveMap[String]) extends Logging {

  def this(parameters: Map[String, String]) = this(CaseInsensitiveMap(parameters))

  // Other options defined in the original file (maxFilesPerTrigger, maxFileAgeMs, ...)
  // are elided in this excerpt.

  // Whether to scan the latest files first; defaults to false.
  val latestFirst: Boolean = parameters.get("latestFirst").map { str =>
    try {
      str.toBoolean
    } catch {
      case _: IllegalArgumentException =>
        throw new IllegalArgumentException(
          s"Invalid value '$str' for option 'latestFirst', must be 'true' or 'false'")
    }
  }.getOrElse(false)
} 
Example 14
Source File: ParquetOptions.scala    From sparkoscope   with Apache License 2.0
package org.apache.spark.sql.execution.datasources.parquet

import org.apache.parquet.hadoop.metadata.CompressionCodecName

import org.apache.spark.sql.catalyst.util.CaseInsensitiveMap
import org.apache.spark.sql.internal.SQLConf


class ParquetOptions(
    @transient private val parameters: CaseInsensitiveMap[String],
    @transient private val sqlConf: SQLConf)
  extends Serializable {

  import ParquetOptions._

  def this(parameters: Map[String, String], sqlConf: SQLConf) =
    this(CaseInsensitiveMap(parameters), sqlConf)

  // The compression-codec option defined in the original file is elided in this excerpt.

  val mergeSchema: Boolean = parameters
    .get(MERGE_SCHEMA)
    .map(_.toBoolean)
    .getOrElse(sqlConf.isParquetSchemaMergingEnabled)
}


object ParquetOptions {
  val MERGE_SCHEMA = "mergeSchema"

  // The parquet compression short names
  private val shortParquetCompressionCodecNames = Map(
    "none" -> CompressionCodecName.UNCOMPRESSED,
    "uncompressed" -> CompressionCodecName.UNCOMPRESSED,
    "snappy" -> CompressionCodecName.SNAPPY,
    "gzip" -> CompressionCodecName.GZIP,
    "lzo" -> CompressionCodecName.LZO)
} 
Example 15
Source File: OrcOptions.scala    From sparkoscope   with Apache License 2.0
package org.apache.spark.sql.hive.orc

import org.apache.spark.sql.catalyst.util.CaseInsensitiveMap


private[orc] class OrcOptions(@transient private val parameters: CaseInsensitiveMap[String])
  extends Serializable {

  import OrcOptions._

  def this(parameters: Map[String, String]) = this(CaseInsensitiveMap(parameters))

  val compressionCodec: String = {
    // `orc.compress` is an ORC configuration, so we respect it as an option here, but
    // `compression` has higher precedence: if both are set, `compression` wins.
    val orcCompressionConf = parameters.get(OrcRelation.ORC_COMPRESSION)
    val codecName = parameters
      .get("compression")
      .orElse(orcCompressionConf)
      .getOrElse("snappy").toLowerCase
    if (!shortOrcCompressionCodecNames.contains(codecName)) {
      val availableCodecs = shortOrcCompressionCodecNames.keys.map(_.toLowerCase)
      throw new IllegalArgumentException(s"Codec [$codecName] " +
        s"is not available. Available codecs are ${availableCodecs.mkString(", ")}.")
    }
    shortOrcCompressionCodecNames(codecName)
  }
}

private[orc] object OrcOptions {
  // The ORC compression short names
  private val shortOrcCompressionCodecNames = Map(
    "none" -> "NONE",
    "uncompressed" -> "NONE",
    "snappy" -> "SNAPPY",
    "zlib" -> "ZLIB",
    "lzo" -> "LZO")
} 
Example 16
Source File: ArrowOptions.scala    From OAP   with Apache License 2.0
package com.intel.oap.spark.sql.execution.datasources.v2.arrow

import org.apache.spark.sql.catalyst.util.CaseInsensitiveMap

class ArrowOptions(@transient private val parameters: CaseInsensitiveMap[String])
  extends Serializable {

  def this(parameters: Map[String, String]) = this(CaseInsensitiveMap(parameters))

  val originalFormat = parameters
    .get(ArrowOptions.KEY_ORIGINAL_FORMAT)
    .getOrElse(ArrowOptions.DEFAULT_ORIGINAL_FORMAT)
  val filesystem = parameters
    .get(ArrowOptions.KEY_FILESYSTEM)
    .getOrElse(ArrowOptions.DEFAULT_FILESYSTEM)
}

object ArrowOptions {
  val KEY_ORIGINAL_FORMAT = "originalFormat"
  val DEFAULT_ORIGINAL_FORMAT = "parquet"
  val KEY_FILESYSTEM = "filesystem"
  val DEFAULT_FILESYSTEM = "hdfs"
} 
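A hedged construction sketch with illustrative values (not from the OAP source), showing the case-insensitive lookup and the defaults:

val arrowOptions = new ArrowOptions(Map("originalformat" -> "csv"))
// arrowOptions.originalFormat == "csv"   -- "originalformat" matches KEY_ORIGINAL_FORMAT
// arrowOptions.filesystem     == "hdfs"  -- falls back to DEFAULT_FILESYSTEM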
Example 17
Source File: FileStreamOptions.scala    From XSQL   with Apache License 2.0
package org.apache.spark.sql.execution.streaming

import scala.util.Try

import org.apache.spark.internal.Logging
import org.apache.spark.sql.catalyst.util.CaseInsensitiveMap
import org.apache.spark.util.Utils


class FileStreamOptions(parameters: CaseInsensitiveMap[String]) extends Logging {

  def this(parameters: Map[String, String]) = this(CaseInsensitiveMap(parameters))

  // Other options defined in the original file (maxFilesPerTrigger, latestFirst,
  // maxFileAgeMs, ...) are elided in this excerpt.

  val fileNameOnly: Boolean = withBooleanParameter("fileNameOnly", false)

  private def withBooleanParameter(name: String, default: Boolean) = {
    parameters.get(name).map { str =>
      try {
        str.toBoolean
      } catch {
        case _: IllegalArgumentException =>
          throw new IllegalArgumentException(
            s"Invalid value '$str' for option '$name', must be 'true' or 'false'")
      }
    }.getOrElse(default)
  }
} 
Example 18
Source File: ParquetOptions.scala    From XSQL   with Apache License 2.0
package org.apache.spark.sql.execution.datasources.parquet

import java.util.Locale

import org.apache.parquet.hadoop.ParquetOutputFormat
import org.apache.parquet.hadoop.metadata.CompressionCodecName

import org.apache.spark.sql.catalyst.util.CaseInsensitiveMap
import org.apache.spark.sql.internal.SQLConf


class ParquetOptions(
    @transient private val parameters: CaseInsensitiveMap[String],
    @transient private val sqlConf: SQLConf)
  extends Serializable {

  import ParquetOptions._

  def this(parameters: Map[String, String], sqlConf: SQLConf) =
    this(CaseInsensitiveMap(parameters), sqlConf)

  // The compression-codec option defined in the original file is elided in this excerpt.

  val mergeSchema: Boolean = parameters
    .get(MERGE_SCHEMA)
    .map(_.toBoolean)
    .getOrElse(sqlConf.isParquetSchemaMergingEnabled)
}


object ParquetOptions {
  val MERGE_SCHEMA = "mergeSchema"

  // The parquet compression short names
  private val shortParquetCompressionCodecNames = Map(
    "none" -> CompressionCodecName.UNCOMPRESSED,
    "uncompressed" -> CompressionCodecName.UNCOMPRESSED,
    "snappy" -> CompressionCodecName.SNAPPY,
    "gzip" -> CompressionCodecName.GZIP,
    "lzo" -> CompressionCodecName.LZO,
    "lz4" -> CompressionCodecName.LZ4,
    "brotli" -> CompressionCodecName.BROTLI,
    "zstd" -> CompressionCodecName.ZSTD)

  def getParquetCompressionCodecName(name: String): String = {
    shortParquetCompressionCodecNames(name).name()
  }
} 
Example 19
Source File: TextOptions.scala    From XSQL   with Apache License 2.0
package org.apache.spark.sql.execution.datasources.text

import java.nio.charset.{Charset, StandardCharsets}

import org.apache.spark.sql.catalyst.util.{CaseInsensitiveMap, CompressionCodecs}


private[text] class TextOptions(@transient private val parameters: CaseInsensitiveMap[String])
  extends Serializable {

  import TextOptions._

  def this(parameters: Map[String, String]) = this(CaseInsensitiveMap(parameters))

  // Compression codec to use.
  val compression = parameters.get(COMPRESSION).map(CompressionCodecs.getCodecClassName)

  val wholeText = parameters.getOrElse(WHOLETEXT, "false").toBoolean

  val encoding: Option[String] = parameters.get(ENCODING)

  val lineSeparator: Option[String] = parameters.get(LINE_SEPARATOR).map { lineSep =>
    require(lineSep.nonEmpty, s"'$LINE_SEPARATOR' cannot be an empty string.")

    lineSep
  }

  // Note that the option 'lineSep' uses a different default value in read and write.
  val lineSeparatorInRead: Option[Array[Byte]] = lineSeparator.map { lineSep =>
    lineSep.getBytes(encoding.map(Charset.forName(_)).getOrElse(StandardCharsets.UTF_8))
  }
  val lineSeparatorInWrite: Array[Byte] =
    lineSeparatorInRead.getOrElse("\n".getBytes(StandardCharsets.UTF_8))
}

private[datasources] object TextOptions {
  val COMPRESSION = "compression"
  val WHOLETEXT = "wholetext"
  val ENCODING = "encoding"
  val LINE_SEPARATOR = "lineSep"
} 
Example 20
Source File: JSONOptions.scala    From multi-tenancy-spark   with Apache License 2.0
package org.apache.spark.sql.catalyst.json

import java.util.Locale

import com.fasterxml.jackson.core.{JsonFactory, JsonParser}
import org.apache.commons.lang3.time.FastDateFormat

import org.apache.spark.internal.Logging
import org.apache.spark.sql.catalyst.util.{CaseInsensitiveMap, CompressionCodecs, ParseModes}


private[sql] class JSONOptions(
    @transient private val parameters: CaseInsensitiveMap[String])
  extends Logging with Serializable {

  def this(parameters: Map[String, String]) = this(CaseInsensitiveMap(parameters))

  // The option-backed fields referenced below (allowComments, allowSingleQuotes, ...)
  // are defined earlier in the original file and elided in this excerpt.

  // Sets config options on a Jackson JsonFactory.
  def setJacksonOptions(factory: JsonFactory): Unit = {
    factory.configure(JsonParser.Feature.ALLOW_COMMENTS, allowComments)
    factory.configure(JsonParser.Feature.ALLOW_UNQUOTED_FIELD_NAMES, allowUnquotedFieldNames)
    factory.configure(JsonParser.Feature.ALLOW_SINGLE_QUOTES, allowSingleQuotes)
    factory.configure(JsonParser.Feature.ALLOW_NUMERIC_LEADING_ZEROS, allowNumericLeadingZeros)
    factory.configure(JsonParser.Feature.ALLOW_NON_NUMERIC_NUMBERS, allowNonNumericNumbers)
    factory.configure(JsonParser.Feature.ALLOW_BACKSLASH_ESCAPING_ANY_CHARACTER,
      allowBackslashEscapingAnyCharacter)
  }
} 
Example 21
Source File: JSONOptions.scala    From sparkoscope   with Apache License 2.0
package org.apache.spark.sql.catalyst.json

import java.util.Locale

import com.fasterxml.jackson.core.{JsonFactory, JsonParser}
import org.apache.commons.lang3.time.FastDateFormat

import org.apache.spark.internal.Logging
import org.apache.spark.sql.catalyst.util.{CaseInsensitiveMap, CompressionCodecs, ParseModes}


private[sql] class JSONOptions(
    @transient private val parameters: CaseInsensitiveMap[String])
  extends Logging with Serializable {

  def this(parameters: Map[String, String]) = this(CaseInsensitiveMap(parameters))

  // The option-backed fields referenced below (allowComments, allowSingleQuotes, ...)
  // are defined earlier in the original file and elided in this excerpt.

  // Sets config options on a Jackson JsonFactory.
  def setJacksonOptions(factory: JsonFactory): Unit = {
    factory.configure(JsonParser.Feature.ALLOW_COMMENTS, allowComments)
    factory.configure(JsonParser.Feature.ALLOW_UNQUOTED_FIELD_NAMES, allowUnquotedFieldNames)
    factory.configure(JsonParser.Feature.ALLOW_SINGLE_QUOTES, allowSingleQuotes)
    factory.configure(JsonParser.Feature.ALLOW_NUMERIC_LEADING_ZEROS, allowNumericLeadingZeros)
    factory.configure(JsonParser.Feature.ALLOW_NON_NUMERIC_NUMBERS, allowNonNumericNumbers)
    factory.configure(JsonParser.Feature.ALLOW_BACKSLASH_ESCAPING_ANY_CHARACTER,
      allowBackslashEscapingAnyCharacter)
  }
}