org.apache.spark.internal.config.ConfigEntry Scala Examples

The following examples show how to use org.apache.spark.internal.config.ConfigEntry. They are taken from open-source projects; the source file, originating project, and license are noted above each example.
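Before the project examples, here is a minimal, self-contained sketch of the usual ConfigEntry life cycle: declare an entry once with ConfigBuilder, then read and write it through SparkConf in a typed way. The package, entry name, and default below are hypothetical, and note that ConfigBuilder, ConfigEntry, and the typed SparkConf get/set overloads are private[spark], so code like this has to live under the org.apache.spark package (as all the examples on this page do).

// Hypothetical package, placed under org.apache.spark so the private[spark] config API is visible.
package org.apache.spark.example

import org.apache.spark.SparkConf
import org.apache.spark.internal.config.{ConfigBuilder, ConfigEntry}

object ExampleConf {
  // Hypothetical entry: a typed Long config with a documented default.
  val EXAMPLE_TIMEOUT_MS: ConfigEntry[Long] =
    ConfigBuilder("spark.example.timeoutMs")
      .doc("How long to wait before giving up, in milliseconds")
      .longConf
      .createWithDefault(3000L)
}

object ExampleConfUsage {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
    println(conf.get(ExampleConf.EXAMPLE_TIMEOUT_MS)) // 3000 (the declared default)

    conf.set(ExampleConf.EXAMPLE_TIMEOUT_MS, 500L)    // typed setter for the same entry
    println(conf.get(ExampleConf.EXAMPLE_TIMEOUT_MS)) // 500
  }
}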
Example 1
Source File: ReadOnlySQLConf.scala    From XSQL   with Apache License 2.0
package org.apache.spark.sql.internal

import java.util.{Map => JMap}

import org.apache.spark.TaskContext
import org.apache.spark.internal.config.{ConfigEntry, ConfigProvider, ConfigReader}


// A SQLConf view backed by the TaskContext's local properties; every mutating
// operation throws UnsupportedOperationException.
class ReadOnlySQLConf(context: TaskContext) extends SQLConf {

  @transient override val settings: JMap[String, String] = {
    context.getLocalProperties.asInstanceOf[JMap[String, String]]
  }

  @transient override protected val reader: ConfigReader = {
    new ConfigReader(new TaskContextConfigProvider(context))
  }

  override protected def setConfWithCheck(key: String, value: String): Unit = {
    throw new UnsupportedOperationException("Cannot mutate ReadOnlySQLConf.")
  }

  override def unsetConf(key: String): Unit = {
    throw new UnsupportedOperationException("Cannot mutate ReadOnlySQLConf.")
  }

  override def unsetConf(entry: ConfigEntry[_]): Unit = {
    throw new UnsupportedOperationException("Cannot mutate ReadOnlySQLConf.")
  }

  override def clear(): Unit = {
    throw new UnsupportedOperationException("Cannot mutate ReadOnlySQLConf.")
  }

  override def clone(): SQLConf = {
    throw new UnsupportedOperationException("Cannot clone/copy ReadOnlySQLConf.")
  }

  override def copy(entries: (ConfigEntry[_], Any)*): SQLConf = {
    throw new UnsupportedOperationException("Cannot clone/copy ReadOnlySQLConf.")
  }
}

// Resolves configuration keys by looking them up in the TaskContext's local properties.
class TaskContextConfigProvider(context: TaskContext) extends ConfigProvider {
  override def get(key: String): Option[String] = Option(context.getLocalProperty(key))
} 
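As a rough sketch of the ConfigProvider/ConfigReader pattern used above, the following hypothetical provider is backed by a plain Scala Map instead of TaskContext local properties. The package and names are invented for illustration; since these classes are private[spark], the sketch again sits under org.apache.spark.

// Hypothetical package and provider, mirroring TaskContextConfigProvider above.
package org.apache.spark.example

import org.apache.spark.internal.config.{ConfigProvider, ConfigReader}

class MapBackedConfigProvider(settings: Map[String, String]) extends ConfigProvider {
  override def get(key: String): Option[String] = settings.get(key)
}

object MapBackedReaderDemo {
  def main(args: Array[String]): Unit = {
    val reader = new ConfigReader(new MapBackedConfigProvider(Map("spark.example.answer" -> "42")))
    println(reader.get("spark.example.answer"))  // Some("42")
    println(reader.get("spark.example.missing")) // None
  }
}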
Example 2
Source File: RemoteShuffleConf.scala    From OAP   with Apache License 2.0
package org.apache.spark.shuffle.remote

import org.apache.spark.internal.config.{ConfigBuilder, ConfigEntry}

object RemoteShuffleConf {

  val STORAGE_MASTER_URI: ConfigEntry[String] =
    ConfigBuilder("spark.shuffle.remote.storageMasterUri")
      .doc("Contact this storage master while persisting shuffle files")
      .stringConf
      .createWithDefault("hdfs://localhost:9001")

  val STORAGE_HDFS_MASTER_UI_PORT: ConfigEntry[String] =
    ConfigBuilder("spark.shuffle.remote.hdfs.storageMasterUIPort")
      .doc("Contact this UI port to retrieve HDFS configurations")
      .stringConf
      .createWithDefault("50070")

  val SHUFFLE_FILES_ROOT_DIRECTORY: ConfigEntry[String] =
    ConfigBuilder("spark.shuffle.remote.filesRootDirectory")
      .doc("Use this as the root directory for shuffle files")
      .stringConf
      .createWithDefault("/shuffle")

  val DFS_REPLICATION: ConfigEntry[Int] =
    ConfigBuilder("spark.shuffle.remote.hdfs.replication")
      .doc("The default replication of remote storage system, will override dfs.replication" +
        " when HDFS is used as shuffling storage")
      .intConf
      .createWithDefault(3)

  val REMOTE_OPTIMIZED_SHUFFLE_ENABLED: ConfigEntry[Boolean] =
    ConfigBuilder("spark.shuffle.remote.optimizedPathEnabled")
      .doc("Enable using unsafe-optimized shuffle writer")
      .internal()
      .booleanConf
      .createWithDefault(true)

  val REMOTE_BYPASS_MERGE_THRESHOLD: ConfigEntry[Int] =
    ConfigBuilder("spark.shuffle.remote.bypassMergeThreshold")
      .doc("Remote shuffle manager uses this threshold to decide using bypass-merge(hash-based)" +
        "shuffle or not, a new configuration is introduced(and it's -1 by default) because we" +
        " want to explicitly make disabling hash-based shuffle writer as the default behavior." +
        " When memory is relatively sufficient, using sort-based shuffle writer in remote shuffle" +
        " is often more efficient than the hash-based one. Because the bypass-merge shuffle " +
        "writer proceeds I/O of 3x total shuffle size: 1 time for read I/O and 2 times for write" +
        " I/Os, and this can be an even larger overhead under remote shuffle, the 3x shuffle size" +
        " is gone through network, arriving at remote storage system.")
      .intConf
      .createWithDefault(-1)

  val REMOTE_INDEX_CACHE_SIZE: ConfigEntry[String] =
    ConfigBuilder("spark.shuffle.remote.index.cache.size")
      .doc("This index file cache resides in each executor. If it's a positive value, index " +
        "cache will be turned on: instead of reading index files directly from remote storage" +
        ", a reducer will fetch the index files from the executors that write them through" +
        " network. And those executors will return the index files kept in cache. (read them" +
        "from storage if needed)")
      .stringConf
      .createWithDefault("0")

  val NUM_TRANSFER_SERVICE_THREADS: ConfigEntry[Int] =
    ConfigBuilder("spark.shuffle.remote.numIndexReadThreads")
      .doc("The maximum number of server/client threads used in RemoteShuffleTransferService for" +
        "index files transferring")
      .intConf
      .createWithDefault(3)

  val NUM_CONCURRENT_FETCH: ConfigEntry[Int] =
    ConfigBuilder("spark.shuffle.remote.numReadThreads")
      .doc("The maximum number of concurrent reading threads fetching shuffle data blocks")
      .intConf
      .createWithDefault(Runtime.getRuntime.availableProcessors())

  val REUSE_FILE_HANDLE: ConfigEntry[Boolean] =
    ConfigBuilder("spark.shuffle.remote.reuseFileHandle")
      .doc("By switching on this feature, the file handles returned by Filesystem open operations" +
        " will be cached/reused inside an executor(across different rounds of reduce tasks)," +
        " eliminating open overhead. This should improve the reduce stage performance only when" +
        " file open operations occupy majority of the time, e.g. There is a large number of" +
        " shuffle blocks, each reading a fairly small block of data, and there is no other" +
        " compute in the reduce stage.")
      .booleanConf
      .createWithDefault(false)

  val DATA_FETCH_EAGER_REQUIREMENT: ConfigEntry[Boolean] =
    ConfigBuilder("spark.shuffle.remote.eagerRequirementDataFetch")
      .doc("With eager requirement = false, a shuffle block will be counted ready and served for" +
        " compute until all content of the block is put in Spark's local memory. With eager " +
        "requirement = true, a shuffle block will be served to later compute after the bytes " +
        "required is fetched and put in memory")
      .booleanConf
      .createWithDefault(false)

} 
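As a hedged usage sketch (the RemoteShuffleConfUsage object below is mine, not part of OAP), entries like these are read back through SparkConf, which applies the declared defaults and parses values into each entry's type:

package org.apache.spark.shuffle.remote

import org.apache.spark.SparkConf

// Hypothetical demo object, assuming the RemoteShuffleConf object above is on the classpath.
object RemoteShuffleConfUsage {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().set("spark.shuffle.remote.hdfs.replication", "2")

    // Unset keys fall back to the declared default ("hdfs://localhost:9001").
    println(conf.get(RemoteShuffleConf.STORAGE_MASTER_URI))
    // Explicitly set keys are parsed into the entry's type (Int here).
    println(conf.get(RemoteShuffleConf.DFS_REPLICATION)) // 2
  }
}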
Example 3
Source File: IndexConf.scala    From parquet-index   with Apache License 2.0
package org.apache.spark.sql.internal

import org.apache.spark.internal.config.ConfigEntry
import org.apache.spark.sql.SparkSession

object IndexConf {
  import SQLConf.buildConf

  val METASTORE_LOCATION = buildConf("spark.sql.index.metastore").
    doc("Metastore location or root directory to store index information, will be created " +
      "if path does not exist").
    stringConf.
    createWithDefault("")

  val CREATE_IF_NOT_EXISTS = buildConf("spark.sql.index.createIfNotExists").
    doc("When set to true, creates index if one does not exist in metastore for the table").
    booleanConf.
    createWithDefault(false)

  val NUM_PARTITIONS = buildConf("spark.sql.index.partitions").
    doc("When creating index uses this number of partitions. If value is non-positive or not " +
      "provided then uses `sc.defaultParallelism * 3` or `spark.sql.shuffle.partitions` " +
      "configuration value, whichever is smaller").
    intConf.
    createWithDefault(0)

  val PARQUET_FILTER_STATISTICS_ENABLED =
    buildConf("spark.sql.index.parquet.filter.enabled").
    doc("When set to true, writes filter statistics for indexed columns when creating table " +
      "index, otherwise only min/max statistics are used. Filter statistics are always used " +
      "during filtering stage, if applicable").
    booleanConf.
    createWithDefault(true)

  val PARQUET_FILTER_STATISTICS_TYPE = buildConf("spark.sql.index.parquet.filter.type").
    doc("When filter statistics enabled, selects type of statistics to use when creating index. " +
      "Available options are `bloom`, `dict`").
    stringConf.
    createWithDefault("bloom")

  val PARQUET_FILTER_STATISTICS_EAGER_LOADING =
    buildConf("spark.sql.index.parquet.filter.eagerLoading").
    doc("When set to true, read and load all filter statistics in memory the first time catalog " +
      "is resolved, otherwise load them lazily as needed when evaluating predicate. " +
      "Eager loading removes IO of reading filter data from disk, but requires extra memory").
    booleanConf.
    createWithDefault(false)

  
  // The truncated snippet leaves `sqlConf` undefined; here the entry is unset on the active
  // session's SQLConf (SQLConf.get) so the example compiles. In the full source this method
  // sits in a wrapper that holds its own SQLConf handle.
  def unsetConf(entry: ConfigEntry[_]): Unit = {
    SQLConf.get.unsetConf(entry)
  }
}
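For completeness, here is a hedged usage sketch (the IndexConfUsage object and the local-mode session are mine, not part of parquet-index): since these entries are ordinary SQLConf entries, they can be set, read, and unset through the session's SQLConf.

package org.apache.spark.sql.internal

import org.apache.spark.sql.SparkSession

// Hypothetical demo object, assuming the IndexConf object above is on the classpath.
object IndexConfUsage {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[1]").appName("index-conf-demo").getOrCreate()
    val sqlConf = spark.sessionState.conf

    sqlConf.setConf(IndexConf.NUM_PARTITIONS, 200)
    println(sqlConf.getConf(IndexConf.NUM_PARTITIONS))     // 200
    println(sqlConf.getConf(IndexConf.METASTORE_LOCATION)) // "" (the default)

    sqlConf.unsetConf(IndexConf.NUM_PARTITIONS)
    spark.stop()
  }
}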