org.apache.spark.sql.catalyst.optimizer.Optimizer Scala Examples

The following examples show how to use org.apache.spark.sql.catalyst.optimizer.Optimizer. Each example is taken from an open-source project; the source file and originating project are noted above the listing.
Example 1
Source File: SparkOptimizer.scala    From drizzle-spark   with Apache License 2.0 (this file also appears verbatim in the sparkoscope and multi-tenancy-spark projects)
package org.apache.spark.sql.execution

import org.apache.spark.sql.ExperimentalMethods
import org.apache.spark.sql.catalyst.catalog.SessionCatalog
import org.apache.spark.sql.catalyst.optimizer.Optimizer
import org.apache.spark.sql.execution.datasources.PruneFileSourcePartitions
import org.apache.spark.sql.execution.python.ExtractPythonUDFFromAggregate
import org.apache.spark.sql.internal.SQLConf

class SparkOptimizer(
    catalog: SessionCatalog,
    conf: SQLConf,
    experimentalMethods: ExperimentalMethods)
  extends Optimizer(catalog, conf) {

  override def batches: Seq[Batch] = super.batches :+
    Batch("Optimize Metadata Only Query", Once, OptimizeMetadataOnlyQuery(catalog, conf)) :+
    Batch("Extract Python UDF from Aggregate", Once, ExtractPythonUDFFromAggregate) :+
    Batch("Prune File Source Table Partitions", Once, PruneFileSourcePartitions) :+
    Batch("User Provided Optimizers", fixedPoint, experimentalMethods.extraOptimizations: _*)
} 
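The "User Provided Optimizers" batch above executes whatever rules an application registers through ExperimentalMethods. A minimal sketch of that hook, assuming a local SparkSession and a hypothetical no-op rule named NoopOptimization:

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
import org.apache.spark.sql.catalyst.rules.Rule

// Hypothetical rule for illustration: it rewrites nothing and returns the plan as-is.
object NoopOptimization extends Rule[LogicalPlan] {
  override def apply(plan: LogicalPlan): LogicalPlan = plan
}

val spark = SparkSession.builder().master("local[*]").appName("extra-opts").getOrCreate()

// Rules registered here end up in the "User Provided Optimizers" batch,
// which runs after the built-in batches.
spark.experimental.extraOptimizations = Seq(NoopOptimization)

Because that batch runs to fixedPoint, a registered rule should converge, i.e. eventually return its input unchanged.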
Example 2
Source File: SparkOptimizer.scala    From XSQL   with Apache License 2.0
package org.apache.spark.sql.execution

import org.apache.spark.sql.ExperimentalMethods
import org.apache.spark.sql.catalyst.catalog.SessionCatalog
import org.apache.spark.sql.catalyst.optimizer.Optimizer
import org.apache.spark.sql.execution.datasources.PruneFileSourcePartitions
import org.apache.spark.sql.execution.datasources.parquet.ParquetSchemaPruning
import org.apache.spark.sql.execution.python.{ExtractPythonUDFFromAggregate, ExtractPythonUDFs}

class SparkOptimizer(
    catalog: SessionCatalog,
    experimentalMethods: ExperimentalMethods)
  extends Optimizer(catalog) {

  override def defaultBatches: Seq[Batch] = (preOptimizationBatches ++ super.defaultBatches :+
    Batch("Optimize Metadata Only Query", Once, OptimizeMetadataOnlyQuery(catalog)) :+
    Batch("Extract Python UDFs", Once,
      Seq(ExtractPythonUDFFromAggregate, ExtractPythonUDFs): _*) :+
    Batch("Prune File Source Table Partitions", Once, PruneFileSourcePartitions) :+
    Batch("Parquet Schema Pruning", Once, ParquetSchemaPruning)) ++
    postHocOptimizationBatches :+
    Batch("User Provided Optimizers", fixedPoint, experimentalMethods.extraOptimizations: _*)

  override def nonExcludableRules: Seq[String] =
    super.nonExcludableRules :+ ExtractPythonUDFFromAggregate.ruleName

  
  /**
   * Optimization batches that run after the built-in batches but before the
   * "User Provided Optimizers" batch above; subclasses may override this hook.
   */
  def postHocOptimizationBatches: Seq[Batch] = Nil
} 
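Because the batches above are assembled from defaultBatches, individual built-in rules can be disabled at runtime through the spark.sql.optimizer.excludedRules configuration, except those the optimizer lists in nonExcludableRules. A minimal sketch, assuming an existing SparkSession named spark:

// PushDownPredicate is an ordinary built-in rule, so excluding it takes effect.
// ExtractPythonUDFFromAggregate, by contrast, is added to nonExcludableRules in the
// override above; asking to exclude it is ignored with a warning.
spark.conf.set(
  "spark.sql.optimizer.excludedRules",
  "org.apache.spark.sql.catalyst.optimizer.PushDownPredicate")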
Example 3
Source File: ExtendableHiveContext.scala    From HANAVora-Extensions   with Apache License 2.0
package org.apache.spark.sql.hive

import org.apache.spark.SparkContext
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.sql._
import org.apache.spark.sql.catalyst.ParserDialect
import org.apache.spark.sql.catalyst.analysis.{Analyzer, _}
import org.apache.spark.sql.catalyst.optimizer.Optimizer
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
import org.apache.spark.sql.execution.datasources._
import org.apache.spark.sql.execution.ui.SQLListener
import org.apache.spark.sql.execution.{CacheManager, ExtractPythonUDFs}
import org.apache.spark.sql.extension._
import org.apache.spark.sql.hive.client.{ClientInterface, ClientWrapper}
import org.apache.spark.sql.sources.commands.hive.HiveEmulationCatalog


// The original class declaration is elided in this excerpt. A minimal reconstruction
// follows; `SQLContextExtension` is an assumed name for the project's extension trait
// that supplies resolutionRules, extendedCheckRules, the optimizerEarlyBatches /
// optimizerMainBatchRules / optimizerPostBatches hooks, and strategies(planner).
@DeveloperApi
class ExtendableHiveContext(sc: SparkContext)
  extends HiveContext(sc) with SQLContextExtension {
  self =>

  @transient
  override protected[sql] lazy val analyzer: Analyzer =
    new Analyzer(catalog, functionRegistry, conf) {
      override val extendedResolutionRules = resolutionRules(this) ++
        (catalog.ParquetConversions ::
          catalog.CreateTables ::
          catalog.PreInsertionCasts ::
          ExtractPythonUDFs ::
          ResolveHiveWindowFunction ::
          PreInsertCastAndRename ::
          Nil)

      override val extendedCheckRules = ExtendableHiveContext.this.extendedCheckRules(this)
    }

  @transient
  override protected[sql] lazy val optimizer: Optimizer =
    OptimizerFactory.produce(
      earlyBatches = optimizerEarlyBatches,
      mainBatchRules = optimizerMainBatchRules,
      postBatches = optimizerPostBatches
    )

  @transient
  override protected[sql] val planner: SparkPlanner with HiveStrategies =
    new SparkPlanner with HiveStrategies with ExtendedPlanner {
      def baseStrategies(hiveContext: HiveContext): Seq[Strategy] =
        Seq(
          DataSourceStrategy,
          HiveCommandStrategy(self),
          HiveDDLStrategy,
          DDLStrategy,
          TakeOrderedAndProject,
          InMemoryScans,
          HiveTableScans,
          DataSinks,
          Scripts,
          Aggregation,
          LeftSemiJoin,
          EquiJoinSelection,
          BasicOperators,
          BroadcastNestedLoop,
          CartesianProduct,
          DefaultJoin
        )

      override def strategies: Seq[Strategy] =
        self.strategies(this) ++
          experimental.extraStrategies ++
          baseStrategies(self)

      override val hiveContext = self
    }
} 
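The planner override above splices experimental.extraStrategies between the context's extension strategies and the base ones, so applications can contribute physical-planning strategies without subclassing. A minimal sketch of that hook, shown against the SparkSession API for brevity (the 1.6-era contexts expose the same experimental.extraStrategies field), with a hypothetical strategy that plans nothing:

import org.apache.spark.sql.{SparkSession, Strategy}
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
import org.apache.spark.sql.execution.SparkPlan

// Hypothetical strategy for illustration: returning Nil defers to the other strategies.
object DeferringStrategy extends Strategy {
  override def apply(plan: LogicalPlan): Seq[SparkPlan] = Nil
}

val spark = SparkSession.builder().master("local[*]").appName("extra-strategies").getOrCreate()
spark.experimental.extraStrategies = Seq(DeferringStrategy)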
Example 4
Source File: OptimizerFactory.scala    From HANAVora-Extensions   with Apache License 2.0
package org.apache.spark.sql.extension

import org.apache.spark.sql.catalyst.optimizer.Optimizer
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
import org.apache.spark.sql.catalyst.rules.Rule



// The enclosing object declaration and the batch descriptor are elided in this
// excerpt. A minimal reconstruction follows; the ExtendableOptimizerBatch fields
// (name, iterations, rules) are inferred from its call sites in
// ExtendableOptimizerSuite (Example 6).
object OptimizerFactory {

  case class ExtendableOptimizerBatch(
      name: String,
      iterations: Int,
      rules: Seq[Rule[LogicalPlan]])

  def produce(earlyBatches: Seq[ExtendableOptimizerBatch] = Nil,
              mainBatchRules: Seq[Rule[LogicalPlan]] = Nil,
              postBatches: Seq[ExtendableOptimizerBatch] = Nil): Optimizer = {
    // Pick the extendable-optimizer variant that matches the running Spark version.
    if (org.apache.spark.SPARK_VERSION.contains("1.6.2")) {
      new ExtendableOptimizer162(earlyBatches, mainBatchRules, postBatches)
    } else {
      new ExtendableOptimizer161(earlyBatches, mainBatchRules, postBatches)
    }
  }
} 
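Anything that extends Rule[LogicalPlan] can be handed to produce, either as a main-batch rule or wrapped in an ExtendableOptimizerBatch. A minimal sketch with a hypothetical rule that drops filters whose predicate is the literal true:

import org.apache.spark.sql.catalyst.expressions.Literal
import org.apache.spark.sql.catalyst.plans.logical.{Filter, LogicalPlan}
import org.apache.spark.sql.catalyst.rules.Rule

// Hypothetical rule for illustration: WHERE true filters nothing, so drop the node.
object RemoveTrivialFilters extends Rule[LogicalPlan] {
  override def apply(plan: LogicalPlan): LogicalPlan = plan transform {
    case Filter(Literal(true, _), child) => child
  }
}

val optimizer = OptimizerFactory.produce(mainBatchRules = RemoveTrivialFilters :: Nil)

Here transform walks the plan top-down, applying the partial function wherever it matches.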
Example 5
Source File: ExtendableSQLContext.scala    From HANAVora-Extensions   with Apache License 2.0
package org.apache.spark.sql.extension

import org.apache.spark.SparkContext
import org.apache.spark.sql._
import org.apache.spark.sql.catalyst.ParserDialect
import org.apache.spark.sql.catalyst.analysis._
import org.apache.spark.sql.catalyst.optimizer.Optimizer
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
import org.apache.spark.sql.execution.ExtractPythonUDFs
import org.apache.spark.sql.execution.datasources._
import org.apache.spark.sql.sources.commands.hive.HiveEmulationCatalog


// The original class declaration is elided in this excerpt. A minimal reconstruction
// follows; `SQLContextExtension` is an assumed name for the project's extension trait
// that supplies resolutionRules, strategies(planner) and the optimizer batch hooks.
class ExtendableSQLContext(sc: SparkContext)
  extends SQLContext(sc) with SQLContextExtension {
  self =>

  @transient
  override protected[sql] val planner =
  // HiveStrategies defines its own strategies, we should be back to SparkPlanner strategies
    new SparkPlanner with ExtendedPlanner {

      def baseStrategies: Seq[Strategy] =
        DataSourceStrategy ::
          DDLStrategy ::
          TakeOrderedAndProject ::
          Aggregation ::
          LeftSemiJoin ::
          EquiJoinSelection ::
          InMemoryScans ::
          BasicOperators ::
          BroadcastNestedLoop ::
          CartesianProduct ::
          DefaultJoin :: Nil

      override def strategies: Seq[Strategy] =
        self.strategies(this) ++
          experimental.extraStrategies ++
          baseStrategies
    }
} 
Example 6
Source File: ExtendableOptimizerSuite.scala    From HANAVora-Extensions   with Apache License 2.0
package org.apache.spark.sql.extension

import org.apache.spark.sql.catalyst.optimizer.{FiltersReduction, Optimizer}
import org.apache.spark.sql.extension.OptimizerFactory.ExtendableOptimizerBatch
import org.scalatest.{FunSuite, PrivateMethodTester}

class ExtendableOptimizerSuite extends FunSuite with PrivateMethodTester {

  implicit class OptimizerOps(opt: Optimizer) {
    private val nameMethod = PrivateMethod[String]('name)
    private def batches: Seq[AnyRef] = {
      // `batches` is protected in the Optimizer hierarchy, so read it via reflection.
      val clazz = opt.getClass
      val batchesMethod = clazz.getMethods.find(_.getName == "batches").get
      batchesMethod.setAccessible(true)
      batchesMethod.invoke(opt).asInstanceOf[Seq[AnyRef]]
    }
    def batchNames: Seq[String] =
      batches map { b => b invokePrivate nameMethod() }
  }

  test("No rules is equivalent to DefaultOptimizer") {
    val extOpt = OptimizerFactory.produce()
    val defOpt = OptimizerFactoryForTests.default()
    assert(extOpt.batchNames == defOpt.batchNames)
  }

  test("One early batch is added before the main optimizer batch") {
    val extOpt = OptimizerFactory.produce(
      earlyBatches = ExtendableOptimizerBatch("FOO", 1, FiltersReduction :: Nil) :: Nil
    )

    assert(extOpt.batchNames match {
      case subQueries :: early :: other => early.equals("FOO")
    })
  }

  test("Several early batches are added before the main optimizer batch") {
    val extOpt = OptimizerFactory.produce(
      earlyBatches = ExtendableOptimizerBatch("FOO", 1, FiltersReduction :: Nil) ::
        ExtendableOptimizerBatch("BAR", 1, FiltersReduction :: Nil) ::
        Nil
    )

    assert(extOpt.batchNames match {
      case subQueries :: firstEarly :: secondEarly :: other =>
        firstEarly.equals("FOO") && secondEarly.equals("BAR")
    })
  }

  test("Expression rules are added") {
    val extOpt = OptimizerFactory.produce(
      mainBatchRules = FiltersReduction :: Nil
    )
    val defOpt = OptimizerFactoryForTests.default()
    assert(extOpt.batchNames == defOpt.batchNames)
  }

  test("Both rules are added") {
    val extOpt = OptimizerFactory.produce(
      earlyBatches = ExtendableOptimizerBatch("FOO", 1, FiltersReduction :: Nil) :: Nil,
      mainBatchRules = FiltersReduction :: Nil
    )
    val defOpt = OptimizerFactoryForTests.default()
    assert(extOpt.batchNames.toSet ==
      defOpt.batchNames.toSet ++ Seq("FOO"))
  }
} 