Python pyspark.sql.SparkSession.builder Examples

The following are 30 code examples of pyspark.sql.SparkSession.builder. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the pyspark.sql module, or try the search function.
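
Before the examples, here is a minimal, self-contained sketch of the builder pattern they all share: configuration calls are chained on SparkSession.builder and the session is created (or reused) with getOrCreate(). The master URL, app name, and config key below are illustrative placeholders, not values taken from any of the projects listed.

from pyspark.sql import SparkSession

# Chain configuration calls on the builder, then create (or reuse) the session.
spark = SparkSession.builder \
    .master("local[2]") \
    .appName("builder-example") \
    .config("spark.sql.shuffle.partitions", "4") \
    .getOrCreate()

df = spark.createDataFrame([(1, "a"), (2, "b")], ["id", "letter"])
df.show()

spark.stop()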
Example #1
Source File: data_source_provider.py    From marvin-python-toolbox with Apache License 2.0
def get_spark_session(enable_hive=False, app_name='marvin-engine', configs=[]):
    """Return a Spark Session object"""

    # Prepare spark context to be used
    import findspark
    findspark.init()
    from pyspark.sql import SparkSession

    # Prepare the Spark session to be returned
    spark = SparkSession.builder

    spark = spark.appName(app_name)
    spark = spark.enableHiveSupport() if enable_hive else spark

    # Apply any extra configs that were passed in
    for config in configs:
        spark = spark.config(config)

    return spark.getOrCreate() 
Example #2
Source File: functions.py    From LearningApacheSpark with MIT License
def _test():
    import doctest
    import sys
    from pyspark.sql import Row, SparkSession
    import pyspark.sql.functions
    globs = pyspark.sql.functions.__dict__.copy()
    spark = SparkSession.builder\
        .master("local[4]")\
        .appName("sql.functions tests")\
        .getOrCreate()
    sc = spark.sparkContext
    globs['sc'] = sc
    globs['spark'] = spark
    globs['df'] = spark.createDataFrame([Row(name='Alice', age=2), Row(name='Bob', age=5)])
    (failure_count, test_count) = doctest.testmod(
        pyspark.sql.functions, globs=globs,
        optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE)
    spark.stop()
    if failure_count:
        sys.exit(-1) 
Example #3
Source File: distributed.py    From LearningApacheSpark with MIT License
def _test():
    import doctest
    import sys
    import numpy
    from pyspark.sql import SparkSession
    from pyspark.mllib.linalg import Matrices
    import pyspark.mllib.linalg.distributed
    try:
        # NumPy 1.14+ changed its string format.
        numpy.set_printoptions(legacy='1.13')
    except TypeError:
        pass
    globs = pyspark.mllib.linalg.distributed.__dict__.copy()
    spark = SparkSession.builder\
        .master("local[2]")\
        .appName("mllib.linalg.distributed tests")\
        .getOrCreate()
    globs['sc'] = spark.sparkContext
    globs['Matrices'] = Matrices
    (failure_count, test_count) = doctest.testmod(globs=globs, optionflags=doctest.ELLIPSIS)
    spark.stop()
    if failure_count:
        sys.exit(-1) 
Example #4
Source File: udf.py    From LearningApacheSpark with MIT License
def _test():
    import doctest
    import sys
    from pyspark.sql import SparkSession
    import pyspark.sql.udf
    globs = pyspark.sql.udf.__dict__.copy()
    spark = SparkSession.builder\
        .master("local[4]")\
        .appName("sql.udf tests")\
        .getOrCreate()
    globs['spark'] = spark
    (failure_count, test_count) = doctest.testmod(
        pyspark.sql.udf, globs=globs,
        optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE)
    spark.stop()
    if failure_count:
        sys.exit(-1) 
Example #5
Source File: column.py    From LearningApacheSpark with MIT License
def cast(self, dataType):
        """ Convert the column into type ``dataType``.

        >>> df.select(df.age.cast("string").alias('ages')).collect()
        [Row(ages=u'2'), Row(ages=u'5')]
        >>> df.select(df.age.cast(StringType()).alias('ages')).collect()
        [Row(ages=u'2'), Row(ages=u'5')]
        """
        if isinstance(dataType, basestring):
            jc = self._jc.cast(dataType)
        elif isinstance(dataType, DataType):
            from pyspark.sql import SparkSession
            spark = SparkSession.builder.getOrCreate()
            jdt = spark._jsparkSession.parseDataType(dataType.json())
            jc = self._jc.cast(jdt)
        else:
            raise TypeError("unexpected type: %s" % type(dataType))
        return Column(jc) 
Example #6
Source File: column.py    From LearningApacheSpark with MIT License
def _test():
    import doctest
    import sys
    from pyspark.sql import SparkSession
    from pyspark.sql.types import IntegerType, StringType, StructField, StructType
    import pyspark.sql.column
    globs = pyspark.sql.column.__dict__.copy()
    spark = SparkSession.builder\
        .master("local[4]")\
        .appName("sql.column tests")\
        .getOrCreate()
    sc = spark.sparkContext
    globs['spark'] = spark
    globs['df'] = sc.parallelize([(2, 'Alice'), (5, 'Bob')]) \
        .toDF(StructType([StructField('age', IntegerType()),
                          StructField('name', StringType())]))

    (failure_count, test_count) = doctest.testmod(
        pyspark.sql.column, globs=globs,
        optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE | doctest.REPORT_NDIFF)
    spark.stop()
    if failure_count:
        sys.exit(-1) 
Example #7
Source File: fpm.py    From LearningApacheSpark with MIT License
def _test():
    import doctest
    import sys
    from pyspark.sql import SparkSession
    import pyspark.mllib.fpm
    globs = pyspark.mllib.fpm.__dict__.copy()
    spark = SparkSession.builder\
        .master("local[4]")\
        .appName("mllib.fpm tests")\
        .getOrCreate()
    globs['sc'] = spark.sparkContext
    import tempfile

    temp_path = tempfile.mkdtemp()
    globs['temp_path'] = temp_path
    try:
        (failure_count, test_count) = doctest.testmod(globs=globs, optionflags=doctest.ELLIPSIS)
        spark.stop()
    finally:
        from shutil import rmtree
        try:
            rmtree(temp_path)
        except OSError:
            pass
    if failure_count:
        sys.exit(-1) 
Example #8
Source File: evaluation.py    From LearningApacheSpark with MIT License
def _test():
    import doctest
    import sys
    import numpy
    from pyspark.sql import SparkSession
    import pyspark.mllib.evaluation
    try:
        # NumPy 1.14+ changed its string format.
        numpy.set_printoptions(legacy='1.13')
    except TypeError:
        pass
    globs = pyspark.mllib.evaluation.__dict__.copy()
    spark = SparkSession.builder\
        .master("local[4]")\
        .appName("mllib.evaluation tests")\
        .getOrCreate()
    globs['sc'] = spark.sparkContext
    (failure_count, test_count) = doctest.testmod(globs=globs, optionflags=doctest.ELLIPSIS)
    spark.stop()
    if failure_count:
        sys.exit(-1) 
Example #9
Source File: fixtures.py    From pytest-spark with MIT License
def _spark_session():
    """Internal fixture for SparkSession instance.

    Yields SparkSession instance if it is supported by the pyspark
    version, otherwise yields None.

    Required to correctly initialize `spark_context` fixture after
    `spark_session` fixture.

    .. note::
        It is not possible to create SparkSession from the existing
        SparkContext.
    """

    try:
        from pyspark.sql import SparkSession
    except ImportError:
        yield
    else:
        session = SparkSession.builder \
            .config(conf=SparkConfigBuilder().get()) \
            .getOrCreate()

        yield session
        session.stop() 
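
For context, a test consuming this fixture might look like the sketch below; the test name and DataFrame contents are invented for illustration, assuming pytest-spark exposes the fixture as spark_session.

def test_row_count(spark_session):
    # spark_session is injected by the pytest-spark fixture shown above.
    df = spark_session.createDataFrame([(1, "a"), (2, "b")], ["id", "letter"])
    assert df.count() == 2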
Example #10
Source File: spark.py    From mlflow with Apache License 2.0
def _load_pyfunc(path):
    """
    Load PyFunc implementation. Called by ``pyfunc.load_pyfunc``.

    :param path: Local filesystem path to the MLflow Model with the ``spark`` flavor.
    """
    # NOTE: The getOrCreate() call below may change settings of the active session which we do not
    # intend to do here. In particular, setting master to local[1] can break distributed clusters.
    # To avoid this problem, we explicitly check for an active session. This is not ideal but there
    # is no good workaround at the moment.
    import pyspark

    spark = pyspark.sql.SparkSession._instantiatedSession
    if spark is None:
        spark = pyspark.sql.SparkSession.builder.config("spark.python.worker.reuse", True) \
            .master("local[1]").getOrCreate()
    return _PyFuncModelWrapper(spark, _load_model(model_uri=path)) 
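
The comment above explains why the code inspects the private _instantiatedSession attribute before building a session. As a rough sketch only (not what mlflow does here): on PySpark 3.0+ the same "reuse an active session if one exists" pattern can be written with the public SparkSession.getActiveSession() API.

from pyspark.sql import SparkSession

# Reuse an existing session if one is active; otherwise build a small local one.
spark = SparkSession.getActiveSession()  # public API in PySpark 3.0+
if spark is None:
    spark = SparkSession.builder \
        .config("spark.python.worker.reuse", True) \
        .master("local[1]") \
        .getOrCreate()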
Example #11
Source File: populate_tables.py    From data-testing-with-airflow with Apache License 2.0
def spark():
    spark = SparkSession.builder \
        .config('spark.sql.warehouse.dir', '/usr/local/airflow/spark_warehouse') \
        .config('spark.hadoop.javax.jdo.option.ConnectionURL',
                'jdbc:derby:;databaseName=/usr/local/airflow/metastore_db;create=true') \
        .enableHiveSupport() \
        .getOrCreate()

    # Now populate some tables
    for database_name in ['dev_app', 'tst_app', 'acc_app', 'prd_app', 'transaction_a', 'transaction_b']:
        spark.sql('DROP DATABASE IF EXISTS {0} CASCADE'.format(database_name)).collect()
        spark.sql('CREATE DATABASE {0}'.format(database_name)).collect()

    populate_transaction_a(spark)
    populate_transaction_b(spark)

    for environment in ['dev', 'tst', 'acc', 'prd']:
        populate_account_info(spark, environment)
        populate_countries(spark, environment) 
Example #12
Source File: conftest.py    From data-testing-with-airflow with Apache License 2.0
def spark(request):
    spark = SparkSession.builder \
        .master('local[*]') \
        .enableHiveSupport() \
        .getOrCreate()

    # Now populate some tables
    for database_name in ['tst_app', 'transaction_a', 'transaction_b']:
        spark.sql('DROP DATABASE IF EXISTS {0} CASCADE'.format(database_name)).collect()
        spark.sql('CREATE DATABASE {0}'.format(database_name))

    populate_transaction_a(spark)
    populate_transaction_b(spark)
    populate_account_info(spark)
    populate_countries(spark)

    return spark 
Example #13
Source File: utils.py    From mlflow with Apache License 2.0
def _get_or_create_spark_session(jars=None):
    jar_path = jars if jars is not None else _get_mlflow_spark_jar_path()
    return SparkSession.builder \
        .config("spark.jars", jar_path) \
        .master("local[*]") \
        .getOrCreate() 
Example #14
Source File: utils.py    From mlflow with Apache License 2.0
def spark_session():
    jar_path = _get_mlflow_spark_jar_path()
    session = SparkSession.builder \
        .config("spark.jars", jar_path) \
        .master("local[*]") \
        .getOrCreate()
    yield session
    session.stop() 
Example #15
Source File: ga_chp_bq_ingest_avro_file.py    From MorphL-Community-Edition with Apache License 2.0
def main():
    spark_session = (
        SparkSession.builder
        .appName(APPLICATION_NAME)
        .master(MASTER_URL)
        .config('spark.cassandra.connection.host', MORPHL_SERVER_IP_ADDRESS)
        .config('spark.cassandra.auth.username', MORPHL_CASSANDRA_USERNAME)
        .config('spark.cassandra.auth.password', MORPHL_CASSANDRA_PASSWORD)
        .config('spark.sql.shuffle.partitions', 16)
        .getOrCreate())

    log4j = spark_session.sparkContext._jvm.org.apache.log4j
    log4j.LogManager.getRootLogger().setLevel(log4j.Level.ERROR)

    avro_df = (
        spark_session
        .read
        .format('avro')
        .load(LOCAL_AVRO_FILE))

    save_options_ga_chp_bq_features_raw = {
        'keyspace': MORPHL_CASSANDRA_KEYSPACE,
        'table': ('ga_chp_bq_features_raw_t'
                  if TRAINING_OR_PREDICTION == 'training'
                  else 'ga_chp_bq_features_raw_p')
    }

    (avro_df
     .withColumn('day_of_data_capture', f.lit(DAY_OF_DATA_CAPTURE))
     .withColumn('website_url', f.lit(WEBSITE_URL))
     .write
     .format('org.apache.spark.sql.cassandra')
     .mode('append')
     .options(**save_options_ga_chp_bq_features_raw)
     .save()) 
Example #16
Source File: test_standardize.py    From datadevops with MIT License
def spark():
    """Spark Session fixture
    """
    from pyspark.sql import SparkSession

    spark = SparkSession.builder\
        .master("local[2]")\
        .appName("Unit Testing")\
        .getOrCreate()
    spark.sparkContext.setLogLevel("ERROR")
    return spark 
Example #17
Source File: test_transform.py    From datadevops with MIT License
def spark():
    """Spark Session fixture
    """
    from pyspark.sql import SparkSession

    spark = SparkSession.builder\
        .master("local[2]")\
        .appName("Unit Testing")\
        .getOrCreate()
    spark.sparkContext.setLogLevel("ERROR")
    return spark 
Example #18
Source File: dml_script.py    From PerfKitBenchmarker with Apache License 2.0
def main():
  spark = (SparkSession.builder
           .appName('Setup Spark table')
           .enableHiveSupport()
           .getOrCreate())
  table = 'warehouse'
  table_dir = sys.argv[1]
  # clean up previous table
  spark.sql('drop table if exists ' + table)
  # register new table
  spark.catalog.createTable(table, table_dir, source='parquet') 
Example #19
Source File: resources.py    From dagster with Apache License 2.0
def spark_session_from_config(spark_conf=None):
    spark_conf = check.opt_dict_param(spark_conf, 'spark_conf')
    builder = SparkSession.builder
    flat = flatten_dict(spark_conf)
    for key, value in flat:
        builder = builder.config(key, value)

    return builder.getOrCreate() 
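
A hypothetical call, assuming flatten_dict turns a nested dict into dotted (key, value) pairs:

# Assuming flatten_dict({'spark': {'executor': {'memory': '2g'}}}) yields
# [('spark.executor.memory', '2g')], this is roughly equivalent to
# SparkSession.builder.config('spark.executor.memory', '2g').getOrCreate().
spark = spark_session_from_config({'spark': {'executor': {'memory': '2g'}}})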
Example #20
Source File: spark_table.py    From PerfKitBenchmarker with Apache License 2.0
def main():
  spark = (SparkSession.builder
           .appName('Setup Spark tables')
           .enableHiveSupport()
           .getOrCreate())
  root_dir = sys.argv[1]
  tables = sys.argv[2].split(',')
  for table in tables:
    table_dir = os.path.join(root_dir, table)
    # clean up previous table
    spark.sql('drop table if exists ' + table)
    # register new table
    spark.catalog.createTable(table, table_dir, source='parquet') 
Example #21
Source File: util.py    From spark-sklearn with Apache License 2.0
def createLocalSparkSession(appName="spark-sklearn"):
    """Generates a :class:`SparkSession` utilizing all local cores
    with the progress bar disabled but otherwise default config."""
    return SparkSession.builder \
                       .master("local[*]") \
                       .appName(appName) \
                       .config("spark.ui.showConsoleProgress", "false") \
                       .getOrCreate() 
Example #22
Source File: test_gapply.py    From spark-sklearn with Apache License 2.0
def setUpClass(cls):
        super(GapplyConfTests, cls).setUpClass()
        cls.spark = SparkSession.builder \
                                .config("spark.sql.retainGroupColumns", "false") \
                                .getOrCreate() 
Example #23
Source File: group.py    From LearningApacheSpark with MIT License
def _test():
    import doctest
    import sys
    from pyspark.sql import Row, SparkSession
    from pyspark.sql.types import IntegerType, StringType, StructField, StructType
    import pyspark.sql.group
    globs = pyspark.sql.group.__dict__.copy()
    spark = SparkSession.builder\
        .master("local[4]")\
        .appName("sql.group tests")\
        .getOrCreate()
    sc = spark.sparkContext
    globs['sc'] = sc
    globs['spark'] = spark
    globs['df'] = sc.parallelize([(2, 'Alice'), (5, 'Bob')]) \
        .toDF(StructType([StructField('age', IntegerType()),
                          StructField('name', StringType())]))
    globs['df3'] = sc.parallelize([Row(name='Alice', age=2, height=80),
                                   Row(name='Bob', age=5, height=85)]).toDF()
    globs['df4'] = sc.parallelize([Row(course="dotNET", year=2012, earnings=10000),
                                   Row(course="Java",   year=2012, earnings=20000),
                                   Row(course="dotNET", year=2012, earnings=5000),
                                   Row(course="dotNET", year=2013, earnings=48000),
                                   Row(course="Java",   year=2013, earnings=30000)]).toDF()
    globs['df5'] = sc.parallelize([
        Row(training="expert", sales=Row(course="dotNET", year=2012, earnings=10000)),
        Row(training="junior", sales=Row(course="Java",   year=2012, earnings=20000)),
        Row(training="expert", sales=Row(course="dotNET", year=2012, earnings=5000)),
        Row(training="junior", sales=Row(course="dotNET", year=2013, earnings=48000)),
        Row(training="expert", sales=Row(course="Java",   year=2013, earnings=30000))]).toDF()

    (failure_count, test_count) = doctest.testmod(
        pyspark.sql.group, globs=globs,
        optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE | doctest.REPORT_NDIFF)
    spark.stop()
    if failure_count:
        sys.exit(-1) 
Example #24
Source File: dl_runner.py    From sparkflow with MIT License
def create_testing_spark_session(cls):
        return (SparkSession.builder
                .master('local[2]')
                .appName('sparkflow')
                .getOrCreate()) 
Example #25
Source File: udf.py    From LearningApacheSpark with MIT License
def _create_judf(self):
        from pyspark.sql import SparkSession

        spark = SparkSession.builder.getOrCreate()
        sc = spark.sparkContext

        wrapped_func = _wrap_function(sc, self.func, self.returnType)
        jdt = spark._jsparkSession.parseDataType(self.returnType.json())
        judf = sc._jvm.org.apache.spark.sql.execution.python.UserDefinedPythonFunction(
            self._name, wrapped_func, jdt, self.evalType, self.deterministic)
        return judf 
Example #26
Source File: test_gapply.py    From spark-sklearn with Apache License 2.0
def tearDownClass(cls):
        super(GapplyConfTests, cls).tearDownClass()
        # Creating a new SparkSession here seems confusing, but it is necessary because
        # the config is (for some stupid reason...) cached, which would make it get in
        # the way of other tests that expect a default configuration.
        cls.spark = SparkSession.builder \
                                .config("spark.sql.retainGroupColumns", "true") \
                                .getOrCreate() 
Example #27
Source File: image.py    From LearningApacheSpark with MIT License
def readImages(self, path, recursive=False, numPartitions=-1,
                   dropImageFailures=False, sampleRatio=1.0, seed=0):
        """
        Reads the directory of images from the local or remote source.

        .. note:: If multiple jobs are run in parallel with different sampleRatio or recursive flag,
            there may be a race condition where one job overwrites the hadoop configs of another.

        .. note:: If sample ratio is less than 1, sampling uses a PathFilter that is efficient but
            potentially non-deterministic.

        .. note:: Deprecated in 2.4.0. Use `spark.read.format("image").load(path)` instead and
            this `readImages` will be removed in 3.0.0.

        :param str path: Path to the image directory.
        :param bool recursive: Recursive search flag.
        :param int numPartitions: Number of DataFrame partitions.
        :param bool dropImageFailures: Drop the files that are not valid images.
        :param float sampleRatio: Fraction of the images loaded.
        :param int seed: Random number seed.
        :return: a :class:`DataFrame` with a single column of "images",
               see ImageSchema for details.

        >>> df = ImageSchema.readImages('data/mllib/images/origin/kittens', recursive=True)
        >>> df.count()
        5

        .. versionadded:: 2.3.0
        """
        warnings.warn("`ImageSchema.readImages` is deprecated. " +
                      "Use `spark.read.format(\"image\").load(path)` instead.", DeprecationWarning)
        spark = SparkSession.builder.getOrCreate()
        image_schema = spark._jvm.org.apache.spark.ml.image.ImageSchema
        jsession = spark._jsparkSession
        jresult = image_schema.readImages(path, jsession, recursive, numPartitions,
                                          dropImageFailures, float(sampleRatio), seed)
        return DataFrame(jresult, spark._wrapped) 
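
The deprecation note in the docstring points to the image data source that replaced readImages in Spark 2.4+. A minimal sketch of that replacement, with an illustrative path:

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

# The "image" data source loads files into a single struct column named "image".
image_df = spark.read.format("image").load("data/mllib/images/origin/kittens")
image_df.select("image.origin", "image.height", "image.width").show(truncate=False)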
Example #28
Source File: conftest.py    From data-testing-with-airflow with Apache License 2.0
def spark(request):
    """
    Fixture to create the SparkSession.
    """
    spark = SparkSession.builder \
        .appName(APP_NAME) \
        .config('spark.sql.warehouse.dir', '/usr/local/airflow/spark_warehouse') \
        .config('spark.hadoop.javax.jdo.option.ConnectionURL',
                'jdbc:derby:;databaseName=/usr/local/airflow/metastore_db;create=true') \
        .enableHiveSupport() \
        .getOrCreate()

    request.addfinalizer(spark.stop)

    return spark 
Example #29
Source File: spark_jdbc_script.py    From airflow with Apache License 2.0
def _create_spark_session(arguments) -> SparkSession:
    return SparkSession.builder \
        .appName(arguments.name) \
        .enableHiveSupport() \
        .getOrCreate() 
Example #30
Source File: test_sparktorch.py    From sparktorch with MIT License
def spark():
    return (SparkSession.builder
            .master('local[2]')
            .appName('sparktorch')
            .getOrCreate())