Python pyspark.SparkConf() Examples

The following are 30 code examples of pyspark.SparkConf(), drawn from open source projects. Each example lists the project and source file it comes from, so you can follow it back to the original code. You may also want to check out the other available functions and classes of the pyspark module.
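
Before the project-specific examples, here is a minimal sketch of the pattern they all share: create a SparkConf, set options on it, and pass it to a SparkContext. The application name and settings below are placeholders rather than values taken from any of the projects listed.

from pyspark import SparkConf, SparkContext

# Build a configuration, set a couple of common options, and start a context.
conf = SparkConf().setAppName("example-app").setMaster("local[*]")
conf.set("spark.executor.memory", "2g")

sc = SparkContext(conf=conf)
print(sc.getConf().get("spark.app.name"))  # prints "example-app"
sc.stop()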
Example #1
Source File: spark_process.py    From dispel4py with Apache License 2.0    7 votes
def run():
    from pyspark import SparkContext, SparkConf

    conf = SparkConf()
    conf.setAppName('dispel4py')
    conf.set("spark.storage.memoryFraction", "0.5")
    sc = SparkContext(
        conf=conf)

    from dispel4py.new import processor
    from dispel4py.utils import load_graph

    args = parse_args()

    graph = load_graph(args.module, args.attr)
    if graph is None:
        return
    graph.flatten()

    inputs = processor.create_inputs(args, graph)

    process(sc, graph, inputs=inputs, args=args) 
Example #2
Source File: test_spark_model_export.py    From mlflow with Apache License 2.0    6 votes
def spark_context():
    conf = pyspark.SparkConf()
    conf.set(key="spark.jars.packages",
             value='ml.combust.mleap:mleap-spark-base_2.11:0.12.0,'
                   'ml.combust.mleap:mleap-spark_2.11:0.12.0')
    max_tries = 3
    for num_tries in range(max_tries):
        try:
            spark = get_spark_session(conf)
            return spark.sparkContext
        except Exception as e:
            if num_tries >= max_tries - 1:
                raise
            _logger.exception("Attempt %s to create a SparkSession failed, retrying...",
                              num_tries)
Example #3
Source File: build.py    From sift with MIT License    6 votes
def __call__(self):
        c = SparkConf().setAppName('Build %s' % self.model_name)

        log.info('Using spark master: %s', c.get('spark.master'))
        sc = SparkContext(conf=c)

        kwargs = self.model.prepare(sc)
        m = self.model.build(**kwargs)
        m = self.model.format_items(m)
        m = self.formatter(m)

        if self.output_path:
            log.info("Saving to: %s", self.output_path)
            if os.path.isdir(self.output_path):
                log.warn('Writing over output path: %s', self.output_path)
                shutil.rmtree(self.output_path)
            m.saveAsTextFile(self.output_path, 'org.apache.hadoop.io.compress.GzipCodec')
        elif self.sample > 0:
            print('\n'.join(str(i) for i in m.take(self.sample)))

        log.info('Done.') 
Example #4
Source File: taar_dynamo.py    From telemetry-airflow with Mozilla Public License 2.0    6 votes
def main(date, aws_access_key_id, aws_secret_access_key, region, table, sample_rate):

    # Clobber the AWS access credentials
    os.environ["AWS_ACCESS_KEY_ID"] = aws_access_key_id
    os.environ["AWS_SECRET_ACCESS_KEY"] = aws_secret_access_key

    APP_NAME = "TaarDynamo"
    conf = SparkConf().setAppName(APP_NAME)
    spark = SparkSession.builder.config(conf=conf).getOrCreate()
    date_obj = datetime.strptime(date, "%Y%m%d") - PATCH_DAYS

    reduction_output = run_etljob(
        spark,
        date_obj,
        region,
        table,
        sample_rate,
        aws_access_key_id,
        aws_secret_access_key,
    )
    pprint(reduction_output) 
Example #5
Source File: spark.py    From pyFTS with GNU General Public License v3.0    6 votes
def create_spark_conf(**kwargs):
    """
    Configure the Spark master node

    :param kwargs:
    :return:
    """
    spark_executor_memory = kwargs.get("spark_executor_memory", "2g")
    spark_driver_memory = kwargs.get("spark_driver_memory", "2g")
    url = kwargs.get("url", SPARK_ADDR)
    app = kwargs.get("app", 'pyFTS')

    conf = SparkConf()
    conf.setMaster(url)
    conf.setAppName(app)
    conf.set("spark.executor.memory", spark_executor_memory)
    conf.set("spark.driver.memory", spark_driver_memory)
    conf.set("spark.memory.offHeap.enabled",True)
    conf.set("spark.memory.offHeap.size","16g")
    
    return conf 
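
As a usage sketch (not part of the pyFTS source above), the returned SparkConf can be passed straight to a SparkContext; the master URL, app name, and memory value here are placeholders standing in for SPARK_ADDR and real settings.

from pyspark import SparkContext

# Hypothetical call; the URL, app name, and memory setting are placeholders.
conf = create_spark_conf(url="spark://master:7077", app="pyFTS",
                         spark_executor_memory="4g")
sc = SparkContext(conf=conf)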
Example #6
Source File: config.py    From pytest-spark with MIT License    6 votes
def initialize(cls, options_from_ini=None):
        if cls._instance:
            return cls._instance

        from pyspark import SparkConf

        cls._instance = SparkConf()

        cls.options = dict(cls.DEFAULTS)
        if options_from_ini:
            cls.options.update(cls._parse_config(options_from_ini))

        for k, v in cls.options.items():
            cls._instance.set(k, v)

        return cls._instance 
Example #7
Source File: testconfig.py    From SMV with Apache License 2.0    6 votes
def sparkSession(cls):
        if not hasattr(cls, "spark"):
            # We can't use the SparkSession Builder here, since we need to call
            # Scala side's SmvTestHive.createContext to create the HiveTestContext's
            # SparkSession.
            # So we need to
            #   * Create a java_gateway
            #   * Create a SparkConf using the jgw (since without it SparkContext will ignore the given conf)
            #   * Create python SparkContext using the SparkConf (so we can specify the warehouse.dir)
            #   * Create Scala side HiveTestContext SparkSession
            #   * Create python SparkSession
            jgw = launch_gateway(None)
            jvm = jgw.jvm
            import tempfile
            import getpass
            hivedir = "file://{0}/{1}/smv_hive_test".format(tempfile.gettempdir(), getpass.getuser())
            sConf = SparkConf(False, _jvm=jvm).set("spark.sql.test", "")\
                                              .set("spark.sql.hive.metastore.barrierPrefixes",
                                                   "org.apache.spark.sql.hive.execution.PairSerDe")\
                                              .set("spark.sql.warehouse.dir", hivedir)\
                                              .set("spark.ui.enabled", "false")
            sc = SparkContext(master="local[1]", appName="SMV Python Test", conf=sConf, gateway=jgw).getOrCreate()
            jss = sc._jvm.org.apache.spark.sql.hive.test.SmvTestHive.createContext(sc._jsc.sc())
            cls.spark = SparkSession(sc, jss.sparkSession())
        return cls.spark 
Example #8
Source File: finance_similarity.py    From Spark-in-Finance-Quantitative-Investing with Apache License 2.0    6 votes
def create_sc():
    sc_conf = SparkConf()
    sc_conf.setAppName("finance-similarity-app")
    sc_conf.setMaster('spark://10.21.208.21:7077')
    sc_conf.set('spark.executor.memory', '2g')
    sc_conf.set('spark.executor.cores', '4')
    sc_conf.set('spark.cores.max', '40')
    sc_conf.set('spark.logConf', True)
    print(sc_conf.getAll())

    # If a SparkContext is already running, reuse it;
    # otherwise create a new one from the configuration above.
    sc = SparkContext.getOrCreate(conf=sc_conf)

    return sc 
Example #9
Source File: test_ExtractCCLinks.py    From cccatalog with MIT License    6 votes
def setUpClass(cls):
        #load sample warc files
        fh           = open('tests/sample_wat.paths')
        cls.watPaths = fh.readlines()

        #initialize class
        cls.cclinks = CCLinks('CC-MAIN-2018-13', 5)
        cls.cclinks.output = 'tests/output/{}/parquet'.format(cls.cclinks.crawlIndex)

        #remove output directory
        if os.path.exists(cls.cclinks.output):
            shutil.rmtree('tests/output')

        #init pyspark
        conf   = pyspark.SparkConf().setMaster('local[*]').setAppName('Test_ExtractCCLinks')
        cls.sc = pyspark.SparkContext.getOrCreate(conf=conf) 
Example #10
Source File: sparkcc.py    From cc-pyspark with MIT License    6 votes
def run(self):
        self.args = self.parse_arguments()

        conf = SparkConf()

        if self.args.spark_profiler:
            conf = conf.set("spark.python.profile", "true")

        sc = SparkContext(
            appName=self.name,
            conf=conf)
        sqlc = SQLContext(sparkContext=sc)

        self.init_accumulators(sc)

        self.run_job(sc, sqlc)

        if self.args.spark_profiler:
            sc.show_profiles()

        sc.stop() 
Example #11
Source File: spark_conf.py    From airflow-pipeline with Apache License 2.0    6 votes
def set_spark_defaults(conf, name='spark-job'):
    """
    Update the configuration dictionary for setting up spark, creating the
    dictionary if does not exist yet
    """
    if not conf:
        conf = dict()

    home = os.path.join('/tmp', str(uuid.uuid4()))
    conf['SparkConfiguration'] = SparkConf()\
        .setMaster('yarn-client')\
        .setAppName(name)\
        .set("spark.sql.shuffle.partitions", "1000")\
        .set("spark.scheduler.revive.interval", "3")\
        .set("spark.task.maxFailures", "0")\
        .set("spark.executorEnv.HOME", home)

    return conf 
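
As a hedged usage sketch (not from airflow-pipeline itself), the 'SparkConfiguration' entry of the returned dictionary is a SparkConf that can seed a SparkContext; the job name is a placeholder.

from pyspark import SparkContext

# Hypothetical job name; set_spark_defaults supplies the yarn-client defaults.
conf = set_spark_defaults({}, name='nightly-aggregation')
sc = SparkContext(conf=conf['SparkConfiguration'])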
Example #12
Source File: spark.py    From qb with MIT License    6 votes
def create_spark_context(app_name="Quiz Bowl", configs=None) -> SparkContext:
    if QB_SPARK_MASTER != "":
        log.info("Spark master is %s" % QB_SPARK_MASTER)
        spark_conf = SparkConf()\
            .set('spark.rpc.message.maxSize', 300)\
            .setAppName(app_name)\
            .setMaster(QB_SPARK_MASTER)
    else:
        spark_conf = SparkConf()\
            .set('spark.rpc.message.maxSize', 300)\
            .setAppName(app_name)
    if configs is not None:
        for key, value in configs:
            if key in ('spark.executor.cores', 'spark.max.cores'):
                if value > QB_MAX_CORES:
                    log.info('Requested {r_cores} cores when the machine only has {n_cores} cores, reducing number of '
                             'cores to {n_cores}'.format(r_cores=value, n_cores=QB_MAX_CORES))
                    value = QB_MAX_CORES
            spark_conf = spark_conf.set(key, value)
    return SparkContext.getOrCreate(spark_conf) 
Example #13
Source File: test_spark.py    From mlflow with Apache License 2.0    5 votes
def spark():
    conf = pyspark.SparkConf()
    return get_spark_session(conf) 
Example #14
Source File: test_spark.py    From mlflow with Apache License 2.0    5 votes
def score_model_as_udf(model_uri, pandas_df, result_type="double"):
    spark = get_spark_session(pyspark.SparkConf())
    spark_df = spark.createDataFrame(pandas_df)
    pyfunc_udf = spark_udf(spark=spark, model_uri=model_uri, result_type=result_type)
    new_df = spark_df.withColumn("prediction", pyfunc_udf(*pandas_df.columns))
    return [x['prediction'] for x in new_df.collect()] 
Example #15
Source File: ozy_streaming.py    From ozymandias with MIT License    5 votes
def main():
    """Run Spark Streaming"""
    conf = SparkConf()
    sc = SparkContext(appName='Ozymandias', conf=conf)
    sc.setLogLevel('WARN')
    
    with open(ROOT + 'channels.json', 'r') as f:
        channels = json.load(f)
    topics = [t['topic'] for t in channels['channels']]
    
    n_secs = 0.5
    ssc = StreamingContext(sc, n_secs)
    stream = KafkaUtils.createDirectStream(ssc, topics, {
                        'bootstrap.servers':'localhost:9092', 
                        'group.id':'ozy-group', 
                        'fetch.message.max.bytes':'15728640',
                        'auto.offset.reset':'largest'})
    
    stream.map(
            deserializer
        ).map(
            image_detector
        ).foreachRDD(
            message_sender)
    
    ssc.start()
    ssc.awaitTermination() 
Example #16
Source File: taar_dynamo.py    From python_mozetl with MIT License    5 votes
def main(date, region, table, prod_iam_role, sample_rate):
    APP_NAME = "HBaseAddonRecommenderView"
    conf = SparkConf().setAppName(APP_NAME)
    spark = SparkSession.builder.config(conf=conf).getOrCreate()
    date_obj = datetime.strptime(date, "%Y%m%d")

    if prod_iam_role.strip() == "":
        prod_iam_role = None

    reduction_output = run_etljob(
        spark, date_obj, region, table, prod_iam_role, sample_rate
    )
    pprint(reduction_output) 
Example #17
Source File: holoclean.py    From HoloClean-Legacy-deprecated with Apache License 2.0    5 votes
def _init_spark(self):
        """
        Set spark configuration

        :return: Spark session
        :return: SQL context
        """
        conf = SparkConf()

        # Link PG driver to Spark
        conf.set("spark.executor.extraClassPath",
                 self.holoclean_path + "/" + self.pg_driver)
        conf.set("spark.driver.extraClassPath",
                 self.holoclean_path + "/" + self.pg_driver)

        conf.set('spark.driver.memory', '20g')
        conf.set('spark.executor.memory', '20g')
        conf.set("spark.network.timeout", "6000")
        conf.set("spark.rpc.askTimeout", "99999")
        conf.set("spark.worker.timeout", "60000")
        conf.set("spark.driver.maxResultSize", '70g')
        conf.set("spark.ui.showConsoleProgress", "false")

        if self.spark_cluster:
            conf.set("spark.master", self.spark_cluster)

        # Gets Spark context
        sc = SparkContext(conf=conf)
        sc.setLogLevel("OFF")
        sql_ctxt = SQLContext(sc)
        return sql_ctxt.sparkSession, sql_ctxt 
Example #18
Source File: hyperparameters_tuning.py    From intro_ds with Apache License 2.0    5 votes
def startSpark():
    """
    Create the SparkContext, which is the entry point of a Spark program
    """
    conf = SparkConf().setAppName("grid search example")
    sc = SparkContext(conf=conf)
    return sc 
Example #19
Source File: reagent_sql_test_base.py    From ReAgent with BSD 3-Clause "New" or "Revised" License    5 votes
def getConf(self):
        conf = SparkConf()
        for k, v in DEFAULT_SPARK_CONFIG.items():
            conf.set(k, v)
        return conf 
Example #20
Source File: ClimatologySpark2.py    From incubator-sdap-nexus with Apache License 2.0    5 votes
def configureSpark(sparkConfig, appName, memoryPerExecutor='4G', coresPerExecutor=1):
    mode, numExecutors, numPartitions = sparkConfig.split(',')
    numExecutors = int(numExecutors)
    print('numExecutors =', numExecutors, file=sys.stderr)
    numPartitions = int(numPartitions)
    print('numPartitions =', numPartitions, file=sys.stderr)
    if mode == 'multicore':
        print('Using pysparkling', file=sys.stderr)
        import pysparkling
        sc = pysparkling.Context()
    else:
        print('Using PySpark', file=sys.stderr)
        sparkMaster = mode
        spConf = SparkConf()
        spConf.setAppName(appName)
        spConf.set("spark.executorEnv.HOME",
                   os.path.join(os.getenv('HOME'), 'spark_exec_home'))
        spConf.set("spark.executorEnv.PYTHONPATH", os.getcwd())
        spConf.set("spark.executor.memory", memoryPerExecutor)
        print('memoryPerExecutor =', memoryPerExecutor, file=sys.stderr)
        try:
            sparkMaster = SparkMasterOverride
        except NameError:
            # No module-level override defined; keep the master parsed from sparkConfig
            pass
        if sparkMaster[:5] == "mesos":
            spConf.set("spark.cores.max", numExecutors)
        else:
            # Spark master is YARN or local[N]
            spConf.set("spark.executor.instances", numExecutors)
            spConf.set("spark.executor.cores", coresPerExecutor)
            spConf.setMaster(sparkMaster)
        sc = SparkContext(conf=spConf)
    return sc, numExecutors, numPartitions 
Example #21
Source File: test.py    From TensorFlowOnSpark with Apache License 2.0    5 votes
def setUpClass(cls):
    master = os.getenv('MASTER')
    assert master is not None, "Please start a Spark standalone cluster and export MASTER to your env."

    num_workers = os.getenv('SPARK_WORKER_INSTANCES')
    assert num_workers is not None, "Please export SPARK_WORKER_INSTANCES to your env."
    cls.num_workers = int(num_workers)

    spark_jars = os.getenv('SPARK_CLASSPATH')
    assert spark_jars, "Please add path to tensorflow/ecosystem/hadoop jar to SPARK_CLASSPATH."

    cls.conf = SparkConf().set('spark.jars', spark_jars)
    cls.sc = SparkContext(master, cls.__name__, conf=cls.conf)
    cls.spark = SparkSession.builder.getOrCreate() 
Example #22
Source File: taar_ensemble.py    From telemetry-airflow with Mozilla Public License 2.0    5 votes
def main(
    date,
    aws_access_key_id,
    aws_secret_access_key,
    bucket,
    prefix,
    elastic_net_param,
    reg_param,
    min_installed_addons,
    client_sample_date_from,
    sample_rate,
):
    print("Sampling clients since {}".format(client_sample_date_from))

    # Clobber the AWS access credentials
    os.environ["AWS_ACCESS_KEY_ID"] = aws_access_key_id
    os.environ["AWS_SECRET_ACCESS_KEY"] = aws_secret_access_key

    ctx = default_context()

    APP_NAME = "TaarEnsemble"
    conf = SparkConf().setAppName(APP_NAME)
    spark = SparkSession.builder.config(conf=conf).getOrCreate()

    taar_training = extract(
        spark, client_sample_date_from, min_installed_addons, sample_rate
    )
    coefs = transform(ctx, spark, taar_training, reg_param, elastic_net_param)
    load(coefs, date, prefix, bucket) 
Example #23
Source File: tests.py    From pyspark-cassandra with Apache License 2.0    5 votes
def setUpClass(cls):

        # connect to cassandra and create a keyspace for testing
        cls.session = Cluster().connect()
        cls.session.execute('''
            CREATE KEYSPACE IF NOT EXISTS %s WITH
            replication = {'class': 'SimpleStrategy', 'replication_factor': 1};
        ''' % (cls.keyspace,))
        cls.session.set_keyspace(CassandraTestCase.keyspace)

        # create a cassandra spark context
        cls.sc = CassandraSparkContext(
            conf=SparkConf().setAppName("PySpark Cassandra Test")) 
Example #24
Source File: tasks.py    From flask-spark-docker with MIT License    5 votes
def create_task(words):
    conf = SparkConf().setAppName('letter count')
    sc = SparkContext(conf=conf)
    seq = words.split()
    data = sc.parallelize(seq)
    counts = data.map(lambda word: (word, 1)).reduceByKey(add).collect()
    sc.stop()
    return dict(counts) 
Example #25
Source File: context.py    From LearningApacheSpark with MIT License    5 votes
def getOrCreate(cls, checkpointPath, setupFunc):
        """
        Either recreate a StreamingContext from checkpoint data or create a new StreamingContext.
        If checkpoint data exists in the provided `checkpointPath`, then StreamingContext will be
        recreated from the checkpoint data. If the data does not exist, then the provided setupFunc
        will be used to create a new context.

        @param checkpointPath: Checkpoint directory used in an earlier streaming program
        @param setupFunc:      Function to create a new context and setup DStreams
        """
        cls._ensure_initialized()
        gw = SparkContext._gateway

        # Check whether valid checkpoint information exists in the given path
        ssc_option = gw.jvm.StreamingContextPythonHelper().tryRecoverFromCheckpoint(checkpointPath)
        if ssc_option.isEmpty():
            ssc = setupFunc()
            ssc.checkpoint(checkpointPath)
            return ssc

        jssc = gw.jvm.JavaStreamingContext(ssc_option.get())

        # If there is already an active instance of Python SparkContext use it, or create a new one
        if not SparkContext._active_spark_context:
            jsc = jssc.sparkContext()
            conf = SparkConf(_jconf=jsc.getConf())
            SparkContext(conf=conf, gateway=gw, jsc=jsc)

        sc = SparkContext._active_spark_context

        # update ctx in serializer
        cls._transformerSerializer.ctx = sc
        return StreamingContext(sc, None, jssc) 
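
For context, a minimal usage sketch of this getOrCreate pattern (the checkpoint path, batch interval, and app name are placeholders, not taken from LearningApacheSpark):

from pyspark import SparkConf, SparkContext
from pyspark.streaming import StreamingContext

def create_streaming_context():
    # Only runs when no valid checkpoint exists at the given path.
    sc = SparkContext(conf=SparkConf().setAppName("checkpointed-stream"))
    ssc = StreamingContext(sc, batchDuration=10)
    # ... define DStreams and output operations here ...
    ssc.checkpoint("/tmp/stream-checkpoint")
    return ssc

ssc = StreamingContext.getOrCreate("/tmp/stream-checkpoint", create_streaming_context)
ssc.start()
ssc.awaitTermination()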
Example #26
Source File: launcher.py    From spylon with BSD 3-Clause "New" or "Revised" License    5 votes
def spark_context(self, application_name):
        """Create a spark context given the parameters configured in this class.

        The caller is responsible for calling ``.close`` on the resulting spark context

        Parameters
        ----------
        application_name : string

        Returns
        -------
        sc : SparkContext
        """

        # initialize the spark configuration
        self._init_spark()
        import pyspark
        import pyspark.sql

        # initialize conf
        spark_conf = pyspark.SparkConf()
        for k, v in self._spark_conf_helper._conf_dict.items():
            spark_conf.set(k, v)

        log.info("Starting SparkContext")
        return pyspark.SparkContext(appName=application_name, conf=spark_conf) 
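
Because the docstring leaves shutdown to the caller, a usage sketch might look like the following; launcher stands in for whichever spylon configuration object exposes spark_context, and that name is an assumption rather than spylon's documented API.

# Hypothetical launcher object created elsewhere through spylon's configuration API.
sc = launcher.spark_context("my-analysis")
try:
    print(sc.parallelize(range(100)).sum())
finally:
    sc.stop()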
Example #27
Source File: conftest.py    From elephas with MIT License    5 votes
def sql_context(request):
    """ fixture for creating a Spark SQLContext
    Args:
        request: pytest.FixtureRequest object
    """
    conf = (SparkConf().setMaster("local[2]").setAppName(
        "pytest-pyspark-local-testing"))
    sc = SparkContext(conf=conf)
    sql_context = SQLContext(sc)
    request.addfinalizer(lambda: sc.stop())

    quiet_py4j()
    return sql_context 
Example #28
Source File: conftest.py    From elephas with MIT License    5 votes
def spark_context(request):
    """ fixture for creating a SparkContext
    Args:
        request: pytest.FixtureRequest object
    """
    conf = (SparkConf().setMaster("local[2]").setAppName(
        "pytest-pyspark-local-testing"))
    sc = SparkContext(conf=conf)
    request.addfinalizer(lambda: sc.stop())

    quiet_py4j()
    return sc 
Example #29
Source File: test_deeds.py    From cccatalog with MIT License    5 votes
def spark_context(request):
    conf = (SparkConf()
            .setMaster("spark://ec2-54-167-211-230.compute-1.amazonaws.com:7077")
            .setAppName("commonsmapper-pyspark-local-testing")
            .set("spark.jars", "../jars/hadoop-aws-2.8.1.jar,../jars/hadoop-auth-2.8.1.jar,../jars/aws-java-sdk-1.11.212.jar,../jars/postgresql-42.1.4.jar")
            .set("spark.driver.extraClassPath", "../jars/"))
    sc = SparkContext(conf=conf)
    sc._jsc.hadoopConfiguration().set("fs.s3n.awsAccessKeyId", os.environ['OPEN_LEDGER_ACCESS_KEY_ID'])
    sc._jsc.hadoopConfiguration().set("fs.s3n.awsSecretAccessKey", os.environ['OPEN_LEDGER_SECRET_ACCESS_KEY'])
    request.addfinalizer(lambda: sc.stop())
    return sc
Example #30
Source File: benchmark_spark.py    From implicit with MIT License    5 votes
def benchmark_spark(ratings, factors, iterations=5):
    conf = (SparkConf()
            .setAppName("implicit_benchmark")
            .setMaster('local[*]')
            .set('spark.driver.memory', '16G')
            )
    context = SparkContext(conf=conf)
    spark = SparkSession(context)

    times = {}
    try:
        ratings = convert_sparse_to_dataframe(spark, context, ratings)

        for rank in factors:
            als = ALS(rank=rank, maxIter=iterations,
                      alpha=1, implicitPrefs=True,
                      userCol="row", itemCol="col", ratingCol="data")
            start = time.time()
            als.fit(ratings)
            elapsed = time.time() - start
            times[rank] = elapsed / iterations
            print("spark. factors=%i took %.3f" % (rank, elapsed/iterations))
    finally:
        spark.stop()

    return times