Python pyspark.SparkContext() Examples

The following are 30 code examples of pyspark.SparkContext(). You can go to the original project or source file by following the links above each example. You may also want to check out all available functions and classes of the pyspark module, or try the search function.
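
As a quick orientation before the examples, here is a minimal, hedged sketch of the pattern most of them share: optionally build a SparkConf, construct a SparkContext, run a job, and stop the context when finished. The application name and master URL below are illustrative placeholders, not values taken from any specific example.

from pyspark import SparkConf, SparkContext

# Placeholder configuration; real applications set their own app name and master.
conf = SparkConf().setAppName("example-app").setMaster("local[2]")
sc = SparkContext(conf=conf)

try:
    rdd = sc.parallelize(range(10))
    print(rdd.map(lambda x: x * x).sum())  # prints 285
finally:
    sc.stop()  # release cluster resources when done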
Example #1
Source File: app.py    From integrations-core with BSD 3-Clause "New" or "Revised" License
def main():
    # Adapted from https://github.com/apache/spark/tree/master/examples/src/main/python/streaming
    sc = SparkContext(appName='PythonStreamingQueue')
    ssc = StreamingContext(sc, 1)

    # Create the queue through which RDDs can be pushed to
    # a QueueInputDStream
    rddQueue = []
    for _ in range(5):
        rddQueue += [ssc.sparkContext.parallelize([j for j in range(1, 1001)], 10)]

    # Create the QueueInputDStream and use it do some processing
    inputStream = ssc.queueStream(rddQueue)
    mappedStream = inputStream.map(lambda x: (x % 10, 1))
    reducedStream = mappedStream.reduceByKey(lambda a, b: a + b)
    reducedStream.pprint()

    ssc.start()
    time.sleep(6)
    ssc.stop(stopSparkContext=True, stopGraceFully=True) 
Example #2
Source File: spark_process.py    From dispel4py with Apache License 2.0
def run():
    from pyspark import SparkContext, SparkConf

    conf = SparkConf()
    conf.setAppName('dispel4py')
    conf.set("spark.storage.memoryFraction", "0.5")
    sc = SparkContext(
        conf=conf)

    from dispel4py.new import processor
    from dispel4py.utils import load_graph

    args = parse_args()

    graph = load_graph(args.module, args.attr)
    if graph is None:
        return
    graph.flatten()

    inputs = processor.create_inputs(args, graph)

    process(sc, graph, inputs=inputs, args=args) 
Example #3
Source File: bluecoat.py    From incubator-spot with Apache License 2.0
def bluecoat_parse(zk, topic, db, db_table, num_of_workers, batch_size):
    """
    Parse and save bluecoat logs.

    :param zk: Apache ZooKeeper quorum
    :param topic: Apache Kafka topic (application name)
    :param db: Apache Hive database to save into
    :param db_table: table of `db` to save into
    :param num_of_workers: number of Apache Kafka workers
    :param batch_size: batch size for Apache Spark streaming context
    """
    app_name = topic
    wrks = int(num_of_workers)

    # create spark context
    sc = SparkContext(appName=app_name)
    ssc = StreamingContext(sc, int(batch_size))
    sqc = HiveContext(sc)

    tp_stream = KafkaUtils.createStream(ssc, zk, app_name, {topic: wrks}, keyDecoder=spot_decoder, valueDecoder=spot_decoder)

    proxy_data = tp_stream.map(lambda row: row[1])\
        .flatMap(lambda row: row.split("\n"))\
        .filter(lambda row: rex_date.match(row))\
        .map(lambda row: row.strip("\n").strip("\r").replace("\t", " ").replace("  ", " "))\
        .map(lambda row: split_log_entry(row))\
        .map(lambda row: proxy_parser(row))
    saved_data = proxy_data.foreachRDD(lambda row: save_data(row, sqc, db, db_table, topic))
    ssc.start()
    ssc.awaitTermination() 
Example #4
Source File: common.py    From LearningApacheSpark with MIT License
def _py2java(sc, obj):
    """ Convert Python object into Java """
    if isinstance(obj, RDD):
        obj = _to_java_object_rdd(obj)
    elif isinstance(obj, DataFrame):
        obj = obj._jdf
    elif isinstance(obj, SparkContext):
        obj = obj._jsc
    elif isinstance(obj, list):
        obj = [_py2java(sc, x) for x in obj]
    elif isinstance(obj, JavaObject):
        pass
    elif isinstance(obj, (int, long, float, bool, bytes, unicode)):
        pass
    else:
        data = bytearray(PickleSerializer().dumps(obj))
        obj = sc._jvm.org.apache.spark.ml.python.MLSerDe.loads(data)
    return obj 
Example #5
Source File: cassandra_example.py    From pyspark-cassandra with Apache License 2.0
def main():
    if len(sys.argv) != 3:
        print >> sys.stderr, "Usage: example <keyspace_name> <column_family_name>"
        sys.exit(-1)

    keyspace_name = sys.argv[1]
    column_family_name = sys.argv[2]

    # Valid config options here https://github.com/datastax/spark-cassandra-connector/blob/master/doc/1_connecting.md
    conf = SparkConf().set("spark.cassandra.connection.host", "127.0.0.1")

    sc = SparkContext(appName="Spark + Cassandra Example",
                      conf=conf)

    # import time; time.sleep(30)
    java_import(sc._gateway.jvm, "com.datastax.spark.connector.CassandraJavaUtil")
    print sc._jvm.CassandraJavaUtil

    users = (
        ["Mike", "Sukmanowsky"],
        ["Andrew", "Montalenti"],
        ["Keith", "Bourgoin"],
    )
    rdd = sc.parallelize(users)
    print rdd.collect() 
Example #6
Source File: launcher.py    From spylon with BSD 3-Clause "New" or "Revised" License
def sql_context(self, application_name):
        """Create a spark context given the parameters configured in this class.

        The caller is responsible for calling ``.close`` on the resulting spark context

        Parameters
        ----------
        application_name : string

        Returns
        -------
        sc : SparkContext
        """
        sc = self.spark_context(application_name)
        import pyspark
        sqlContext = pyspark.SQLContext(sc)
        return (sc, sqlContext) 
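
A hedged usage sketch for the method above, assuming a configured spylon launcher object is available as `launcher` (that name and the application string are placeholders, not part of spylon's documented API):

sc, sqlContext = launcher.sql_context("my-analysis")  # hypothetical launcher instance
try:
    df = sqlContext.createDataFrame([(1, "a"), (2, "b")], ["id", "label"])
    print(df.count())  # 2
finally:
    sc.stop()  # the docstring leaves cleanup to the caller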
Example #7
Source File: conftest.py    From maggy with Apache License 2.0
def sc(request):
    """ fixture for creating a spark context
    Args:
        request: pytest.FixtureRequest object
    """

    assert (
        request.config.getoption("--spark-master") is not None
    ), 'No Spark Master Address provided, use --spark-master: "spark://host:port" '

    conf = (
        SparkConf()
        .setMaster(request.config.getoption("--spark-master"))
        .setAppName("pytest-pyspark-local-testing")
        .set("spark.dynamicAllocation.maxExecutors", 2)
        .set("spark.executor.instances", 2)
    )
    scont = SparkContext(conf=conf)
    request.addfinalizer(lambda: scont.stop())

    quiet_py4j()
    return scont 
Example #8
Source File: sparkline.py    From iLID with MIT License
def main(args):
  window_size = 600
  files = filecollector.collect(args.input_path)

  sc = SparkContext("local", "sparkline")
  pipeline = (
    sc.parallelize(files, 4)
    .map(lambda f: read_wav(f))
    .flatMap(lambda (f, signal, samplerate): sliding_audio(f, signal, samplerate))
    .map(lambda (f, signal, samplerate): downsample(f, signal, samplerate))
    .map(lambda (f, signal, samplerate): apply_melfilter(f, signal, samplerate))
    .map(lambda (f, image): (f, graphic.colormapping.to_grayscale(image, bytes=True)))
    .map(lambda (f, image): (f, graphic.histeq.histeq(image)))
    .map(lambda (f, image): (f, graphic.histeq.clamp_and_equalize(image)))
    .map(lambda (f, image): (f, graphic.windowing.cut_or_pad_window(image, window_size)))
    .map(lambda (f, image): output.image.save(f, image, args.output_path))
  )

  pipeline.collect()

#.map(lambda (f, signal, samplerate): generate_spectrograms(f, signal, samplerate)) 
Example #9
Source File: drybell_spark.py    From snorkel-tutorials with Apache License 2.0
def main(data_path, output_path):
    # Read data
    logging.info(f"Reading data from {data_path}")
    sc = SparkContext()
    sql = SQLContext(sc)
    data = sql.read.parquet(data_path)

    # Build label matrix
    logging.info("Applying LFs")
    lfs = [article_mentions_person, body_contains_fortune, person_in_db]
    applier = SparkLFApplier(lfs)
    L = applier.apply(data.rdd)

    # Train label model
    logging.info("Training label model")
    label_model = LabelModel(cardinality=2)
    label_model.fit(L)

    # Generate training labels
    logging.info("Generating probabilistic labels")
    y_prob = label_model.predict_proba(L)[:, 1]
    y_prob_sql_array = F.array([F.lit(y) for y in y_prob])
    data_labeled = data.withColumn("y_prob", y_prob_sql_array)
    data_labeled.write.mode("overwrite").parquet(output_path)
    logging.info(f"Labels saved to {output_path}") 
Example #10
Source File: finance_similarity.py    From Spark-in-Finance-Quantitative-Investing with Apache License 2.0
def create_sc():
    sc_conf = SparkConf()
    sc_conf.setAppName("finance-similarity-app")
    sc_conf.setMaster('spark://10.21.208.21:7077')
    sc_conf.set('spark.executor.memory', '2g')
    sc_conf.set('spark.executor.cores', '4')
    sc_conf.set('spark.cores.max', '40')
    sc_conf.set('spark.logConf', True)
    print sc_conf.getAll()

    sc = None
    try:
        sc.stop()
        sc = SparkContext(conf=sc_conf)
    except:
        sc = SparkContext(conf=sc_conf)

    return sc 
Example #11
Source File: testconfig.py    From SMV with Apache License 2.0
def sparkSession(cls):
        if not hasattr(cls, "spark"):
            # We can't use the SparkSession Builder here, since we need to call
            # Scala side's SmvTestHive.createContext to create the HiveTestContext's
            # SparkSession.
            # So we need to
            #   * Create a java_gateway
            #   * Create a SparkConf using the jgw (since without it SparkContext will ignore the given conf)
            #   * Create python SparkContext using the SparkConf (so we can specify the warehouse.dir)
            #   * Create Scala side HiveTestContext SparkSession
            #   * Create python SparkSession
            jgw = launch_gateway(None)
            jvm = jgw.jvm
            import tempfile
            import getpass
            hivedir = "file://{0}/{1}/smv_hive_test".format(tempfile.gettempdir(), getpass.getuser())
            sConf = SparkConf(False, _jvm=jvm).set("spark.sql.test", "")\
                                              .set("spark.sql.hive.metastore.barrierPrefixes",
                                                   "org.apache.spark.sql.hive.execution.PairSerDe")\
                                              .set("spark.sql.warehouse.dir", hivedir)\
                                              .set("spark.ui.enabled", "false")
            sc = SparkContext(master="local[1]", appName="SMV Python Test", conf=sConf, gateway=jgw).getOrCreate()
            jss = sc._jvm.org.apache.spark.sql.hive.test.SmvTestHive.createContext(sc._jsc.sc())
            cls.spark = SparkSession(sc, jss.sparkSession())
        return cls.spark 
Example #12
Source File: sparkcc.py    From cc-pyspark with MIT License
def run(self):
        self.args = self.parse_arguments()

        conf = SparkConf()

        if self.args.spark_profiler:
            conf = conf.set("spark.python.profile", "true")

        sc = SparkContext(
            appName=self.name,
            conf=conf)
        sqlc = SQLContext(sparkContext=sc)

        self.init_accumulators(sc)

        self.run_job(sc, sqlc)

        if self.args.spark_profiler:
            sc.show_profiles()

        sc.stop() 
Example #13
Source File: fixtures.py    From pytest-spark with MIT License
def _spark_session():
    """Internal fixture for SparkSession instance.

    Yields SparkSession instance if it is supported by the pyspark
    version, otherwise yields None.

    Required to correctly initialize `spark_context` fixture after
    `spark_session` fixture.

    ..note::
        It is not possible to create SparkSession from the existing
        SparkContext.
    """

    try:
        from pyspark.sql import SparkSession
    except ImportError:
        yield
    else:
        session = SparkSession.builder \
            .config(conf=SparkConfigBuilder().get()) \
            .getOrCreate()

        yield session
        session.stop() 
Example #14
Source File: construct.py    From bolt with Apache License 2.0
def _argcheck(*args, **kwargs):
        """
        Check that arguments are consistent with spark array construction.

        Conditions are:
        (1) a positional argument is a SparkContext
        (2) keyword arg 'context' is a SparkContext
        (3) an argument is a BoltArraySpark, or
        (4) an argument is a nested list containing a BoltArraySpark
        """
        try:
            from pyspark import SparkContext
        except ImportError:
            return False

        cond1 = any([isinstance(arg, SparkContext) for arg in args])
        cond2 = isinstance(kwargs.get('context', None), SparkContext)
        cond3 = any([isinstance(arg, BoltArraySpark) for arg in args])
        cond4 = any([any([isinstance(sub, BoltArraySpark) for sub in arg])
                     if isinstance(arg, (tuple, list)) else False for arg in args])
        return cond1 or cond2 or cond3 or cond4 
Example #15
Source File: fixtures.py    From pytest-spark with MIT License
def spark_context(_spark_session):
    """Return a SparkContext instance with reduced logging
    (session scope).
    """

    if _spark_session is None:
        from pyspark import SparkContext

        # pyspark 1.x: create SparkContext instance
        sc = SparkContext(conf=SparkConfigBuilder().get())
    else:
        # pyspark 2.x: get SparkContext from SparkSession fixture
        sc = _spark_session.sparkContext

    reduce_logging(sc)
    yield sc

    if _spark_session is None:
        sc.stop()  # pyspark 1.x: stop SparkContext instance 
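
For context, a hedged sketch of a test that consumes the `spark_context` fixture defined above; the test body and assertion are illustrative and not part of pytest-spark itself:

def test_word_count(spark_context):
    # pytest injects the SparkContext yielded by the fixture above
    rdd = spark_context.parallelize(["a", "b", "a"])
    counts = dict(rdd.map(lambda w: (w, 1)).reduceByKey(lambda a, b: a + b).collect())
    assert counts == {"a": 2, "b": 1}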
Example #16
Source File: spark_conf.py    From airflow-pipeline with Apache License 2.0
def get_spark_context(conf):
    """Get the spark context for submitting pyspark applications"""
    spark_context = None
    try:
        spark_context = SparkContext(conf=conf)

        from fncore.utils.zip_py_module import zip_py

        import fncore
        spark_context.addPyFile(zip_py(os.path.dirname(fncore.__file__)))
        import py2neo
        spark_context.addPyFile(zip_py(os.path.dirname(py2neo.__file__)))

        yield spark_context
    except:
        raise
    finally:
        if spark_context:
            spark_context.stop() 
Example #17
Source File: build.py    From sift with MIT License
def __call__(self):
        c = SparkConf().setAppName('Build %s' % self.model_name)

        log.info('Using spark master: %s', c.get('spark.master'))
        sc = SparkContext(conf=c)

        kwargs = self.model.prepare(sc)
        m = self.model.build(**kwargs)
        m = self.model.format_items(m)
        m = self.formatter(m)

        if self.output_path:
            log.info("Saving to: %s", self.output_path)
            if os.path.isdir(self.output_path):
                log.warn('Writing over output path: %s', self.output_path)
                shutil.rmtree(self.output_path)
            m.saveAsTextFile(self.output_path, 'org.apache.hadoop.io.compress.GzipCodec')
        elif self.sample > 0:
            print '\n'.join(str(i) for i in m.take(self.sample))

        log.info('Done.') 
Example #18
Source File: test.py    From TensorFlowOnSpark with Apache License 2.0
def setUpClass(cls):
    master = os.getenv('MASTER')
    assert master is not None, "Please start a Spark standalone cluster and export MASTER to your env."

    num_workers = os.getenv('SPARK_WORKER_INSTANCES')
    assert num_workers is not None, "Please export SPARK_WORKER_INSTANCES to your env."
    cls.num_workers = int(num_workers)

    spark_jars = os.getenv('SPARK_CLASSPATH')
    assert spark_jars, "Please add path to tensorflow/ecosystem/hadoop jar to SPARK_CLASSPATH."

    cls.conf = SparkConf().set('spark.jars', spark_jars)
    cls.sc = SparkContext(master, cls.__name__, conf=cls.conf)
    cls.spark = SparkSession.builder.getOrCreate() 
Example #19
Source File: testing.py    From sparkit-learn with Apache License 2.0
def setUp(self):
        class_name = self.__class__.__name__
        self.sc = SparkContext('local[2]', class_name)
        self.sc._jvm.System.setProperty("spark.ui.showConsoleProgress", "false")
        log4j = self.sc._jvm.org.apache.log4j
        log4j.LogManager.getRootLogger().setLevel(log4j.Level.FATAL) 
Example #20
Source File: conftest.py    From bolt with Apache License 2.0
def sc():
    from pyspark import SparkContext
    sc = SparkContext(appName="bolt-tests", master="local[2]")
    log4j = sc._jvm.org.apache.log4j
    log4j.LogManager.getRootLogger().setLevel(log4j.Level.ERROR)
    return sc 
Example #21
Source File: classification.py    From spark-cluster-deployment with Apache License 2.0
def _test():
    import doctest
    globs = globals().copy()
    globs['sc'] = SparkContext('local[4]', 'PythonTest', batchSize=2)
    (failure_count, test_count) = doctest.testmod(globs=globs, optionflags=doctest.ELLIPSIS)
    globs['sc'].stop()
    if failure_count:
        exit(-1) 
Example #22
Source File: _common.py    From spark-cluster-deployment with Apache License 2.0
def _test():
    import doctest
    globs = globals().copy()
    globs['sc'] = SparkContext('local[4]', 'PythonTest', batchSize=2)
    (failure_count, test_count) = doctest.testmod(globs=globs, optionflags=doctest.ELLIPSIS)
    globs['sc'].stop()
    if failure_count:
        exit(-1) 
Example #23
Source File: blizzard2012.py    From tacotron2 with BSD 3-Clause "New" or "Revised" License
def text_and_path_rdd(self, sc: SparkContext):
        return sc.parallelize(
            self._extract_all_text_and_path()) 
Example #24
Source File: conftest.py    From example_dataproc_twitter with MIT License
def spark_context():
    py_files = ['dataproc/jobs/base.py',
                'dataproc/jobs/dimsum.py'] 
    sc = pyspark.SparkContext(pyFiles=py_files)
    yield sc
    sc.stop() 
Example #25
Source File: run_jobs.py    From example_dataproc_twitter with MIT License
def main():
    alg = get_alg(sys.argv[1:]).algorithm
    if alg:
        job = JobsFactory._factor_alg(alg)()
        args = job.process_base_sysargs(
            [e for e in sys.argv[1:] if 'algorithm' not in e])
        with pyspark.SparkContext() as sc:
            job.run(sc, args) 
Example #26
Source File: run_jobs.py    From example_dataproc_twitter with MIT License
def main():
    alg = get_alg(sys.argv[1:]).algorithm
    if alg:
        job = JobsFactory._factor_alg(alg)()
        args = job.process_base_sysargs(
            [e for e in sys.argv[1:] if 'algorithm' not in e])
        with pyspark.SparkContext() as sc:
            job.run(sc, args) 
Example #27
Source File: listener.py    From incubator-spot with Apache License 2.0
def streaming_listener(**kwargs):
    '''
        Initialize the Spark job.
    '''
    Util.get_logger('SPOT.INGEST', kwargs.pop('log_level'))

    logger  = logging.getLogger('SPOT.INGEST.COMMON.LISTENER')
    logger.info('Initializing Spark Streaming Listener...')

    dbtable = '{0}.{1}'.format(kwargs.pop('database'), kwargs['type'])
    topic   = kwargs.pop('topic')

    sc      = SparkContext(appName=kwargs['app_name'] or topic)
    logger.info('Connect to Spark Cluster as job "{0}" and broadcast variables on it.'
        .format(kwargs.pop('app_name') or topic))
    ssc     = StreamingContext(sc, batchDuration=kwargs['batch_duration'])
    logger.info('Streaming data will be divided into batches of {0} seconds.'
        .format(kwargs.pop('batch_duration')))
    hsc     = HiveContext(sc)
    logger.info('Read Hive\'s configuration to integrate with data stored in it.')

    import pipelines
    module  = getattr(pipelines, kwargs.pop('type'))
    stream  = module.StreamPipeline(ssc, kwargs.pop('zkquorum'),
                kwargs.pop('group_id') or topic, { topic: int(kwargs.pop('partitions')) })

    schema  = stream.schema
    segtype = stream.segtype

    stream.dstream\
        .map(lambda x: module.StreamPipeline.parse(x))\
        .filter(lambda x: bool(x))\
        .foreachRDD(lambda x: store(x, hsc, dbtable, topic, schema, segtype))

    ssc.start()
    logger.info('Start the execution of the streams.')
    ssc.awaitTermination() 
Example #28
Source File: ljspeech.py    From tacotron2 with BSD 3-Clause "New" or "Revised" License
def text_and_path_rdd(self, sc: SparkContext):
        return sc.parallelize(
            self._extract_all_text_and_path()) 
Example #29
Source File: ClimatologySpark2.py    From incubator-sdap-nexus with Apache License 2.0
def configureSpark(sparkConfig, appName, memoryPerExecutor='4G', coresPerExecutor=1):
    mode, numExecutors, numPartitions = sparkConfig.split(',')
    numExecutors = int(numExecutors)
    print >> sys.stderr, 'numExecutors = ', numExecutors
    numPartitions = int(numPartitions)
    print >> sys.stderr, 'numPartitions = ', numPartitions
    if mode == 'multicore':
        print >> sys.stderr, 'Using pysparkling'
        import pysparkling
        sc = pysparkling.Context()
    else:
        print >> sys.stderr, 'Using PySpark'
        sparkMaster = mode
        spConf = SparkConf()
        spConf.setAppName(appName)
        spConf.set("spark.executorEnv.HOME",
                   os.path.join(os.getenv('HOME'), 'spark_exec_home'))
        spConf.set("spark.executorEnv.PYTHONPATH", os.getcwd())
        spConf.set("spark.executor.memory", memoryPerExecutor)
        print >> sys.stderr, 'memoryPerExecutor = ', memoryPerExecutor
        try:
            sparkMaster = SparkMasterOverride
        except:
            pass
        if sparkMaster[:5] == "mesos":
            spConf.set("spark.cores.max", numExecutors)
        else:
            # Spark master is YARN or local[N]
            spConf.set("spark.executor.instances", numExecutors)
            spConf.set("spark.executor.cores", coresPerExecutor)
            spConf.setMaster(sparkMaster)
        sc = SparkContext(conf=spConf)
    return sc, numExecutors, numPartitions 
Example #30
Source File: pixelStats.py    From incubator-sdap-nexus with Apache License 2.0
def pixelStats(urls, variable, nPartitions, timeFromFilename=TimeFromFilenameDOY, groupByKeys=GroupByKeys, accumulators=Accumulators,
               cachePath=CachePath, mode='dpark', modes=Modes):
    '''Compute a global (or regional) pixel mean field in parallel, given a list of URL's pointing to netCDF files.'''
    baseKey = groupByKeys[0]
    if baseKey == 'month':
        urlsByKey = splitByMonth(urls, timeFromFilename)
    else:
        print >>sys.stderr, 'pixelStats: Unrecognized groupByKey "%s".  Must be in %s' % (baseKey, str(groupByKeys))
        sys.exit(1)

    if mode == 'sequential':
        accum = [accumulate(u, variable, accumulators) for u in urlsByKey]
        merged = reduce(combine, accum)
        stats = statsFromAccumulators(merged)

    elif mode == 'dpark':
        import dpark
        urls = dpark.parallelize(urlsByKey, nPartitions)                          # returns RDD of URL lists
        accum = urls.map(lambda urls: accumulate(urls, variable, accumulators))   # returns RDD of stats accumulators
        merged = accum.reduce(combine)                                            # merged accumulators on head node
        stats = statsFromAccumulators(merged)                                     # compute final stats from accumulators

    elif mode == 'spark':
        from pyspark import SparkContext
        sc = SparkContext(appName="PixelStats")
        urls = sc.parallelize(urlsByKey, nPartitions)                             # returns RDD of URL lists
        accum = urls.map(lambda urls: accumulate(urls, variable, accumulators))   # returns RDD of stats accumulators
        merged = accum.reduce(combine)                                            # merged accumulators on head node
        stats = statsFromAccumulators(merged)                                     # compute final stats from accumulators

    else:
        stats = None
        if mode not in modes:
            print >>sys.stderr, 'pixelStats: Unrecognized mode  "%s".  Must be in %s' % (mode, str(modes))
            sys.exit(1)
    return stats