Python pyspark.streaming.kafka.KafkaUtils.createStream() Examples

The following code examples show how to use pyspark.streaming.kafka.KafkaUtils.createStream(). They are taken from open source Python projects.
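
Before the project excerpts, here is a minimal, self-contained sketch of the call. It uses the ZooKeeper-based receiver API available in Spark 2.x and earlier (the pyspark.streaming.kafka module was removed in Spark 3.0); the host, consumer group, and topic name below are placeholders, not values taken from any of the examples.

from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils

# Placeholder values: a local ZooKeeper quorum, an arbitrary consumer group and topic.
sc = SparkContext(appName="KafkaCreateStreamSketch")
ssc = StreamingContext(sc, 5)  # 5-second batch interval

# createStream(ssc, zkQuorum, groupId, {topic: numReceiverThreads})
stream = KafkaUtils.createStream(ssc, "localhost:2181", "sketch-consumer", {"my-topic": 1})
stream.map(lambda kv: kv[1]).pprint()  # each record is a (key, value) pair

ssc.start()
ssc.awaitTermination()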

Example 1
Project: apache-spot-60-min   Author: hdulay   File: bluecoat.py    Apache License 2.0    7 votes
def bluecoat_parse(zk,topic,db,db_table,num_of_workers,batch_size):
    
    app_name = topic
    wrks = int(num_of_workers)

    # create spark context
    sc = SparkContext(appName=app_name)
    ssc = StreamingContext(sc,int(batch_size))
    sqc = HiveContext(sc)

    tp_stream = KafkaUtils.createStream(ssc, zk, app_name, {topic: wrks}, keyDecoder=spot_decoder, valueDecoder=spot_decoder)

    proxy_data = tp_stream.map(lambda row: row[1])\
        .flatMap(lambda row: row.split("\n"))\
        .filter(lambda row: rex_date.match(row))\
        .map(lambda row: row.strip("\n").strip("\r").replace("\t", " ").replace("  ", " "))\
        .map(lambda row:  split_log_entry(row))\
        .filter(lambda row: len(row) > 23)\
        .map(lambda row: proxy_parser(row))

    saved_data = proxy_data.foreachRDD(lambda row: save_data(row,sqc,db,db_table,topic))
    ssc.start()
    ssc.awaitTermination() 
Example 2
Project: incubator-spot   Author: apache   File: streaming.py    Apache License 2.0    5 votes
def __init__(self, ssc, zkQuorum, groupId, topics):
        from common.serializer       import deserialize
        from pyspark.streaming.kafka import KafkaUtils

        self.__dstream = KafkaUtils.createStream(ssc, zkQuorum, groupId, topics,
                        keyDecoder=lambda x: x, valueDecoder=deserialize) 
Example 3
Project: incubator-spot   Author: apache   File: streaming.py    Apache License 2.0    5 votes
def __init__(self, ssc, zkQuorum, groupId, topics):
        from common.serializer       import deserialize
        from pyspark.streaming.kafka import KafkaUtils

        self.__dstream = KafkaUtils.createStream(ssc, zkQuorum, groupId, topics,
                            keyDecoder=lambda x: x, valueDecoder=deserialize) 
Example 4
Project: incubator-spot   Author: apache   File: bluecoat.py    Apache License 2.0    5 votes
def bluecoat_parse(zk, topic, db, db_table, num_of_workers, batch_size):
    """
    Parse and save bluecoat logs.

    :param zk: Apache ZooKeeper quorum
    :param topic: Apache Kafka topic (application name)
    :param db: Apache Hive database to save into
    :param db_table: table of `db` to save into
    :param num_of_workers: number of Apache Kafka workers
    :param batch_size: batch size for Apache Spark streaming context
    """
    app_name = topic
    wrks = int(num_of_workers)

    # create spark context
    sc = SparkContext(appName=app_name)
    ssc = StreamingContext(sc, int(batch_size))
    sqc = HiveContext(sc)

    tp_stream = KafkaUtils.createStream(ssc, zk, app_name, {topic: wrks}, keyDecoder=spot_decoder, valueDecoder=spot_decoder)

    proxy_data = tp_stream.map(lambda row: row[1])\
        .flatMap(lambda row: row.split("\n"))\
        .filter(lambda row: rex_date.match(row))\
        .map(lambda row: row.strip("\n").strip("\r").replace("\t", " ").replace("  ", " "))\
        .map(lambda row: split_log_entry(row))\
        .map(lambda row: proxy_parser(row))
    saved_data = proxy_data.foreachRDD(lambda row: save_data(row, sqc, db, db_table, topic))
    ssc.start()
    ssc.awaitTermination() 
Example 5
Project: incubator-spot   Author: apache   File: streaming.py    Apache License 2.0    5 votes
def __init__(self, ssc, zkQuorum, groupId, topics):
        from common.serializer       import deserialize
        from pyspark.streaming.kafka import KafkaUtils

        self.__dstream = KafkaUtils.createStream(ssc, zkQuorum, groupId, topics,
                            keyDecoder=lambda x: x, valueDecoder=deserialize) 
Example 6
Project: LearningApacheSpark   Author: runawayhorse001   File: tests.py    MIT License    5 votes
def test_kafka_stream(self):
        """Test the Python Kafka stream API."""
        topic = self._randomTopic()
        sendData = {"a": 3, "b": 5, "c": 10}

        self._kafkaTestUtils.createTopic(topic)
        self._kafkaTestUtils.sendMessages(topic, sendData)

        stream = KafkaUtils.createStream(self.ssc, self._kafkaTestUtils.zkAddress(),
                                         "test-streaming-consumer", {topic: 1},
                                         {"auto.offset.reset": "smallest"})
        self._validateStreamResult(sendData, stream) 
Example 7
Project: LearningApacheSpark   Author: runawayhorse001   File: tests.py    MIT License    5 votes
def _startContext(self, n, compressed):
        # Start the StreamingContext and also collect the result
        dstream = FlumeUtils.createStream(self.ssc, "localhost", self._utils.getTestPort(),
                                          enableDecompression=compressed)
        result = []

        def get_output(_, rdd):
            for event in rdd.collect():
                if len(result) < n:
                    result.append(event)
        dstream.foreachRDD(get_output)
        self.ssc.start()
        return result 
Example 8
Project: LearningApacheSpark   Author: runawayhorse001   File: tests.py    MIT License    5 votes
def test_kinesis_stream_api(self):
        # Don't start the StreamingContext because we cannot test it in Jenkins
        kinesisStream1 = KinesisUtils.createStream(
            self.ssc, "myAppNam", "mySparkStream",
            "https://kinesis.us-west-2.amazonaws.com", "us-west-2",
            InitialPositionInStream.LATEST, 2, StorageLevel.MEMORY_AND_DISK_2)
        kinesisStream2 = KinesisUtils.createStream(
            self.ssc, "myAppNam", "mySparkStream",
            "https://kinesis.us-west-2.amazonaws.com", "us-west-2",
            InitialPositionInStream.LATEST, 2, StorageLevel.MEMORY_AND_DISK_2,
            "awsAccessKey", "awsSecretKey") 
Example 9
Project: ts-engine   Author: bwsw   File: input_module.py    Apache License 2.0    5 votes
def __init__(self, config, file_config):

        self._server = config.content["input"]["options"]["server"]
        self._port = config.content["input"]["options"]["port"]
        self._topic = config.content["input"]["options"]["topic"]
        self._consumer_group = config.content["input"]["options"]["consumer_group"]
        self._batchDuration = config.content["input"]["options"]["batchDuration"]
        self._sep = config.content["input"]["options"]["sep"]

        self._spark = SparkSession.builder.appName("StreamingDataKafka").getOrCreate()
        sc = self._spark.sparkContext

        # database files registration
        for name, file in config.content["databases"].items():
            sc.addFile(file)

        # configuration file registration
        sc.addFile(file_config)

        self._ssc = StreamingContext(sc, self._batchDuration)

        list_conversion_function = list(map(lambda x: type_to_func(x.dataType), config.data_structure_pyspark))
        ranked_pointer = list(enumerate(list_conversion_function))
        functions_list = list(map(lambda x: lambda list_string: x[1](list_string[x[0]]), ranked_pointer))
        function_convert = lambda x: list(map(lambda func: func(x), functions_list))
        try:
            dstream = KafkaUtils.createStream(
                self._ssc,
                "{0}:{1}".format(self._server, self._port),
                self._consumer_group,
                {self._topic: 1})
            self._dstream = dstream.map(lambda x: function_convert(x[1].split(",")))
        except Exception:
            raise KafkaConnectError("Kafka error: Connection refused: server={} port={} consumer_group={} topic={}".
                                    format(self._server, self._port, self._consumer_group, self._topic)) 
Example 10
Project: sahara-tests   Author: openstack   File: spark-kafka-example.py    Apache License 2.0    5 votes
def main():
    if len(sys.argv) != 4:
        print("Usage: kafka_wordcount.py <zk> <topic> <timeout>",
              file=sys.stderr)
        exit(-1)

    sc = SparkContext(appName="PythonStreamingKafkaWordCount")
    ssc = StreamingContext(sc, 1)
    timeout = None
    if len(sys.argv) == 4:
        zk, topic, timeout = sys.argv[1:]
        timeout = int(timeout)
    else:
        zk, topic = sys.argv[1:]
    kvs = KafkaUtils.createStream(
        ssc, zk, "spark-streaming-consumer", {topic: 1})
    lines = kvs.map(lambda x: x[1])
    counts = lines.flatMap(lambda line: line.split(" ")) \
                  .map(lambda word: (word, 1)) \
                  .reduceByKey(lambda a, b: a + b)
    counts.pprint()
    kwargs = {}
    if timeout:
        kwargs['timeout'] = timeout
    ssc.start()
    ssc.awaitTermination(**kwargs) 
Example 11
Project: LearningApacheSpark   Author: runawayhorse001   File: tests.py    MIT License    4 votes
def test_kinesis_stream(self):
        if not are_kinesis_tests_enabled:
            sys.stderr.write(
                "Skipped test_kinesis_stream (enable by setting environment variable %s=1"
                % kinesis_test_environ_var)
            return

        import random
        kinesisAppName = ("KinesisStreamTests-%d" % abs(random.randint(0, 10000000)))
        kinesisTestUtils = self.ssc._jvm.org.apache.spark.streaming.kinesis.KinesisTestUtils(2)
        try:
            kinesisTestUtils.createStream()
            aWSCredentials = kinesisTestUtils.getAWSCredentials()
            stream = KinesisUtils.createStream(
                self.ssc, kinesisAppName, kinesisTestUtils.streamName(),
                kinesisTestUtils.endpointUrl(), kinesisTestUtils.regionName(),
                InitialPositionInStream.LATEST, 10, StorageLevel.MEMORY_ONLY,
                aWSCredentials.getAWSAccessKeyId(), aWSCredentials.getAWSSecretKey())

            outputBuffer = []

            def get_output(_, rdd):
                for e in rdd.collect():
                    outputBuffer.append(e)

            stream.foreachRDD(get_output)
            self.ssc.start()

            testData = [i for i in range(1, 11)]
            expectedOutput = set([str(i) for i in testData])
            start_time = time.time()
            while time.time() - start_time < 120:
                kinesisTestUtils.pushData(testData)
                if expectedOutput == set(outputBuffer):
                    break
                time.sleep(10)
            self.assertEqual(expectedOutput, set(outputBuffer))
        except:
            import traceback
            traceback.print_exc()
            raise
        finally:
            self.ssc.stop(False)
            kinesisTestUtils.deleteStream()
            kinesisTestUtils.deleteDynamoDBTable(kinesisAppName)
