Python pyspark.streaming.kafka.KafkaUtils.createStream() Examples

The following are 8 code examples of pyspark.streaming.kafka.KafkaUtils.createStream(), collected from open-source projects. The source file, project, and license are noted above each example. Examples #6 through #8 exercise the analogous FlumeUtils.createStream() and KinesisUtils.createStream() helpers (from pyspark.streaming.flume and pyspark.streaming.kinesis), which follow the same receiver-based pattern.
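
Before the project examples, here is a minimal, self-contained sketch of the receiver-based API. It assumes a Spark 1.x/2.x environment with the spark-streaming-kafka-0-8 package on the classpath (this Kafka 0.8 integration was deprecated in Spark 2.3 and removed in Spark 3.0); the ZooKeeper address, consumer group, and topic name below are placeholders, not values taken from the examples that follow.

from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils

# Local context with 2-second micro-batches; the app name is arbitrary.
sc = SparkContext(appName="kafka-createstream-sketch")
ssc = StreamingContext(sc, 2)

# createStream(ssc, zkQuorum, groupId, topics): `topics` maps each topic name
# to the number of receiver threads consuming it. Keys and values are UTF-8
# decoded by default (override with keyDecoder/valueDecoder).
stream = KafkaUtils.createStream(
    ssc,
    "localhost:2181",         # ZooKeeper quorum (placeholder)
    "sketch-consumer-group",  # Kafka consumer group id (placeholder)
    {"events": 1})            # topic -> receiver thread count (placeholder)

# Each record is a (key, value) pair; print the values of every batch.
stream.map(lambda kv: kv[1]).pprint()

ssc.start()
ssc.awaitTermination()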
Example #1
Source File: bluecoat.py    From incubator-spot with Apache License 2.0
# Imports required by this snippet; project helpers such as spot_decoder,
# rex_date, split_log_entry, proxy_parser and save_data are defined elsewhere
# in the incubator-spot module.
from pyspark import SparkContext
from pyspark.sql import HiveContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils


def bluecoat_parse(zk, topic, db, db_table, num_of_workers, batch_size):
    """
    Parse and save bluecoat logs.

    :param zk: Apache ZooKeeper quorum
    :param topic: Apache Kafka topic (application name)
    :param db: Apache Hive database to save into
    :param db_table: table of `db` to save into
    :param num_of_workers: number of Apache Kafka workers
    :param batch_size: batch size for Apache Spark streaming context
    """
    app_name = topic
    wrks = int(num_of_workers)

    # create spark context
    sc = SparkContext(appName=app_name)
    ssc = StreamingContext(sc, int(batch_size))
    sqc = HiveContext(sc)

    # Receiver-based Kafka DStream: one receiver for `topic` with `wrks` consumer threads.
    tp_stream = KafkaUtils.createStream(ssc, zk, app_name, {topic: wrks},
                                        keyDecoder=spot_decoder, valueDecoder=spot_decoder)

    proxy_data = (
        tp_stream.map(lambda row: row[1])                # keep only the message value
        .flatMap(lambda row: row.split("\n"))            # one record per log line
        .filter(lambda row: rex_date.match(row))         # keep lines matching the date regex
        .map(lambda row: row.strip("\n").strip("\r").replace("\t", " ").replace("  ", " "))
        .map(lambda row: split_log_entry(row))           # tokenize the log entry
        .map(lambda row: proxy_parser(row)))             # build the structured proxy record
    proxy_data.foreachRDD(lambda row: save_data(row, sqc, db, db_table, topic))
    ssc.start()
    ssc.awaitTermination() 
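
Given the docstring above, a hypothetical invocation of bluecoat_parse could look like the following; every argument value here is an illustrative placeholder, not a setting taken from incubator-spot.

# Hypothetical call; all values are placeholders.
bluecoat_parse(
    zk="zk-host:2181",       # Apache ZooKeeper quorum
    topic="bluecoat_proxy",  # Kafka topic, also used as the Spark app name
    db="spotdb",             # Hive database to write into
    db_table="proxy_logs",   # Hive table inside `db`
    num_of_workers=4,        # receiver threads for the topic
    batch_size=30)           # streaming batch interval, in seconds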
Example #2
Source File: streaming.py    From incubator-spot with Apache License 2.0
def __init__(self, ssc, zkQuorum, groupId, topics):
        from common.serializer       import deserialize
        from pyspark.streaming.kafka import KafkaUtils

        self.__dstream = KafkaUtils.createStream(ssc, zkQuorum, groupId, topics,
                        keyDecoder=lambda x: x, valueDecoder=deserialize) 
Example #3
Source File: streaming.py    From incubator-spot with Apache License 2.0
def __init__(self, ssc, zkQuorum, groupId, topics):
        from common.serializer       import deserialize
        from pyspark.streaming.kafka import KafkaUtils

        self.__dstream = KafkaUtils.createStream(ssc, zkQuorum, groupId, topics,
                            keyDecoder=lambda x: x, valueDecoder=deserialize) 
Example #4
Source File: streaming.py    From incubator-spot with Apache License 2.0
def __init__(self, ssc, zkQuorum, groupId, topics):
        from common.serializer       import deserialize
        from pyspark.streaming.kafka import KafkaUtils

        self.__dstream = KafkaUtils.createStream(ssc, zkQuorum, groupId, topics,
                            keyDecoder=lambda x: x, valueDecoder=deserialize) 
Example #5
Source File: tests.py    From LearningApacheSpark with MIT License
def test_kafka_stream(self):
        """Test the Python Kafka stream API."""
        topic = self._randomTopic()
        sendData = {"a": 3, "b": 5, "c": 10}

        self._kafkaTestUtils.createTopic(topic)
        self._kafkaTestUtils.sendMessages(topic, sendData)

        stream = KafkaUtils.createStream(self.ssc, self._kafkaTestUtils.zkAddress(),
                                         "test-streaming-consumer", {topic: 1},
                                         {"auto.offset.reset": "smallest"})
        self._validateStreamResult(sendData, stream) 
Example #6
Source File: tests.py    From LearningApacheSpark with MIT License
def _startContext(self, n, compressed):
        # Start the StreamingContext and also collect the result
        dstream = FlumeUtils.createStream(self.ssc, "localhost", self._utils.getTestPort(),
                                          enableDecompression=compressed)
        result = []

        def get_output(_, rdd):
            for event in rdd.collect():
                if len(result) < n:
                    result.append(event)
        dstream.foreachRDD(get_output)
        self.ssc.start()
        return result 
Example #7
Source File: tests.py    From LearningApacheSpark with MIT License
def test_kinesis_stream_api(self):
        # Don't start the StreamingContext because we cannot test it in Jenkins
        kinesisStream1 = KinesisUtils.createStream(
            self.ssc, "myAppNam", "mySparkStream",
            "https://kinesis.us-west-2.amazonaws.com", "us-west-2",
            InitialPositionInStream.LATEST, 2, StorageLevel.MEMORY_AND_DISK_2)
        kinesisStream2 = KinesisUtils.createStream(
            self.ssc, "myAppNam", "mySparkStream",
            "https://kinesis.us-west-2.amazonaws.com", "us-west-2",
            InitialPositionInStream.LATEST, 2, StorageLevel.MEMORY_AND_DISK_2,
            "awsAccessKey", "awsSecretKey") 
Example #8
Source File: tests.py    From LearningApacheSpark with MIT License
def test_kinesis_stream(self):
        if not are_kinesis_tests_enabled:
            sys.stderr.write(
                "Skipped test_kinesis_stream (enable by setting environment variable %s=1"
                % kinesis_test_environ_var)
            return

        import random
        kinesisAppName = ("KinesisStreamTests-%d" % abs(random.randint(0, 10000000)))
        kinesisTestUtils = self.ssc._jvm.org.apache.spark.streaming.kinesis.KinesisTestUtils(2)
        try:
            kinesisTestUtils.createStream()
            aWSCredentials = kinesisTestUtils.getAWSCredentials()
            stream = KinesisUtils.createStream(
                self.ssc, kinesisAppName, kinesisTestUtils.streamName(),
                kinesisTestUtils.endpointUrl(), kinesisTestUtils.regionName(),
                InitialPositionInStream.LATEST, 10, StorageLevel.MEMORY_ONLY,
                aWSCredentials.getAWSAccessKeyId(), aWSCredentials.getAWSSecretKey())

            outputBuffer = []

            def get_output(_, rdd):
                for e in rdd.collect():
                    outputBuffer.append(e)

            stream.foreachRDD(get_output)
            self.ssc.start()

            testData = [i for i in range(1, 11)]
            expectedOutput = set([str(i) for i in testData])
            start_time = time.time()
            while time.time() - start_time < 120:
                kinesisTestUtils.pushData(testData)
                if expectedOutput == set(outputBuffer):
                    break
                time.sleep(10)
            self.assertEqual(expectedOutput, set(outputBuffer))
        except:
            import traceback
            traceback.print_exc()
            raise
        finally:
            self.ssc.stop(False)
            kinesisTestUtils.deleteStream()
            kinesisTestUtils.deleteDynamoDBTable(kinesisAppName)

