Python pyspark.streaming.kafka.KafkaUtils.createDirectStream() Examples

The following are code examples showing how to use pyspark.streaming.kafka.KafkaUtils.createDirectStream(), taken from open source Python projects.
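
Before the project examples, here is a minimal, self-contained sketch of the basic call pattern. The broker address "localhost:9092" and the topic name "test" are placeholders, and the snippet assumes the old spark-streaming-kafka-0-8 connector that provides pyspark.streaming.kafka:

from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils

sc = SparkContext(appName="DirectStreamSketch")
ssc = StreamingContext(sc, 2)  # 2-second batch interval

# createDirectStream returns a DStream of (key, value) tuples, one per Kafka message.
stream = KafkaUtils.createDirectStream(
    ssc, ["test"], {"metadata.broker.list": "localhost:9092"})

# Count the messages in each batch; the message payload is the second element.
stream.map(lambda kv: kv[1]).count().pprint()

ssc.start()
ssc.awaitTermination()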

Example 1
Project: LearningApacheSpark   Author: runawayhorse001   File: tests.py    MIT License
def test_kafka_direct_stream_foreach_get_offsetRanges(self):
        """Test the Python direct Kafka stream foreachRDD get offsetRanges."""
        topic = self._randomTopic()
        sendData = {"a": 1, "b": 2, "c": 3}
        kafkaParams = {"metadata.broker.list": self._kafkaTestUtils.brokerAddress(),
                       "auto.offset.reset": "smallest"}

        self._kafkaTestUtils.createTopic(topic)
        self._kafkaTestUtils.sendMessages(topic, sendData)

        stream = KafkaUtils.createDirectStream(self.ssc, [topic], kafkaParams)

        offsetRanges = []

        def getOffsetRanges(_, rdd):
            for o in rdd.offsetRanges():
                offsetRanges.append(o)

        stream.foreachRDD(getOffsetRanges)
        self.ssc.start()
        self.wait_for(offsetRanges, 1)

        self.assertEqual(offsetRanges, [OffsetRange(topic, 0, long(0), long(6))]) 
Example 2
Project: sunbird-ml-workbench   Author: project-sunbird   File: io.py    MIT License
def read(self, app_name, read_from_topic, groupID, offset_reset, session_timeout, auto_commit_enable, batch_duration):
        self.read_from_topic = read_from_topic
        self.groupID = groupID
        self.app_name = app_name
        self.offset_reset = offset_reset
        self.session_timeout = session_timeout
        self.auto_commit_enable = auto_commit_enable
        self.batch_duration = batch_duration
        
        self.kafkaParams = {"metadata.broker.list": self.kafka_broker}
        self.kafkaParams["auto.offset.reset"] = self.offset_reset
        self.kafkaParams["enable.auto.commit"] = self.auto_commit_enable
        
        if self.connection_established:
            # Check that the topic to consume actually exists on the broker
            if self.read_from_topic in self.server_topics.keys():
                sc = SparkContext(appName=self.app_name)
                ssc = StreamingContext(sc, self.batch_duration)

                kvs = KafkaUtils.createDirectStream(ssc, [self.read_from_topic], self.kafkaParams)
                kvs.foreachRDD(handler)

                ssc.start()
                ssc.awaitTermination() 
Example 3
Project: kafka-compose   Author: ksindi   File: process.py    MIT License
def create_context():
    spark = get_session(SPARK_CONF)
    ssc = StreamingContext(spark.sparkContext, BATCH_DURATION)
    ssc.checkpoint(CHECKPOINT)
    # Start offsets from the beginning of each partition.
    # This is ignored when the context is recovered from a checkpoint.
    offsets = {TopicAndPartition(topic, 0): 0 for topic in TOPICS}
    stream = KafkaUtils.createDirectStream(ssc, TOPICS, KAFKA_PARAMS, offsets)
    main(stream)
    return ssc 
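
A factory like create_context is typically passed to StreamingContext.getOrCreate, so that an existing checkpoint is recovered instead of rebuilding the context. A minimal driver sketch, reusing the CHECKPOINT constant from the example above, might look like:

# Recover from CHECKPOINT if present, otherwise build a fresh context via create_context.
ssc = StreamingContext.getOrCreate(CHECKPOINT, create_context)
ssc.start()
ssc.awaitTermination()
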
Example 4
Project: Graph-Embeddings-for-Recommender-Systems   Author: intellectape   File: twitterStream.py    GNU General Public License v3.0
def stream(ssc, pwords, nwords, duration):
    kstream = KafkaUtils.createDirectStream(
        ssc, topics=['twitterstream'],
        kafkaParams={"metadata.broker.list": 'localhost:9092'})
    tweets = kstream.map(lambda x: x[1].encode("ascii", "ignore"))

    tweets.pprint()
    # Each element of tweets is the text of a tweet.
    # We keep a running total of the counts and print it at every time step.
    words = tweets.flatMap(lambda line: line.split(" "))
    positive = words.map(lambda word: ('Positive', 1) if word in pwords else ('Positive', 0))
    negative = words.map(lambda word: ('Negative', 1) if word in nwords else ('Negative', 0))
    allSentiments = positive.union(negative)
    sentimentCounts = allSentiments.reduceByKey(lambda x,y: x+y)
    runningSentimentCounts = sentimentCounts.updateStateByKey(updateFunction)
    runningSentimentCounts.pprint()
    
    # The counts variable holds the word counts for all time steps
    counts = []
    sentimentCounts.foreachRDD(lambda t, rdd: counts.append(rdd.collect()))
    
    # Start the computation
    ssc.start() 
    ssc.awaitTerminationOrTimeout(duration)
    ssc.stop(stopGraceFully=True)

    return counts 
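
The updateFunction passed to updateStateByKey is not shown in this example. A minimal running-total updater, sketched here as an assumption rather than the project's actual code, could look like:

# Hypothetical state updater for updateStateByKey.
# new_values holds the counts for a key in the current batch;
# running_count is the previous state (None on the first batch).
def updateFunction(new_values, running_count):
    return sum(new_values) + (running_count or 0)
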
Example 5
Project: monasca-transform   Author: openstack   File: mon_metrics_kafka.py    Apache License 2.0
def get_kafka_stream(topic, streaming_context):
        offset_specifications = simport.load(cfg.CONF.repositories.offsets)()
        app_name = streaming_context.sparkContext.appName
        saved_offset_spec = offset_specifications.get_kafka_offsets(app_name)
        if len(saved_offset_spec) < 1:

            MonMetricsKafkaProcessor.log_debug(
                "No saved offsets available..."
                "connecting to kafka without specifying offsets")
            kvs = KafkaUtils.createDirectStream(
                streaming_context, [topic],
                {"metadata.broker.list": cfg.CONF.messaging.brokers})

            return kvs

        else:
            from_offsets = {}
            for key, value in saved_offset_spec.items():
                if key.startswith("%s_%s" % (app_name, topic)):
                    # spec_app_name = value.get_app_name()
                    spec_topic = value.get_topic()
                    spec_partition = int(value.get_partition())
                    # spec_from_offset = value.get_from_offset()
                    spec_until_offset = value.get_until_offset()
                    # composite_key = "%s_%s_%s" % (spec_app_name,
                    #                               spec_topic,
                    #                               spec_partition)
                    # partition = saved_offset_spec[composite_key]
                    from_offsets[
                        TopicAndPartition(spec_topic, spec_partition)
                    ] = int(spec_until_offset)

            MonMetricsKafkaProcessor.log_debug(
                "get_kafka_stream: calling createDirectStream :"
                " topic:{%s} : start " % topic)
            for key, value in from_offsets.items():
                MonMetricsKafkaProcessor.log_debug(
                    "get_kafka_stream: calling createDirectStream : "
                    "offsets : TopicAndPartition:{%s,%s}, value:{%s}" %
                    (str(key._topic), str(key._partition), str(value)))
            MonMetricsKafkaProcessor.log_debug(
                "get_kafka_stream: calling createDirectStream : "
                "topic:{%s} : done" % topic)

            kvs = KafkaUtils.createDirectStream(
                streaming_context, [topic],
                {"metadata.broker.list": cfg.CONF.messaging.brokers},
                from_offsets)
            return kvs 
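
The example above only restores offsets; the counterpart is saving them after each batch so a later run can resume. A rough sketch of that step, applied to the returned stream (save_offsets is a hypothetical persistence helper standing in for whatever store monasca-transform actually uses), could be:

def save_current_offsets(rdd):
    # Persist each partition's until-offset; save_offsets is a placeholder for the real store.
    for o in rdd.offsetRanges():
        save_offsets(o.topic, o.partition, o.untilOffset)

kvs.foreachRDD(save_current_offsets)
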
Example 6
Project: LearningApacheSpark   Author: runawayhorse001   File: tests.py    MIT License
def test_kafka_direct_stream(self):
        """Test the Python direct Kafka stream API."""
        topic = self._randomTopic()
        sendData = {"a": 1, "b": 2, "c": 3}
        kafkaParams = {"metadata.broker.list": self._kafkaTestUtils.brokerAddress(),
                       "auto.offset.reset": "smallest"}

        self._kafkaTestUtils.createTopic(topic)
        self._kafkaTestUtils.sendMessages(topic, sendData)

        stream = KafkaUtils.createDirectStream(self.ssc, [topic], kafkaParams)
        self._validateStreamResult(sendData, stream) 
Example 7
Project: LearningApacheSpark   Author: runawayhorse001   File: tests.py    MIT License
def test_kafka_direct_stream_from_offset(self):
        """Test the Python direct Kafka stream API with start offset specified."""
        topic = self._randomTopic()
        sendData = {"a": 1, "b": 2, "c": 3}
        fromOffsets = {TopicAndPartition(topic, 0): long(0)}
        kafkaParams = {"metadata.broker.list": self._kafkaTestUtils.brokerAddress()}

        self._kafkaTestUtils.createTopic(topic)
        self._kafkaTestUtils.sendMessages(topic, sendData)

        stream = KafkaUtils.createDirectStream(self.ssc, [topic], kafkaParams, fromOffsets)
        self._validateStreamResult(sendData, stream) 
Example 8
Project: LearningApacheSpark   Author: runawayhorse001   File: tests.py    MIT License
def test_kafka_direct_stream_transform_get_offsetRanges(self):
        """Test the Python direct Kafka stream transform get offsetRanges."""
        topic = self._randomTopic()
        sendData = {"a": 1, "b": 2, "c": 3}
        kafkaParams = {"metadata.broker.list": self._kafkaTestUtils.brokerAddress(),
                       "auto.offset.reset": "smallest"}

        self._kafkaTestUtils.createTopic(topic)
        self._kafkaTestUtils.sendMessages(topic, sendData)

        stream = KafkaUtils.createDirectStream(self.ssc, [topic], kafkaParams)

        offsetRanges = []

        def transformWithOffsetRanges(rdd):
            for o in rdd.offsetRanges():
                offsetRanges.append(o)
            return rdd

        # Test whether it is ok mixing KafkaTransformedDStream and TransformedDStream together,
        # only the TransformedDstreams can be folded together.
        stream.transform(transformWithOffsetRanges).map(lambda kv: kv[1]).count().pprint()
        self.ssc.start()
        self.wait_for(offsetRanges, 1)

        self.assertEqual(offsetRanges, [OffsetRange(topic, 0, long(0), long(6))]) 
Example 9
Project: LearningApacheSpark   Author: runawayhorse001   File: tests.py    MIT License
def test_kafka_direct_stream_transform_with_checkpoint(self):
        """Test the Python direct Kafka stream transform with checkpoint correctly recovered."""
        topic = self._randomTopic()
        sendData = {"a": 1, "b": 2, "c": 3}
        kafkaParams = {"metadata.broker.list": self._kafkaTestUtils.brokerAddress(),
                       "auto.offset.reset": "smallest"}

        self._kafkaTestUtils.createTopic(topic)
        self._kafkaTestUtils.sendMessages(topic, sendData)

        offsetRanges = []

        def transformWithOffsetRanges(rdd):
            for o in rdd.offsetRanges():
                offsetRanges.append(o)
            return rdd

        self.ssc.stop(False)
        self.ssc = None
        tmpdir = "checkpoint-test-%d" % random.randint(0, 10000)

        def setup():
            ssc = StreamingContext(self.sc, 0.5)
            ssc.checkpoint(tmpdir)
            stream = KafkaUtils.createDirectStream(ssc, [topic], kafkaParams)
            stream.transform(transformWithOffsetRanges).count().pprint()
            return ssc

        try:
            ssc1 = StreamingContext.getOrCreate(tmpdir, setup)
            ssc1.start()
            self.wait_for(offsetRanges, 1)
            self.assertEqual(offsetRanges, [OffsetRange(topic, 0, long(0), long(6))])

            # To make sure some checkpoint is written
            time.sleep(3)
            ssc1.stop(False)
            ssc1 = None

            # Restart again to make sure the checkpoint is recovered correctly
            ssc2 = StreamingContext.getOrCreate(tmpdir, setup)
            ssc2.start()
            ssc2.awaitTermination(3)
            ssc2.stop(stopSparkContext=False, stopGraceFully=True)
            ssc2 = None
        finally:
            shutil.rmtree(tmpdir) 
Example 10
Project: Twitter-and-IMDB-Sentimental-Analytics   Author: abhinandanramesh   File: twitterStream.py    Apache License 2.0
def stream(ssc, pwords, nwords, duration):
    kstream = KafkaUtils.createDirectStream(
        ssc, topics=['twitterstream'],
        kafkaParams={"metadata.broker.list": 'localhost:9092'})
    tweets = kstream.map(lambda x: x[1].encode("ascii", "ignore"))

    # Each element of tweets will be the text of a tweet.
    # You need to find the count of all the positive and negative words in these tweets.
    # Keep track of a running total counts and print this at every time step (use the pprint function).

    # Get the individual words in each tweet
    words = tweets.flatMap(lambda line: line.split(" "))

    # Keep only the words that appear in the positive or negative word lists
    words = words.filter(lambda word: (word in pwords) or (word in nwords))

    # Map each filtered word to ('positive', 1) or ('negative', 1)
    wordPairs = words.map(lambda word: ('positive', 1) if (word in pwords) else ('negative', 1))

    # Sum up the positive and negative counts within each batch
    wordCounts = wordPairs.reduceByKey(lambda x, y: x + y)

    # Maintain a running total of positive and negative counts and print it
    cumulativeCount = wordPairs.updateStateByKey(updateFunction)
    cumulativeCount.pprint()
    
    # Let the counts variable hold the word counts for all time steps
    # You will need to use the foreachRDD function.
    # For our implementation, counts looked like:
    #   [[("positive", 100), ("negative", 50)], [("positive", 80), ("negative", 60)], ...]
    counts = []
    wordCounts.foreachRDD(lambda t, rdd: counts.append(rdd.collect()))
    
    ssc.start()                         # Start the computation
    ssc.awaitTerminationOrTimeout(duration)
    ssc.stop(stopGraceFully=True)

    return counts 
Example 11
Project: Big-Data   Author: Dukecat0613   File: pyspark_streaming.py    Apache License 2.0
def creatStream(self):

        directKafkaStream = KafkaUtils.createDirectStream(
            self.ssc, [self.topic], {"metadata.broker.list": self.kafka_broker})
        return directKafkaStream
Example 12
Project: ozymandias   Author: pambot   File: ozy_streaming.py    MIT License
def main():
    """Run Spark Streaming"""
    conf = SparkConf()
    sc = SparkContext(appName='Ozymandias', conf=conf)
    sc.setLogLevel('WARN')
    
    with open(ROOT + 'channels.json', 'r') as f:
        channels = json.load(f)
    topics = [t['topic'] for t in channels['channels']]
    
    n_secs = 0.5
    ssc = StreamingContext(sc, n_secs)
    stream = KafkaUtils.createDirectStream(ssc, topics, {
        'bootstrap.servers': 'localhost:9092',
        'group.id': 'ozy-group',
        'fetch.message.max.bytes': '15728640',
        'auto.offset.reset': 'largest'})
    
    stream.map(
            deserializer
        ).map(
            image_detector
        ).foreachRDD(
            message_sender)
    
    ssc.start()
    ssc.awaitTermination()