org.apache.spark.streaming.kafka.KafkaUtils Java Examples

The following examples show how to use org.apache.spark.streaming.kafka.KafkaUtils. They are taken from open-source projects; the source file, originating project, and license are noted above each example.
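Before the individual examples, here is a minimal, self-contained sketch of the direct (receiver-less) API that most of the examples below build on. It assumes the old Kafka 0.8 integration artifact (spark-streaming-kafka for Spark 1.x, or spark-streaming-kafka-0-8 for Spark 2.x) is on the classpath; the broker address, topic name, and batch interval are placeholders, not values taken from any of the projects listed here.

import java.util.Collections;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;

import org.apache.spark.SparkConf;
import org.apache.spark.streaming.Durations;
import org.apache.spark.streaming.api.java.JavaPairInputDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
import org.apache.spark.streaming.kafka.KafkaUtils;

import kafka.serializer.StringDecoder;
import scala.Tuple2;

public final class MinimalDirectStream {
    public static void main(String[] args) throws InterruptedException {
        SparkConf conf = new SparkConf().setAppName("MinimalDirectStream").setMaster("local[2]");
        JavaStreamingContext jssc = new JavaStreamingContext(conf, Durations.seconds(5));

        // Placeholder broker and topic; replace with your own cluster settings.
        Map<String, String> kafkaParams = new HashMap<>();
        kafkaParams.put("metadata.broker.list", "localhost:9092");
        Set<String> topics = Collections.singleton("my-topic");

        // Receiver-less stream of (key, value) pairs decoded as Strings.
        JavaPairInputDStream<String, String> stream = KafkaUtils.createDirectStream(
                jssc, String.class, String.class, StringDecoder.class, StringDecoder.class,
                kafkaParams, topics);

        // Keep only the message values and print the first records of each batch.
        stream.map(Tuple2::_2).print();

        jssc.start();
        jssc.awaitTermination();
    }
}

The examples below use the same two entry points: the receiver-based KafkaUtils.createStream, which connects through ZooKeeper, and the receiver-less KafkaUtils.createDirectStream shown above, each with the key and value decoder classes appropriate to the payload (String, byte[], Avro, and so on).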
Example #1
Source File: KafkaStreaming.java    From sparkResearch with Apache License 2.0
public static void main(String[] args) {
    SparkConf sparkConf = new SparkConf().setAppName("KafkaWordCount").setMaster("local[2]");
    JavaStreamingContext streamingContext = new JavaStreamingContext(sparkConf, Durations.seconds(10000));
    // set the checkpoint directory
    streamingContext.checkpoint("HDFS URL");
    Map<String, Integer> topicThread = new HashMap<>(1);
    topicThread.put(TOPIC, THREAD);
    JavaPairInputDStream<String, String> dStream = KafkaUtils.createStream(streamingContext, HOST, GROP, topicThread);

    JavaDStream<String> words = dStream.flatMap((FlatMapFunction<Tuple2<String, String>, String>) stringStringTuple2 -> Arrays.asList(SPACE.split(stringStringTuple2._2)).iterator());

    // count words
    JavaPairDStream<String, Integer> result = words.mapToPair((PairFunction<String, String, Integer>) s -> new Tuple2<>(s, 1)).reduceByKey((Function2<Integer, Integer, Integer>) (v1, v2) -> v1 + v2);

    try {
        result.print();
        streamingContext.start();
        streamingContext.awaitTermination();
    } catch (InterruptedException e) {
        e.printStackTrace();
    }
}
 
Example #2
Source File: SparkStreaming.java    From kafka-spark-avro-example with Apache License 2.0
private static void processStream(JavaStreamingContext ssc, JavaSparkContext sc) {
  System.out.println("--> Processing stream");

  Map<String, String> props = new HashMap<>();
  props.put("bootstrap.servers", "localhost:9092");
  props.put("schema.registry.url", "http://localhost:8081");
  props.put("group.id", "spark");
  props.put("specific.avro.reader", "true");

  props.put("value.deserializer", "io.confluent.kafka.serializers.KafkaAvroDeserializer");
  props.put("key.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");

  Set<String> topicsSet = new HashSet<>(Collections.singletonList("test"));

  JavaPairInputDStream<String, Object> stream = KafkaUtils.createDirectStream(ssc, String.class, Object.class,
    StringDecoder.class, KafkaAvroDecoder.class, props, topicsSet);

  stream.foreachRDD(rdd -> {
    rdd.foreachPartition(iterator -> {
        while (iterator.hasNext()) {
          Tuple2<String, Object> next = iterator.next();
          Model model = (Model) next._2();
          System.out.println(next._1() + " --> " + model);
        }
      }
    );
  });
}
 
Example #3
Source File: StreamingContextConfiguration.java    From Decision with Apache License 2.0
private void configureDataContext(JavaStreamingContext context) {
    Map<String, Integer> baseTopicMap = new HashMap<>();


    configurationContext.getDataTopics().forEach( dataTopic -> baseTopicMap.put(dataTopic, 1));

    kafkaTopicService.createTopicsIfNotExist(configurationContext.getDataTopics(), configurationContext
            .getKafkaReplicationFactor(), configurationContext.getKafkaPartitions());

    HashMap<String, String> kafkaParams = new HashMap<>();
    kafkaParams.put("zookeeper.connect", configurationContext.getZookeeperHostsQuorumWithPath());
    kafkaParams.put("group.id", configurationContext.getGroupId());
     /*
     groupId must be the cluster groupId. Kafka assigns each partition of a topic to one, and only one, consumer of
     the group.
     Decision topics have only one partition (by default), so if two or more Decision instances (consumers) read the
     same topic with the same groupId, only one instance will be able to read from the topic.
     */
    JavaPairDStream<String, byte[]> messages = KafkaUtils.createStream(context, String.class, byte[].class,
            kafka.serializer.StringDecoder.class, kafka.serializer.DefaultDecoder.class, kafkaParams, baseTopicMap,
            StorageLevel.MEMORY_AND_DISK_SER());

    AvroDeserializeMessageFunction avroDeserializeMessageFunction = new AvroDeserializeMessageFunction();
    JavaDStream<StratioStreamingMessage>  insertRequests = messages.filter(
            new FilterAvroMessagesByOperationFunction(STREAM_OPERATIONS.MANIPULATION.INSERT))
            .map(avroDeserializeMessageFunction);

    InsertIntoStreamFunction insertIntoStreamFunction = new InsertIntoStreamFunction(streamOperationService,
            configurationContext.getZookeeperHostsQuorum());
    insertRequests.foreachRDD(insertIntoStreamFunction);

}
 
Example #4
Source File: KafkaStreamFactory.java    From zipkin-sparkstreaming with Apache License 2.0
@Override public JavaDStream<byte[]> create(JavaStreamingContext jsc) {
  return KafkaUtils.createDirectStream(
      jsc,
      byte[].class,
      byte[].class,
      DefaultDecoder.class,
      DefaultDecoder.class,
      kafkaParams(),
      Collections.singleton(topic()))
      .map(m -> m._2); // get value
}
 
Example #5
Source File: KafkaReceiverWordCountJava.java    From Building-Data-Streaming-Applications-with-Apache-Kafka with MIT License
public static void main(String[] args) throws Exception {
    String zkQuorum = "localhost:2181";
    String groupName = "stream";
    int numThreads = 3;
    String topicsName = "test1";
    SparkConf sparkConf = new SparkConf().setAppName("WordCountKafkaStream");

    JavaStreamingContext javaStreamingContext = new JavaStreamingContext(sparkConf, new Duration(5000));

    Map<String, Integer> topicToBeUsedBySpark = new HashMap<>();
    String[] topics = topicsName.split(",");
    for (String topic : topics) {
        topicToBeUsedBySpark.put(topic, numThreads);
    }

    JavaPairReceiverInputDStream<String, String> streamMessages =
            KafkaUtils.createStream(javaStreamingContext, zkQuorum, groupName, topicToBeUsedBySpark);

    JavaDStream<String> lines = streamMessages.map(new Function<Tuple2<String, String>, String>() {
        @Override
        public String call(Tuple2<String, String> tuple2) {
            return tuple2._2();
        }
    });

    JavaDStream<String> words = lines.flatMap(new FlatMapFunction<String, String>() {
        @Override
        public Iterator<String> call(String x) {
            return Arrays.asList(WORD_DELIMETER.split(x)).iterator();
        }
    });

    JavaPairDStream<String, Integer> wordCounts = words.mapToPair(
            new PairFunction<String, String, Integer>() {
                @Override
                public Tuple2<String, Integer> call(String s) {
                    return new Tuple2<>(s, 1);
                }
            }).reduceByKey(new Function2<Integer, Integer, Integer>() {
        @Override
        public Integer call(Integer i1, Integer i2) {
            return i1 + i2;
        }
    });

    wordCounts.print();
    javaStreamingContext.start();
    javaStreamingContext.awaitTermination();
}
 
Example #6
Source File: StreamingContextConfiguration.java    From Decision with Apache License 2.0
private void configureActionContext(JavaStreamingContext context) {
    Map<String, Integer> baseTopicMap = new HashMap<>();


    String topicName = InternalTopic.TOPIC_ACTION.getTopicName();
    if (configurationContext.isClusteringEnabled() && configurationContext.getGroupId()!=null){
        topicName = topicName.concat("_").concat(configurationContext.getGroupId());
    }
    baseTopicMap.put(topicName, 1);

    kafkaTopicService.createTopicIfNotExist(topicName, configurationContext.getKafkaReplicationFactor(),
            configurationContext.getKafkaPartitions());

    HashMap<String, String> kafkaParams = new HashMap<>();
    kafkaParams.put("zookeeper.connect", configurationContext.getZookeeperHostsQuorumWithPath());
    kafkaParams.put("group.id", configurationContext.getGroupId());
    /*
    groupId must be the cluster groupId. Kafka assigns each partition of a topic to one, and only one, consumer of
    the group.
    Decision topics have only one partition (by default), so if two or more Decision instances (consumers) read the
    same topic with the same groupId, only one instance will be able to read from the topic.
    */
    JavaPairDStream<String, byte[]> messages = KafkaUtils.createStream(context, String.class, byte[].class,
            kafka.serializer.StringDecoder.class, kafka.serializer.DefaultDecoder.class, kafkaParams, baseTopicMap,
            StorageLevel.MEMORY_AND_DISK_SER());

    AvroDeserializeMessageFunction avroDeserializeMessageFunction = new AvroDeserializeMessageFunction();
    JavaDStream<StratioStreamingMessage>  parsedDataDstream = messages.map(avroDeserializeMessageFunction);

    JavaPairDStream<StreamAction, StratioStreamingMessage> pairedDataDstream = parsedDataDstream
            .mapPartitionsToPair(new PairDataFunction());

    JavaPairDStream<StreamAction, Iterable<StratioStreamingMessage>> groupedDataDstream = pairedDataDstream
            .groupByKey();

    groupedDataDstream.persist(StorageLevel.MEMORY_AND_DISK_SER());

    try {

        SaveToCassandraActionExecutionFunction saveToCassandraActionExecutionFunction = new SaveToCassandraActionExecutionFunction(configurationContext.getCassandraHostsQuorum(),
                configurationContext.getCassandraPort(), configurationContext.getCassandraMaxBatchSize(),
                configurationContext.getCassandraBatchType(), saveToCassandraOperationsService);
        if (saveToCassandraActionExecutionFunction.check()) {
            log.info("Cassandra is configured properly");
            groupedDataDstream.filter(new FilterDataFunction(StreamAction.SAVE_TO_CASSANDRA)).foreachRDD(
                    saveToCassandraActionExecutionFunction);
        } else {
            log.warn("Cassandra is NOT configured properly");
        }

        SaveToMongoActionExecutionFunction saveToMongoActionExecutionFunction = new SaveToMongoActionExecutionFunction(configurationContext.getMongoHosts(),
                configurationContext.getMongoUsername(), configurationContext
                .getMongoPassword(), configurationContext.getMongoMaxBatchSize(), mongoClient, mongoDB);
        if (saveToMongoActionExecutionFunction.check()) {
            log.info("MongoDB is configured properly");
            groupedDataDstream.filter(new FilterDataFunction(StreamAction.SAVE_TO_MONGO)).foreachRDD(
                    saveToMongoActionExecutionFunction);
        } else {
            log.warn("MongoDB is NOT configured properly");
        }

        SaveToElasticSearchActionExecutionFunction saveToElasticSearchActionExecutionFunction = new SaveToElasticSearchActionExecutionFunction(configurationContext.getElasticSearchHosts(),
                configurationContext.getElasticSearchClusterName(), configurationContext
                .getElasticSearchMaxBatchSize(), elasticsearchClient);
        if (saveToElasticSearchActionExecutionFunction.check()) {
            log.info("ElasticSearch is configured properly");
            groupedDataDstream.filter(new FilterDataFunction(StreamAction.SAVE_TO_ELASTICSEARCH)).foreachRDD(saveToElasticSearchActionExecutionFunction);
        } else {
            log.warn("ElasticSearch is NOT configured properly");
        }

        SaveToSolrActionExecutionFunction saveToSolrActionExecutionFunction = new
                SaveToSolrActionExecutionFunction(configurationContext.getSolrHost(), configurationContext
                .getSolrCloudZkHost(),
                configurationContext.getSolrCloud(),
                configurationContext.getSolrDataDir(), configurationContext.getSolrMaxBatchSize(), solrOperationsService);
        if (saveToSolrActionExecutionFunction.check()) {
            log.info("Solr is configured properly");
            groupedDataDstream.filter(new FilterDataFunction(StreamAction.SAVE_TO_SOLR)).foreachRDD(
                    saveToSolrActionExecutionFunction);
        } else {
            log.warn("Solr is NOT configured properly");
        }

        groupedDataDstream.filter(new FilterDataFunction(StreamAction.LISTEN)).foreachRDD(
                new SendToKafkaActionExecutionFunction(configurationContext.getKafkaHostsQuorum()));
    } catch (Exception e) {
        e.printStackTrace();
    }

}
 
Example #7
Source File: KafkaStreamRestHandler.java    From elasticsearch-rest-command with The Unlicense
@Override
protected void handleRequest(RestRequest request, RestChannel channel, Client client)
		throws Exception {
	final String topic = request.param("topic", "");
	final boolean schema = request.paramAsBoolean("schema", false);
	final String master = request.param("masterAddress", "local");
	final String hdfs =  request.param("hdfs", "hdfs://localhost:50070");
	final String memory =  request.param("memory", "2g");
	final String appName = request.param("appName", "appName-"+topic);
	final int duration = request.paramAsInt("duration", 1000);
	
	Thread exec = new Thread(new Runnable(){

		@Override
		public void run() {
		
			SparkConf sparkConf = new SparkConf().setAppName(appName).setMaster(master).set("spark.executor.memory", memory);
			JavaStreamingContext jssc = new JavaStreamingContext(sparkConf, new Duration(duration));
			
			Map<String, Integer> topicMap = new HashMap<String, Integer>();
			topicMap.put(topic, 3);
			
			JavaPairReceiverInputDStream<String, byte[]> kafkaStream = KafkaUtils.createStream(jssc, String.class, byte[].class, 
						kafka.serializer.DefaultDecoder.class, kafka.serializer.DefaultDecoder.class, null, 
						topicMap,  StorageLevel.MEMORY_ONLY());
	
			//JobConf confHadoop = new JobConf();
			//confHadoop.set("mapred.output.compress", "true");
			//confHadoop.set("mapred.output.compression.codec", "com.hadoop.compression.lzo.LzopCodec");
	
			kafkaStream.saveAsHadoopFiles(hdfs, "seq", Text.class, BytesWritable.class, KafkaStreamSeqOutputFormat.class);
			
			topicContextMap.put(topic, jssc);
			jssc.start();		
			jssc.awaitTermination();
			
		}
	});
	
	exec.start();
	
	channel.sendResponse(new BytesRestResponse(RestStatus.OK, String.format("{\"topic\":\"%s\"}",  topic)));
	
	
}
 
Example #8
Source File: IoTDataProcessor.java    From iot-traffic-monitor with Apache License 2.0
public static void main(String[] args) throws Exception {
 //read Spark and Cassandra properties and create SparkConf
 Properties prop = PropertyFileReader.readPropertyFile();		
 SparkConf conf = new SparkConf()
		 .setAppName(prop.getProperty("com.iot.app.spark.app.name"))
		 .setMaster(prop.getProperty("com.iot.app.spark.master"))
		 .set("spark.cassandra.connection.host", prop.getProperty("com.iot.app.cassandra.host"))
		 .set("spark.cassandra.connection.port", prop.getProperty("com.iot.app.cassandra.port"))
		 .set("spark.cassandra.connection.keep_alive_ms", prop.getProperty("com.iot.app.cassandra.keep_alive"));		 
 //batch interval of 5 seconds for incoming stream		 
 JavaStreamingContext jssc = new JavaStreamingContext(conf, Durations.seconds(5));	
 //add check point directory
 jssc.checkpoint(prop.getProperty("com.iot.app.spark.checkpoint.dir"));
 
 //read and set Kafka properties
 Map<String, String> kafkaParams = new HashMap<String, String>();
 kafkaParams.put("zookeeper.connect", prop.getProperty("com.iot.app.kafka.zookeeper"));
 kafkaParams.put("metadata.broker.list", prop.getProperty("com.iot.app.kafka.brokerlist"));
 String topic = prop.getProperty("com.iot.app.kafka.topic");
 Set<String> topicsSet = new HashSet<String>();
 topicsSet.add(topic);
 //create direct kafka stream
 JavaPairInputDStream<String, IoTData> directKafkaStream = KafkaUtils.createDirectStream(
	        jssc,
	        String.class,
	        IoTData.class,
	        StringDecoder.class,
	        IoTDataDecoder.class,
	        kafkaParams,
	        topicsSet
	    );
 logger.info("Starting Stream Processing");
 
 //We need non filtered stream for poi traffic data calculation
 JavaDStream<IoTData> nonFilteredIotDataStream = directKafkaStream.map(tuple -> tuple._2());
 
 //We need filtered stream for total and traffic data calculation
 JavaPairDStream<String,IoTData> iotDataPairStream = nonFilteredIotDataStream.mapToPair(iot -> new Tuple2<String,IoTData>(iot.getVehicleId(),iot)).reduceByKey((a, b) -> a );

 // Check vehicle Id is already processed
 JavaMapWithStateDStream<String, IoTData, Boolean, Tuple2<IoTData,Boolean>> iotDStreamWithStatePairs = iotDataPairStream
					.mapWithState(StateSpec.function(processedVehicleFunc).timeout(Durations.seconds(3600)));//maintain state for one hour

 // Filter processed vehicle ids and keep un-processed
 JavaDStream<Tuple2<IoTData,Boolean>> filteredIotDStreams = iotDStreamWithStatePairs.map(tuple2 -> tuple2)
					.filter(tuple -> tuple._2.equals(Boolean.FALSE));

 // Get stream of IoTdata
 JavaDStream<IoTData> filteredIotDataStream = filteredIotDStreams.map(tuple -> tuple._1);
 
 //cache stream as it is used in total and window based computation
 filteredIotDataStream.cache();
 	 
 //process data
 IoTTrafficDataProcessor iotTrafficProcessor = new IoTTrafficDataProcessor();
 iotTrafficProcessor.processTotalTrafficData(filteredIotDataStream);
 iotTrafficProcessor.processWindowTrafficData(filteredIotDataStream);

 //poi data
 POIData poiData = new POIData();
 poiData.setLatitude(33.877495);
 poiData.setLongitude(-95.50238);
 poiData.setRadius(30);//30 km
 
 //broadcast variables. We will monitor vehicles on Route 37 which are of type Truck
 Broadcast<Tuple3<POIData, String, String>> broadcastPOIValues = jssc.sparkContext().broadcast(new Tuple3<>(poiData,"Route-37","Truck"));
 //call method  to process stream
 iotTrafficProcessor.processPOIData(nonFilteredIotDataStream,broadcastPOIValues);
 
 //start context
 jssc.start();            
 jssc.awaitTermination();  
}
 
Example #9
Source File: AppMain.java    From SparkToParquet with Apache License 2.0
public static void main(String[] args) throws IOException {
	Flags.setFromCommandLineArgs(THE_OPTIONS, args);

	// Initialize the SparkConf.
	SparkConf conf = new SparkConf().setAppName("A SECTONG Application: Apache Log Analysis with Spark");
	JavaSparkContext sc = new JavaSparkContext(conf);
	JavaStreamingContext jssc = new JavaStreamingContext(sc, Flags.getInstance().getSlideInterval());
	SQLContext sqlContext = new SQLContext(sc);

	// Initialize parameters
	HashSet<String> topicsSet = new HashSet<String>(Arrays.asList(Flags.getInstance().getKafka_topic().split(",")));
	HashMap<String, String> kafkaParams = new HashMap<String, String>();
	kafkaParams.put("metadata.broker.list", Flags.getInstance().getKafka_broker());

	// Read data from the Kafka stream
	JavaPairInputDStream<String, String> messages = KafkaUtils.createDirectStream(jssc, String.class, String.class,
			StringDecoder.class, StringDecoder.class, kafkaParams, topicsSet);

	JavaDStream<String> lines = messages.map(new Function<Tuple2<String, String>, String>() {
		private static final long serialVersionUID = 5266880065425088203L;

		public String call(Tuple2<String, String> tuple2) {
			return tuple2._2();
		}
	});

	JavaDStream<ApacheAccessLog> accessLogsDStream = lines.flatMap(line -> {
		List<ApacheAccessLog> list = new ArrayList<>();
		try {
			// Parse each line
			list.add(ApacheAccessLog.parseFromLogLine(line));
			return list;
		} catch (RuntimeException e) {
			return list;
		}
	}).cache();

	accessLogsDStream.foreachRDD(rdd -> {

		// rdd to DataFrame
		DataFrame df = sqlContext.createDataFrame(rdd, ApacheAccessLog.class);
		// Write out Parquet files
		df.write().partitionBy("ipAddress", "method", "responseCode").mode(SaveMode.Append).parquet(Flags.getInstance().getParquetFile());

		return null;
	});

	// Start the streaming job
	jssc.start(); // start the computation
	jssc.awaitTermination(); // wait for termination
}
 
Example #10
Source File: JavaDirectKafkaWordCount.java    From Building-Data-Streaming-Applications-with-Apache-Kafka with MIT License
public static void main(String[] args) throws Exception {

        String brokers = "localhost:9092";
        String topics = "test1";

        SparkConf sparkConf = new SparkConf().setMaster("local").setAppName("DirectKafkaWordCount");
        JavaStreamingContext javaStreamingContext = new JavaStreamingContext(sparkConf, Durations.seconds(2));
        Set<String> topicsSet = new HashSet<>(Arrays.asList(topics.split(",")));
        Map<String, String> kafkaConfiguration = new HashMap<>();
        kafkaConfiguration.put("metadata.broker.list", brokers);
        kafkaConfiguration.put("group.id", "stream_test8");
        kafkaConfiguration.put("auto.offset.reset", "smallest");

        JavaPairInputDStream<String, String> messages = KafkaUtils.createDirectStream(
                javaStreamingContext,
                String.class,
                String.class,
                StringDecoder.class,
                StringDecoder.class,
                kafkaConfiguration,
                topicsSet
        );

        JavaDStream<String> lines = messages.map(Tuple2::_2);

        JavaDStream<String> words = lines.flatMap(
                x ->
                        Arrays.asList(SPACE.split(x)
                ).iterator());

        JavaPairDStream<String, Integer> wordCounts = words.mapToPair(

                s -> new Tuple2<>(s, 1)

        )
                .reduceByKey((i1, i2) -> i1 + i2);

        //wordCounts.dstream().saveAsTextFiles("hdfs://10.200.99.197:8020/user/chanchal.singh/wordCounts", "result");
        wordCounts.print();
        javaStreamingContext.start();
        javaStreamingContext.awaitTermination();
    }
 
Example #11
Source File: FraudDetectionApp.java    From Building-Data-Streaming-Applications-with-Apache-Kafka with MIT License
public static void main(String[] args) throws Exception {

        String brokers = "localhost:9092";
        String topics = "iplog";
        CacheIPLookup cacheIPLookup = new CacheIPLookup();
        SparkConf sparkConf = new SparkConf().setAppName("IP_FRAUD");
        JavaStreamingContext javaStreamingContext = new JavaStreamingContext(sparkConf, Durations.seconds(2));

        Set<String> topicsSet = new HashSet<>(Arrays.asList(topics.split(",")));
        Map<String, String> kafkaConfiguration = new HashMap<>();
        kafkaConfiguration.put("metadata.broker.list", brokers);
        kafkaConfiguration.put("group.id", "ipfraud");
        kafkaConfiguration.put("auto.offset.reset", "smallest");

        JavaPairInputDStream<String, String> messages = KafkaUtils.createDirectStream(
                javaStreamingContext,
                String.class,
                String.class,
                StringDecoder.class,
                StringDecoder.class,
                kafkaConfiguration,
                topicsSet
        );

        JavaDStream<String> lines = messages.map(Tuple2::_2);

        JavaDStream<String> fraudIPs = lines.filter(new Function<String, Boolean>() {
            @Override
            public Boolean call(String s) throws Exception {
                String IP = s.split(" ")[0];
                String[] ranges = IP.split("\\.");
                String range = null;
                try {
                    range = ranges[0] + "." + ranges[1];
                } catch (ArrayIndexOutOfBoundsException ex) {
                    // malformed IP: leave range as null
                }
                return cacheIPLookup.isFraudIP(range);

            }
        });

        DStream<String> fraudDstream = fraudIPs.dstream();
        fraudDstream.saveAsTextFiles("FraudRecord", "");

        javaStreamingContext.start();
        javaStreamingContext.awaitTermination();
    }
 
Example #12
Source File: AdClickRealTimeStatSpark.java    From BigDataPlatform with GNU General Public License v3.0
public static void main(String[] args) throws InterruptedException {
        // Build the Spark Streaming context
        SparkConf conf = new SparkConf()
                .setMaster("local[2]")
                .setAppName("AdClickRealTimeStatSpark");
//				.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
//				.set("spark.default.parallelism", "1000");
//				.set("spark.streaming.blockInterval", "50");
//				.set("spark.streaming.receiver.writeAheadLog.enable", "true");

        // The Spark Streaming context is a JavaStreamingContext object,
        // not a JavaSparkContext or SQLContext/HiveContext as in the earlier jobs.
        // The first constructor argument is a SparkConf, as with the other contexts; the second one is different.

        // The second argument is the parameter that distinguishes Spark Streaming jobs:
        // the batch interval of the real-time processing.
        // Every batch interval, Spark Streaming collects the data from the source (Kafka) into one batch
        // and processes that batch.

        // In other words, the batch interval is how often data is collected from the source and then processed.
        // Most Spark Streaming applications use a few seconds to a few tens of seconds (rarely more than one minute).

        // This project uses a 5-second batch interval:
        // every 5 seconds the job collects the data received from the source during the last 5 seconds.
        JavaStreamingContext jssc = new JavaStreamingContext(
                conf, Durations.seconds(5));
        jssc.checkpoint("hdfs://120.77.155.220:9000/streaming_checkpoint");

        // Business logic of the real-time computation starts here.

        // Create the input DStream for the Kafka data source (a discretized stream, an abstraction over a continuous source of data).
        // The Kafka direct API is used (it has many advantages, including automatically adapting the amount of data received per batch).

        // Build the Kafka parameter map;
        // the essential entry is the address of the Kafka cluster to connect to (the broker list).
        Map<String, String> kafkaParams = new HashMap<String, String>();
        kafkaParams.put("metadata.broker.list",
                ConfigurationManager.getProperty(Constants.KAFKA_METADATA_BROKER_LIST));

        // Build the topic set
        String kafkaTopics = ConfigurationManager.getProperty(Constants.KAFKA_TOPICS);
        String[] kafkaTopicsSplited = kafkaTopics.split(",");

        Set<String> topics = new HashSet<String>();
        for (String kafkaTopic : kafkaTopicsSplited) {
            topics.add(kafkaTopic);
        }
        // Using the Kafka direct API, build the input DStream for the specified topics of the Kafka cluster.
        // Each element is a pair (val1, val2); val1 has no particular meaning here, while val2 contains the individual real-time log lines from the Kafka topic.
        JavaPairInputDStream<String, String> adRealTimeLogDStream = KafkaUtils.createDirectStream(
                jssc,
                String.class,
                String.class,
                StringDecoder.class,
                StringDecoder.class,
                kafkaParams,
                topics);

//		adRealTimeLogDStream.repartition(1000);

        // Filter the data against the dynamic blacklist
        JavaPairDStream<String, String> filteredAdRealTimeLogDStream =
                filterByBlacklist(adRealTimeLogDStream);

        // Generate the dynamic blacklist
        generateDynamicBlacklist(filteredAdRealTimeLogDStream);

        // Business feature 1: real-time ad click statistics (yyyyMMdd_province_city_adid, clickCount)
        // coarsest granularity
        JavaPairDStream<String, Long> adRealTimeStatDStream = calculateRealTimeStat(
                filteredAdRealTimeLogDStream);

        // Business feature 2: real-time top-3 hottest ads per province per day
        // slightly finer granularity
        calculateProvinceTop3Ad(adRealTimeStatDStream);

        // Business feature 3: real-time click trend of each ad within a sliding one-hour window (clicks per minute)
        // finest granularity:
        // for each ad we can always see the clicks per minute over the last hour,
        // i.e. the click trend of each ad
        calculateAdClickCountByWindow(adRealTimeLogDStream);

        // After building the streaming context, remember to start it, await termination, and close it
        jssc.start();
        jssc.awaitTermination();
        jssc.close();
    }
 
Example #13
Source File: JavaKafkaDirectWordCount.java    From SparkDemo with MIT License
/**
 * Kafka direct API:
 * 1. One-to-one (each Kafka partition maps to one RDD partition)
 * 2. Efficient
 * 3. Exactly-once processing
 *
 * @param args
 */
public static void main(String[] args) {
    StreamingExamples.setStreamingLogLevels();
    SparkConf sparkConf = new SparkConf().setAppName("JavaKafkaDirectWordCount").setMaster("local[1]");
    JavaStreamingContext jssc = new JavaStreamingContext(sparkConf, Durations.seconds(6));

    Map<String, String> kafkaParams = new HashMap<String, String>(); // Kafka connection parameters
    kafkaParams.put("metadata.broker.list", "master:9092,slave1:9092,slave2:9092"); // where the brokers are
    HashSet<String> topicsSet = new HashSet<String>();
    topicsSet.add("2017-7-26"); // 指定操作的topic

    // Create a direct Kafka stream with brokers and topics using createDirectStream()
    JavaPairInputDStream<String, String> messages = KafkaUtils.createDirectStream(
            jssc,
            String.class,
            String.class,
            StringDecoder.class,
            StringDecoder.class,
            kafkaParams,
            topicsSet
    );

    JavaDStream<String> lines = messages.map(new Function<Tuple2<String, String>, String>() {
        @Override
        public String call(Tuple2<String, String> tuple2) {
            return tuple2._2();
        }
    });

    JavaDStream<String> words = lines.flatMap(new FlatMapFunction<String, String>() {
        @Override
        public Iterator<String> call(String x) {
            return Lists.newArrayList(SPACE.split(x)).iterator();
        }
    });

    JavaPairDStream<String, Integer> wordCounts = words.mapToPair(new PairFunction<String, String, Integer>() {
        @Override
        public Tuple2<String, Integer> call(String s) {
            return new Tuple2<String, Integer>(s, 1);
        }
    }).reduceByKey(new Function2<Integer, Integer, Integer>() {
        @Override
        public Integer call(Integer i1, Integer i2) {
            return i1 + i2;
        }
    });

    wordCounts.print();
    jssc.start();
    try {
        jssc.awaitTermination();
    } catch (Exception e) {
        e.printStackTrace();
    }
}
 
Example #14
Source File: JavaKafkaReceiverWordCount.java    From SparkDemo with MIT License
public static void main(String[] args) {
	StreamingExamples.setStreamingLogLevels();
	SparkConf sparkConf = new SparkConf().setAppName("JavaKafkaReceiverWordCount").setMaster("local[4]");
	JavaStreamingContext jssc = new JavaStreamingContext(sparkConf, Durations.seconds(6));

	Map<String, Integer> topicMap = new HashMap<String, Integer>(); // key is the topic name, value is the number of receiver threads
	topicMap.put("2017-7-26", 1);

	String zookeeperList = "master:2181,slave1:2181,slave2:2181";

	JavaPairReceiverInputDStream<String, String> messages = KafkaUtils.createStream(jssc, zookeeperList,
			"JavaKafkaReceiverWordCount", topicMap);

	JavaDStream<String> lines = messages.map(new Function<Tuple2<String, String>, String>() {
		@Override
		public String call(Tuple2<String, String> tuple2) {
			return tuple2._2();
		}
	});

	JavaDStream<String> words = lines.flatMap(new FlatMapFunction<String, String>() {
		@Override
		public Iterator<String> call(String x) {
			return Lists.newArrayList(SPACE.split(x)).iterator();
		}
	});

	JavaPairDStream<String, Integer> wordCounts = words.mapToPair(new PairFunction<String, String, Integer>() {
		@Override
		public Tuple2<String, Integer> call(String s) {
			return new Tuple2<String, Integer>(s, 1);
		}
	}).reduceByKey(new Function2<Integer, Integer, Integer>() {
		@Override
		public Integer call(Integer i1, Integer i2) {
			return i1 + i2;
		}
	});

	wordCounts.print();
	jssc.start();

	try {
		jssc.awaitTermination();
	} catch (Exception e) {
		e.printStackTrace();
	}
}
 
Example #15
Source File: JavaKafkaWordCount.java    From SparkDemo with MIT License
public static void main(String[] args) throws Exception {
  if (args.length < 4) {
    System.err.println("Usage: JavaKafkaWordCount <zkQuorum> <group> <topics> <numThreads>");
    System.exit(1);
  }

  StreamingExamples.setStreamingLogLevels();
  SparkConf sparkConf = new SparkConf().setAppName("JavaKafkaWordCount");
  // Create the context with 2 seconds batch size
  JavaStreamingContext jssc = new JavaStreamingContext(sparkConf, new Duration(2000));

  int numThreads = Integer.parseInt(args[3]);
  Map<String, Integer> topicMap = new HashMap<>();
  String[] topics = args[2].split(",");
  for (String topic: topics) {
    topicMap.put(topic, numThreads);
  }

  JavaPairReceiverInputDStream<String, String> messages =
          KafkaUtils.createStream(jssc, args[0], args[1], topicMap);

  JavaDStream<String> lines = messages.map(new Function<Tuple2<String, String>, String>() {
    @Override
    public String call(Tuple2<String, String> tuple2) {
      return tuple2._2();
    }
  });

  JavaDStream<String> words = lines.flatMap(new FlatMapFunction<String, String>() {
    @Override
    public Iterator<String> call(String x) {
      return Arrays.asList(SPACE.split(x)).iterator();
    }
  });

  JavaPairDStream<String, Integer> wordCounts = words.mapToPair(
    new PairFunction<String, String, Integer>() {
      @Override
      public Tuple2<String, Integer> call(String s) {
        return new Tuple2<>(s, 1);
      }
    }).reduceByKey(new Function2<Integer, Integer, Integer>() {
      @Override
      public Integer call(Integer i1, Integer i2) {
        return i1 + i2;
      }
    });

  wordCounts.print();
  jssc.start();
  jssc.awaitTermination();
}
 
Example #16
Source File: JavaDirectKafkaWordCount.java    From SparkDemo with MIT License
public static void main(String[] args) throws Exception {
  if (args.length < 2) {
    System.err.println("Usage: JavaDirectKafkaWordCount <brokers> <topics>\n" +
        "  <brokers> is a list of one or more Kafka brokers\n" +
        "  <topics> is a list of one or more kafka topics to consume from\n\n");
    System.exit(1);
  }

  StreamingExamples.setStreamingLogLevels();

  String brokers = args[0];
  String topics = args[1];

  // Create context with a 2 seconds batch interval
  SparkConf sparkConf = new SparkConf().setAppName("JavaDirectKafkaWordCount");
  JavaStreamingContext jssc = new JavaStreamingContext(sparkConf, Durations.seconds(2));

  Set<String> topicsSet = new HashSet<>(Arrays.asList(topics.split(",")));
  Map<String, String> kafkaParams = new HashMap<>();
  kafkaParams.put("metadata.broker.list", brokers);

  // Create direct kafka stream with brokers and topics
  JavaPairInputDStream<String, String> messages = KafkaUtils.createDirectStream(
      jssc,
      String.class,
      String.class,
      StringDecoder.class,
      StringDecoder.class,
      kafkaParams,
      topicsSet
  );

  // Get the lines, split them into words, count the words and print
  JavaDStream<String> lines = messages.map(new Function<Tuple2<String, String>, String>() {
    @Override
    public String call(Tuple2<String, String> tuple2) {
      return tuple2._2();
    }
  });
  JavaDStream<String> words = lines.flatMap(new FlatMapFunction<String, String>() {
    @Override
    public Iterator<String> call(String x) {
      return Arrays.asList(SPACE.split(x)).iterator();
    }
  });
  JavaPairDStream<String, Integer> wordCounts = words.mapToPair(
    new PairFunction<String, String, Integer>() {
      @Override
      public Tuple2<String, Integer> call(String s) {
        return new Tuple2<>(s, 1);
      }
    }).reduceByKey(
      new Function2<Integer, Integer, Integer>() {
      @Override
      public Integer call(Integer i1, Integer i2) {
        return i1 + i2;
      }
    });
  wordCounts.print();

  // Start the computation
  jssc.start();
  jssc.awaitTermination();
}
 
Example #17
Source File: KafkaSource08.java    From sylph with Apache License 2.0
public JavaDStream<Row> createSource(JavaStreamingContext ssc, KafkaSourceConfig08 config, SourceContext context)
{
    String topics = requireNonNull(config.getTopics(), "topics not setting");
    String brokers = requireNonNull(config.getBrokers(), "brokers not setting"); // the cluster hosts must be resolvable from the machine running this program
    String groupId = requireNonNull(config.getGroupid(), "group.id not setting"); // consumer group name
    String offsetMode = requireNonNull(config.getOffsetMode(), "offsetMode not setting");

    Map<String, String> otherConfig = config.getOtherConfig().entrySet()
            .stream()
            .filter(x -> x.getValue() != null)
            .collect(Collectors.toMap(Map.Entry::getKey, v -> v.getValue().toString()));

    Map<String, String> kafkaParams = new HashMap<>(otherConfig);
    kafkaParams.put(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, brokers);
    //kafkaParams.put("auto.commit.enable", true); //不自动提交偏移量
    //      "fetch.message.max.bytes" ->
    //      "session.timeout.ms" -> "30000", //session默认是30秒
    //      "heartbeat.interval.ms" -> "5000", //10秒提交一次 心跳周期
    kafkaParams.put(ConsumerConfig.GROUP_ID_CONFIG, groupId); //注意不同的流 group.id必须要不同 否则会出现offect commit提交失败的错误
    kafkaParams.put(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, offsetMode); //largest   smallest

    //----get fromOffsets
    @SuppressWarnings("unchecked")
    scala.collection.immutable.Map<String, String> map = (scala.collection.immutable.Map<String, String>) Map$.MODULE$.apply(JavaConverters.mapAsScalaMapConverter(kafkaParams).asScala().toSeq());
    final KafkaCluster kafkaCluster = new KafkaCluster(map);
    Map<TopicAndPartition, Long> fromOffsets = getFromOffset(kafkaCluster, topics, groupId);

    //--- createDirectStream  DirectKafkaInputDStream.class
    org.apache.spark.api.java.function.Function<MessageAndMetadata<byte[], byte[]>, ConsumerRecord<byte[], byte[]>> messageHandler =
            mmd -> new ConsumerRecord<>(mmd.topic(), mmd.partition(), mmd.key(), mmd.message(), mmd.offset());
    @SuppressWarnings("unchecked")
    Class<ConsumerRecord<byte[], byte[]>> recordClass = (Class<ConsumerRecord<byte[], byte[]>>) ClassTag$.MODULE$.<ConsumerRecord<byte[], byte[]>>apply(ConsumerRecord.class).runtimeClass();
    JavaInputDStream<ConsumerRecord<byte[], byte[]>> inputStream = KafkaUtils.createDirectStream(ssc,
            byte[].class, byte[].class, DefaultDecoder.class, DefaultDecoder.class, recordClass,
            kafkaParams, fromOffsets,
            messageHandler
    );
    JavaDStream<ConsumerRecord<byte[], byte[]>> dStream = settingCommit(inputStream, kafkaParams, kafkaCluster, groupId);

    if ("json".equalsIgnoreCase(config.getValueType())) {
        JsonSchema jsonParser = new JsonSchema(context.getSchema());
        return dStream
                .map(record -> {
                    return jsonParser.deserialize(record.key(), record.value(), record.topic(), record.partition(), record.offset());
                });
    }
    else {
        StructType structType = schemaToSparkType(context.getSchema());
        return dStream
                .map(record -> {
                    String[] names = structType.names();
                    Object[] values = new Object[names.length];
                    for (int i = 0; i < names.length; i++) {
                        switch (names[i]) {
                            case "_topic":
                                values[i] = record.topic();
                                continue;
                            case "_message":
                                values[i] = new String(record.value(), UTF_8);
                                continue;
                            case "_key":
                                values[i] = new String(record.key(), UTF_8);
                                continue;
                            case "_partition":
                                values[i] = record.partition();
                                continue;
                            case "_offset":
                                values[i] = record.offset();
                            default:
                                values[i] = null;
                        }
                    }
                    return (Row) new GenericRowWithSchema(values, structType);
                });  //.window(Duration(10 * 1000))
    }
}
 
Example #18
Source File: SparkKafkaTest.java    From BigDataPlatform with GNU General Public License v3.0
public static void main(String[] args) throws Exception {
    if (args.length < 4) {
        System.err.println("Usage: JavaKafkaWordCount <zkQuorum> <group> <topics> <numThreads>");
        System.exit(1);
    }

    SparkConf conf = new SparkConf()
            .setMaster("local[2]")
            .setAppName("JavaKafkaWordCount");


    // Use a 5-second batch interval:
    // every 5 seconds the streaming job collects the data received from the source during the last 5 seconds
    JavaStreamingContext jssc = new JavaStreamingContext(
            conf, Durations.seconds(5));
    jssc.checkpoint("hdfs://Master:9000/streaming_checkpoint");

    // Business logic of the real-time computation starts here.

    // Create the input DStream for the Kafka data source (a discretized stream, an abstraction over a continuous source of data).
    // The Kafka direct API is used (it has many advantages, including automatically adapting the amount of data received per batch).

    // Build the Kafka parameter map;
    // the essential entry is the address of the Kafka cluster to connect to (the broker list).
    Map<String, String> kafkaParams = new HashMap<String, String>();
    kafkaParams.put("metadata.broker.list",
            ConfigurationManager.getProperty(Constants.KAFKA_METADATA_BROKER_LIST));

    // Build the topic set
    String kafkaTopics = "streamingtopic";//ConfigurationManager.getProperty("streamingtopic");
    String[] kafkaTopicsSplited = kafkaTopics.split(",");

    Set<String> topics = new HashSet<String>();
    for (String kafkaTopic : kafkaTopicsSplited) {
        topics.add(kafkaTopic);
    }
    // Using the Kafka direct API, build the input DStream for the specified topics of the Kafka cluster.
    // Each element is a pair (val1, val2); val1 has no particular meaning here, while val2 contains the individual real-time log lines from the Kafka topic.
    JavaPairInputDStream<String, String> adRealTimeLogDStream = KafkaUtils.createDirectStream(
            jssc,
            String.class,
            String.class,
            StringDecoder.class,
            StringDecoder.class,
            kafkaParams,
            topics);

    JavaDStream<String> lines = adRealTimeLogDStream.map(Tuple2::_2);
    JavaDStream<String> words = lines.flatMap(x -> Arrays.asList(SPACE.split(x)).iterator());
    JavaPairDStream<String, Integer> wordCounts = words.mapToPair(s -> new Tuple2<>(s, 1))
            .reduceByKey((i1, i2) -> i1 + i2);
    wordCounts.print();
    jssc.start();
    jssc.awaitTermination();
}