org.apache.spark.streaming.kafka.KafkaUtils Java Examples
The following examples show how to use
org.apache.spark.streaming.kafka.KafkaUtils.
The project and source file that each example was taken from is noted above the code.
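This class belongs to Spark's Kafka 0.8 integration (the spark-streaming-kafka artifact in Spark 1.x, spark-streaming-kafka-0-8 in Spark 2.x). It exposes two families of factory methods: the receiver-based createStream, which consumes through ZooKeeper, and the direct createDirectStream, which reads partition offsets straight from the brokers. The minimal sketch below shows both entry points side by side before the full examples; the broker and ZooKeeper addresses, group id, and topic name ("localhost:9092", "localhost:2181", "example-group", "test") are placeholders, and the class itself is not taken from any of the projects listed below.

import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;

import kafka.serializer.StringDecoder;
import org.apache.spark.SparkConf;
import org.apache.spark.streaming.Durations;
import org.apache.spark.streaming.api.java.JavaPairInputDStream;
import org.apache.spark.streaming.api.java.JavaPairReceiverInputDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
import org.apache.spark.streaming.kafka.KafkaUtils;

public class KafkaUtilsSketch {
    public static void main(String[] args) throws InterruptedException {
        SparkConf conf = new SparkConf().setAppName("KafkaUtilsSketch").setMaster("local[2]");
        JavaStreamingContext jssc = new JavaStreamingContext(conf, Durations.seconds(5));

        // Receiver-based stream: consumes via ZooKeeper; topic map = topic name -> receiver thread count
        Map<String, Integer> topicThreads = Collections.singletonMap("test", 1);
        JavaPairReceiverInputDStream<String, String> receiverStream =
                KafkaUtils.createStream(jssc, "localhost:2181", "example-group", topicThreads);

        // Direct stream: reads offsets from the brokers themselves, no receiver
        Map<String, String> kafkaParams = new HashMap<>();
        kafkaParams.put("metadata.broker.list", "localhost:9092");
        Set<String> topics = new HashSet<>(Arrays.asList("test"));
        JavaPairInputDStream<String, String> directStream = KafkaUtils.createDirectStream(
                jssc, String.class, String.class, StringDecoder.class, StringDecoder.class, kafkaParams, topics);

        receiverStream.print();
        directStream.print();

        jssc.start();
        jssc.awaitTermination();
    }
}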
Example #1
Source File: KafkaStreaming.java From sparkResearch with Apache License 2.0
public static void main(String[] args) {
    SparkConf sparkConf = new SparkConf().setAppName("KafkaWordCount").setMaster("local[2]");
    JavaStreamingContext streamingContext = new JavaStreamingContext(sparkConf, Durations.seconds(10000));
    // set the checkpoint directory
    streamingContext.checkpoint("HDFS URL");

    Map<String, Integer> topicThread = new HashMap<>(1);
    topicThread.put(TOPIC, THREAD);
    JavaPairInputDStream<String, String> dStream = KafkaUtils.createStream(streamingContext, HOST, GROP, topicThread);

    JavaDStream<String> words = dStream.flatMap((FlatMapFunction<Tuple2<String, String>, String>) stringStringTuple2 ->
            Arrays.asList(SPACE.split(stringStringTuple2._2)).iterator());

    // count the words
    JavaPairDStream<String, Integer> result = words
            .mapToPair((PairFunction<String, String, Integer>) s -> new Tuple2<>(s, 1))
            .reduceByKey((Function2<Integer, Integer, Integer>) (v1, v2) -> v1 + v2);

    try {
        result.print();
        streamingContext.start();
        streamingContext.awaitTermination();
    } catch (InterruptedException e) {
        e.printStackTrace();
    }
}
Example #2
Source File: SparkStreaming.java From kafka-spark-avro-example with Apache License 2.0
private static void processStream(JavaStreamingContext ssc, JavaSparkContext sc) {
    System.out.println("--> Processing stream");

    Map<String, String> props = new HashMap<>();
    props.put("bootstrap.servers", "localhost:9092");
    props.put("schema.registry.url", "http://localhost:8081");
    props.put("group.id", "spark");
    props.put("specific.avro.reader", "true");
    props.put("value.deserializer", "io.confluent.kafka.serializers.KafkaAvroDeserializer");
    props.put("key.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");

    Set<String> topicsSet = new HashSet<>(Collections.singletonList("test"));

    JavaPairInputDStream<String, Object> stream = KafkaUtils.createDirectStream(ssc, String.class, Object.class,
            StringDecoder.class, KafkaAvroDecoder.class, props, topicsSet);

    stream.foreachRDD(rdd -> {
        rdd.foreachPartition(iterator -> {
            while (iterator.hasNext()) {
                Tuple2<String, Object> next = iterator.next();
                Model model = (Model) next._2();
                System.out.println(next._1() + " --> " + model);
            }
        });
    });
}
Example #3
Source File: StreamingContextConfiguration.java From Decision with Apache License 2.0
private void configureDataContext(JavaStreamingContext context) {
    Map<String, Integer> baseTopicMap = new HashMap<>();

    configurationContext.getDataTopics().forEach(dataTopic -> baseTopicMap.put(dataTopic, 1));

    kafkaTopicService.createTopicsIfNotExist(configurationContext.getDataTopics(), configurationContext
            .getKafkaReplicationFactor(), configurationContext.getKafkaPartitions());

    HashMap<String, String> kafkaParams = new HashMap<>();
    kafkaParams.put("zookeeper.connect", configurationContext.getZookeeperHostsQuorumWithPath());
    kafkaParams.put("group.id", configurationContext.getGroupId());

    /*
     groupId must be the cluster groupId. Kafka assigns each partition of a topic to one, and only one,
     consumer of the group. Decision topics have only one partition (by default), so if we have two or more
     Decision instances (consumers) reading the same topic with the same groupId, only one instance will be
     able to read from the topic.
     */
    JavaPairDStream<String, byte[]> messages = KafkaUtils.createStream(context, String.class, byte[].class,
            kafka.serializer.StringDecoder.class, kafka.serializer.DefaultDecoder.class, kafkaParams, baseTopicMap,
            StorageLevel.MEMORY_AND_DISK_SER());

    AvroDeserializeMessageFunction avroDeserializeMessageFunction = new AvroDeserializeMessageFunction();
    JavaDStream<StratioStreamingMessage> insertRequests = messages.filter(
            new FilterAvroMessagesByOperationFunction(STREAM_OPERATIONS.MANIPULATION.INSERT))
            .map(avroDeserializeMessageFunction);

    InsertIntoStreamFunction insertIntoStreamFunction = new InsertIntoStreamFunction(streamOperationService,
            configurationContext.getZookeeperHostsQuorum());
    insertRequests.foreachRDD(insertIntoStreamFunction);
}
Example #4
Source File: KafkaStreamFactory.java From zipkin-sparkstreaming with Apache License 2.0
@Override
public JavaDStream<byte[]> create(JavaStreamingContext jsc) {
    return KafkaUtils.createDirectStream(
            jsc,
            byte[].class,
            byte[].class,
            DefaultDecoder.class,
            DefaultDecoder.class,
            kafkaParams(),
            Collections.singleton(topic()))
            .map(m -> m._2); // get value
}
Example #5
Source File: KafkaReceiverWordCountJava.java From Building-Data-Streaming-Applications-with-Apache-Kafka with MIT License
public static void main(String[] args) throws Exception {
    String zkQuorum = "localhost:2181";
    String groupName = "stream";
    int numThreads = 3;
    String topicsName = "test1";
    SparkConf sparkConf = new SparkConf().setAppName("WordCountKafkaStream");

    JavaStreamingContext javaStreamingContext = new JavaStreamingContext(sparkConf, new Duration(5000));

    Map<String, Integer> topicToBeUsedBySpark = new HashMap<>();
    String[] topics = topicsName.split(",");
    for (String topic : topics) {
        topicToBeUsedBySpark.put(topic, numThreads);
    }

    JavaPairReceiverInputDStream<String, String> streamMessages =
            KafkaUtils.createStream(javaStreamingContext, zkQuorum, groupName, topicToBeUsedBySpark);

    JavaDStream<String> lines = streamMessages.map(new Function<Tuple2<String, String>, String>() {
        @Override
        public String call(Tuple2<String, String> tuple2) {
            return tuple2._2();
        }
    });

    JavaDStream<String> words = lines.flatMap(new FlatMapFunction<String, String>() {
        @Override
        public Iterator<String> call(String x) {
            return Arrays.asList(WORD_DELIMETER.split(x)).iterator();
        }
    });

    JavaPairDStream<String, Integer> wordCounts = words.mapToPair(
            new PairFunction<String, String, Integer>() {
                @Override
                public Tuple2<String, Integer> call(String s) {
                    return new Tuple2<>(s, 1);
                }
            }).reduceByKey(new Function2<Integer, Integer, Integer>() {
                @Override
                public Integer call(Integer i1, Integer i2) {
                    return i1 + i2;
                }
            });

    wordCounts.print();
    javaStreamingContext.start();
    javaStreamingContext.awaitTermination();
}
Example #6
Source File: StreamingContextConfiguration.java From Decision with Apache License 2.0
private void configureActionContext(JavaStreamingContext context) {
    Map<String, Integer> baseTopicMap = new HashMap<>();

    String topicName = InternalTopic.TOPIC_ACTION.getTopicName();
    if (configurationContext.isClusteringEnabled() && configurationContext.getGroupId() != null) {
        topicName = topicName.concat("_").concat(configurationContext.getGroupId());
    }
    baseTopicMap.put(topicName, 1);

    kafkaTopicService.createTopicIfNotExist(topicName, configurationContext.getKafkaReplicationFactor(),
            configurationContext.getKafkaPartitions());

    HashMap<String, String> kafkaParams = new HashMap<>();
    kafkaParams.put("zookeeper.connect", configurationContext.getZookeeperHostsQuorumWithPath());
    kafkaParams.put("group.id", configurationContext.getGroupId());

    /*
     groupId must be the cluster groupId. Kafka assigns each partition of a topic to one, and only one,
     consumer of the group. Decision topics have only one partition (by default), so if we have two or more
     Decision instances (consumers) reading the same topic with the same groupId, only one instance will be
     able to read from the topic.
     */
    JavaPairDStream<String, byte[]> messages = KafkaUtils.createStream(context, String.class, byte[].class,
            kafka.serializer.StringDecoder.class, kafka.serializer.DefaultDecoder.class, kafkaParams, baseTopicMap,
            StorageLevel.MEMORY_AND_DISK_SER());

    AvroDeserializeMessageFunction avroDeserializeMessageFunction = new AvroDeserializeMessageFunction();
    JavaDStream<StratioStreamingMessage> parsedDataDstream = messages.map(avroDeserializeMessageFunction);

    JavaPairDStream<StreamAction, StratioStreamingMessage> pairedDataDstream = parsedDataDstream
            .mapPartitionsToPair(new PairDataFunction());

    JavaPairDStream<StreamAction, Iterable<StratioStreamingMessage>> groupedDataDstream = pairedDataDstream
            .groupByKey();

    groupedDataDstream.persist(StorageLevel.MEMORY_AND_DISK_SER());

    try {
        SaveToCassandraActionExecutionFunction saveToCassandraActionExecutionFunction =
                new SaveToCassandraActionExecutionFunction(configurationContext.getCassandraHostsQuorum(),
                        configurationContext.getCassandraPort(), configurationContext.getCassandraMaxBatchSize(),
                        configurationContext.getCassandraBatchType(), saveToCassandraOperationsService);
        if (saveToCassandraActionExecutionFunction.check()) {
            log.info("Cassandra is configured properly");
            groupedDataDstream.filter(new FilterDataFunction(StreamAction.SAVE_TO_CASSANDRA)).foreachRDD(
                    saveToCassandraActionExecutionFunction);
        } else {
            log.warn("Cassandra is NOT configured properly");
        }

        SaveToMongoActionExecutionFunction saveToMongoActionExecutionFunction =
                new SaveToMongoActionExecutionFunction(configurationContext.getMongoHosts(),
                        configurationContext.getMongoUsername(), configurationContext.getMongoPassword(),
                        configurationContext.getMongoMaxBatchSize(), mongoClient, mongoDB);
        if (saveToMongoActionExecutionFunction.check()) {
            log.info("MongoDB is configured properly");
            groupedDataDstream.filter(new FilterDataFunction(StreamAction.SAVE_TO_MONGO)).foreachRDD(
                    saveToMongoActionExecutionFunction);
        } else {
            log.warn("MongoDB is NOT configured properly");
        }

        SaveToElasticSearchActionExecutionFunction saveToElasticSearchActionExecutionFunction =
                new SaveToElasticSearchActionExecutionFunction(configurationContext.getElasticSearchHosts(),
                        configurationContext.getElasticSearchClusterName(),
                        configurationContext.getElasticSearchMaxBatchSize(), elasticsearchClient);
        if (saveToElasticSearchActionExecutionFunction.check()) {
            log.info("ElasticSearch is configured properly");
            groupedDataDstream.filter(new FilterDataFunction(StreamAction.SAVE_TO_ELASTICSEARCH)).foreachRDD(
                    saveToElasticSearchActionExecutionFunction);
        } else {
            log.warn("ElasticSearch is NOT configured properly");
        }

        SaveToSolrActionExecutionFunction saveToSolrActionExecutionFunction =
                new SaveToSolrActionExecutionFunction(configurationContext.getSolrHost(),
                        configurationContext.getSolrCloudZkHost(), configurationContext.getSolrCloud(),
                        configurationContext.getSolrDataDir(), configurationContext.getSolrMaxBatchSize(),
                        solrOperationsService);
        if (saveToSolrActionExecutionFunction.check()) {
            log.info("Solr is configured properly");
            groupedDataDstream.filter(new FilterDataFunction(StreamAction.SAVE_TO_SOLR)).foreachRDD(
                    saveToSolrActionExecutionFunction);
        } else {
            log.warn("Solr is NOT configured properly");
        }

        groupedDataDstream.filter(new FilterDataFunction(StreamAction.LISTEN)).foreachRDD(
                new SendToKafkaActionExecutionFunction(configurationContext.getKafkaHostsQuorum()));
    } catch (Exception e) {
        e.printStackTrace();
    }
}
Example #7
Source File: KafkaStreamRestHandler.java From elasticsearch-rest-command with The Unlicense
@Override
protected void handleRequest(RestRequest request, RestChannel channel, Client client) throws Exception {
    final String topic = request.param("topic", "");
    final boolean schema = request.paramAsBoolean("schema", false);
    final String master = request.param("masterAddress", "local");
    final String hdfs = request.param("hdfs", "hdfs://localhost:50070");
    final String memory = request.param("memory", "2g");
    final String appName = request.param("appName", "appName-" + topic);
    final int duration = request.paramAsInt("duration", 1000);

    Thread exec = new Thread(new Runnable() {
        @Override
        public void run() {
            SparkConf sparkConf = new SparkConf().setAppName(appName).setMaster(master)
                    .set("spark.executor.memory", memory);
            JavaStreamingContext jssc = new JavaStreamingContext(sparkConf, new Duration(duration));

            Map<String, Integer> topicMap = new HashMap<String, Integer>();
            topicMap.put(topic, 3);

            JavaPairReceiverInputDStream<String, byte[]> kafkaStream =
                    KafkaUtils.createStream(jssc, String.class, byte[].class,
                            kafka.serializer.DefaultDecoder.class, kafka.serializer.DefaultDecoder.class,
                            null, topicMap, StorageLevel.MEMORY_ONLY());

            //JobConf confHadoop = new JobConf();
            //confHadoop.set("mapred.output.compress", "true");
            //confHadoop.set("mapred.output.compression.codec", "com.hadoop.compression.lzo.LzopCodec");

            kafkaStream.saveAsHadoopFiles(hdfs, "seq", Text.class, BytesWritable.class,
                    KafkaStreamSeqOutputFormat.class);

            topicContextMap.put(topic, jssc);
            jssc.start();
            jssc.awaitTermination();
        }
    });

    exec.start();

    channel.sendResponse(new BytesRestResponse(RestStatus.OK, String.format("{\"topic\":\"%s\"}", topic)));
}
Example #8
Source File: IoTDataProcessor.java From iot-traffic-monitor with Apache License 2.0
public static void main(String[] args) throws Exception {
    // read Spark and Cassandra properties and create SparkConf
    Properties prop = PropertyFileReader.readPropertyFile();
    SparkConf conf = new SparkConf()
            .setAppName(prop.getProperty("com.iot.app.spark.app.name"))
            .setMaster(prop.getProperty("com.iot.app.spark.master"))
            .set("spark.cassandra.connection.host", prop.getProperty("com.iot.app.cassandra.host"))
            .set("spark.cassandra.connection.port", prop.getProperty("com.iot.app.cassandra.port"))
            .set("spark.cassandra.connection.keep_alive_ms", prop.getProperty("com.iot.app.cassandra.keep_alive"));
    // batch interval of 5 seconds for incoming stream
    JavaStreamingContext jssc = new JavaStreamingContext(conf, Durations.seconds(5));
    // add check point directory
    jssc.checkpoint(prop.getProperty("com.iot.app.spark.checkpoint.dir"));

    // read and set Kafka properties
    Map<String, String> kafkaParams = new HashMap<String, String>();
    kafkaParams.put("zookeeper.connect", prop.getProperty("com.iot.app.kafka.zookeeper"));
    kafkaParams.put("metadata.broker.list", prop.getProperty("com.iot.app.kafka.brokerlist"));
    String topic = prop.getProperty("com.iot.app.kafka.topic");
    Set<String> topicsSet = new HashSet<String>();
    topicsSet.add(topic);

    // create direct kafka stream
    JavaPairInputDStream<String, IoTData> directKafkaStream = KafkaUtils.createDirectStream(
            jssc,
            String.class,
            IoTData.class,
            StringDecoder.class,
            IoTDataDecoder.class,
            kafkaParams,
            topicsSet
    );
    logger.info("Starting Stream Processing");

    // We need non filtered stream for poi traffic data calculation
    JavaDStream<IoTData> nonFilteredIotDataStream = directKafkaStream.map(tuple -> tuple._2());

    // We need filtered stream for total and traffic data calculation
    JavaPairDStream<String, IoTData> iotDataPairStream = nonFilteredIotDataStream
            .mapToPair(iot -> new Tuple2<String, IoTData>(iot.getVehicleId(), iot))
            .reduceByKey((a, b) -> a);

    // Check vehicle Id is already processed
    JavaMapWithStateDStream<String, IoTData, Boolean, Tuple2<IoTData, Boolean>> iotDStreamWithStatePairs =
            iotDataPairStream.mapWithState(
                    StateSpec.function(processedVehicleFunc).timeout(Durations.seconds(3600))); // maintain state for one hour

    // Filter processed vehicle ids and keep un-processed
    JavaDStream<Tuple2<IoTData, Boolean>> filteredIotDStreams = iotDStreamWithStatePairs.map(tuple2 -> tuple2)
            .filter(tuple -> tuple._2.equals(Boolean.FALSE));

    // Get stream of IoTdata
    JavaDStream<IoTData> filteredIotDataStream = filteredIotDStreams.map(tuple -> tuple._1);

    // cache stream as it is used in total and window based computation
    filteredIotDataStream.cache();

    // process data
    IoTTrafficDataProcessor iotTrafficProcessor = new IoTTrafficDataProcessor();
    iotTrafficProcessor.processTotalTrafficData(filteredIotDataStream);
    iotTrafficProcessor.processWindowTrafficData(filteredIotDataStream);

    // poi data
    POIData poiData = new POIData();
    poiData.setLatitude(33.877495);
    poiData.setLongitude(-95.50238);
    poiData.setRadius(30); // 30 km

    // broadcast variables. We will monitor vehicles on Route 37 which are of type Truck
    Broadcast<Tuple3<POIData, String, String>> broadcastPOIValues =
            jssc.sparkContext().broadcast(new Tuple3<>(poiData, "Route-37", "Truck"));
    // call method to process stream
    iotTrafficProcessor.processPOIData(nonFilteredIotDataStream, broadcastPOIValues);

    // start context
    jssc.start();
    jssc.awaitTermination();
}
Example #9
Source File: AppMain.java From SparkToParquet with Apache License 2.0
public static void main(String[] args) throws IOException {
    Flags.setFromCommandLineArgs(THE_OPTIONS, args);

    // initialize the Spark conf
    SparkConf conf = new SparkConf().setAppName("A SECTONG Application: Apache Log Analysis with Spark");
    JavaSparkContext sc = new JavaSparkContext(conf);
    JavaStreamingContext jssc = new JavaStreamingContext(sc, Flags.getInstance().getSlideInterval());
    SQLContext sqlContext = new SQLContext(sc);

    // initialize parameters
    HashSet<String> topicsSet = new HashSet<String>(Arrays.asList(Flags.getInstance().getKafka_topic().split(",")));
    HashMap<String, String> kafkaParams = new HashMap<String, String>();
    kafkaParams.put("metadata.broker.list", Flags.getInstance().getKafka_broker());

    // read data from the Kafka stream
    JavaPairInputDStream<String, String> messages = KafkaUtils.createDirectStream(jssc, String.class, String.class,
            StringDecoder.class, StringDecoder.class, kafkaParams, topicsSet);

    JavaDStream<String> lines = messages.map(new Function<Tuple2<String, String>, String>() {
        private static final long serialVersionUID = 5266880065425088203L;

        public String call(Tuple2<String, String> tuple2) {
            return tuple2._2();
        }
    });

    JavaDStream<ApacheAccessLog> accessLogsDStream = lines.flatMap(line -> {
        List<ApacheAccessLog> list = new ArrayList<>();
        try {
            // parse each line
            list.add(ApacheAccessLog.parseFromLogLine(line));
            return list;
        } catch (RuntimeException e) {
            return list;
        }
    }).cache();

    accessLogsDStream.foreachRDD(rdd -> {
        // rdd to DataFrame
        DataFrame df = sqlContext.createDataFrame(rdd, ApacheAccessLog.class);
        // write out as Parquet files
        df.write().partitionBy("ipAddress", "method", "responseCode").mode(SaveMode.Append)
                .parquet(Flags.getInstance().getParquetFile());
        return null;
    });

    // start the streaming application
    jssc.start();            // start the computation
    jssc.awaitTermination(); // wait for termination
}
Example #10
Source File: JavaDirectKafkaWordCount.java From Building-Data-Streaming-Applications-with-Apache-Kafka with MIT License
public static void main(String[] args) throws Exception {
    String brokers = "localhost:9092";
    String topics = "test1";

    SparkConf sparkConf = new SparkConf().setMaster("local").setAppName("DirectKafkaWordCount");
    JavaStreamingContext javaStreamingContext = new JavaStreamingContext(sparkConf, Durations.seconds(2));

    Set<String> topicsSet = new HashSet<>(Arrays.asList(topics.split(",")));
    Map<String, String> kafkaConfiguration = new HashMap<>();
    kafkaConfiguration.put("metadata.broker.list", brokers);
    kafkaConfiguration.put("group.id", "stream_test8");
    kafkaConfiguration.put("auto.offset.reset", "smallest");

    JavaPairInputDStream<String, String> messages = KafkaUtils.createDirectStream(
            javaStreamingContext,
            String.class,
            String.class,
            StringDecoder.class,
            StringDecoder.class,
            kafkaConfiguration,
            topicsSet
    );

    JavaDStream<String> lines = messages.map(Tuple2::_2);

    JavaDStream<String> words = lines.flatMap(x -> Arrays.asList(SPACE.split(x)).iterator());

    JavaPairDStream<String, Integer> wordCounts = words.mapToPair(s -> new Tuple2<>(s, 1))
            .reduceByKey((i1, i2) -> i1 + i2);

    //wordCounts.dstream().saveAsTextFiles("hdfs://10.200.99.197:8020/user/chanchal.singh/wordCounts", "result");
    wordCounts.print();

    javaStreamingContext.start();
    javaStreamingContext.awaitTermination();
}
Example #11
Source File: FraudDetectionApp.java From Building-Data-Streaming-Applications-with-Apache-Kafka with MIT License
public static void main(String[] args) throws Exception {
    String brokers = "localhost:9092";
    String topics = "iplog";
    CacheIPLookup cacheIPLookup = new CacheIPLookup();

    SparkConf sparkConf = new SparkConf().setAppName("IP_FRAUD");
    JavaStreamingContext javaStreamingContext = new JavaStreamingContext(sparkConf, Durations.seconds(2));

    Set<String> topicsSet = new HashSet<>(Arrays.asList(topics.split(",")));
    Map<String, String> kafkaConfiguration = new HashMap<>();
    kafkaConfiguration.put("metadata.broker.list", brokers);
    kafkaConfiguration.put("group.id", "ipfraud");
    kafkaConfiguration.put("auto.offset.reset", "smallest");

    JavaPairInputDStream<String, String> messages = KafkaUtils.createDirectStream(
            javaStreamingContext,
            String.class,
            String.class,
            StringDecoder.class,
            StringDecoder.class,
            kafkaConfiguration,
            topicsSet
    );

    JavaDStream<String> lines = messages.map(Tuple2::_2);

    JavaDStream<String> fraudIPs = lines.filter(new Function<String, Boolean>() {
        @Override
        public Boolean call(String s) throws Exception {
            String IP = s.split(" ")[0];
            String[] ranges = IP.split("\\.");
            String range = null;
            try {
                range = ranges[0] + "." + ranges[1];
            } catch (ArrayIndexOutOfBoundsException ex) {
            }
            return cacheIPLookup.isFraudIP(range);
        }
    });

    DStream<String> fraudDstream = fraudIPs.dstream();
    fraudDstream.saveAsTextFiles("FraudRecord", "");

    javaStreamingContext.start();
    javaStreamingContext.awaitTermination();
}
Example #12
Source File: AdClickRealTimeStatSpark.java From BigDataPlatform with GNU General Public License v3.0
public static void main(String[] args) throws InterruptedException {
    // build the Spark Streaming context
    SparkConf conf = new SparkConf()
            .setMaster("local[2]")
            .setAppName("AdClickRealTimeStatSpark");
            // .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
            // .set("spark.default.parallelism", "1000");
            // .set("spark.streaming.blockInterval", "50");
            // .set("spark.streaming.receiver.writeAheadLog.enable", "true");

    // The Spark Streaming context is a JavaStreamingContext object, not a JavaSparkContext or
    // SQLContext/HiveContext as in the earlier modules. The first constructor argument is a SparkConf,
    // just like the other Spark contexts; the second argument is specific to Spark Streaming jobs:
    // the batch interval. Spark Streaming periodically collects the data that has arrived from the
    // source (Kafka), bundles it into a batch, and processes one batch at a time, so the batch interval
    // is simply how often data is collected from the source before being processed. Most Spark
    // Streaming applications use an interval of a few seconds to a few tens of seconds (rarely more
    // than one minute). In this project we use a 5-second batch interval: every 5 seconds the job
    // processes the data received from the source during the last 5 seconds.
    JavaStreamingContext jssc = new JavaStreamingContext(conf, Durations.seconds(5));
    jssc.checkpoint("hdfs://120.77.155.220:9000/streaming_checkpoint");

    // Now the actual code: implement the real-time computation business logic.
    // Create the input DStream for the Kafka data source (a DStream is an abstraction over a
    // continuous, unbounded source of data). Use the Kafka direct API (it has many advantages,
    // including internally adapting the amount of data received per batch).

    // Build the Kafka parameter map; the main entry is the address list of the Kafka broker
    // cluster to connect to.
    Map<String, String> kafkaParams = new HashMap<String, String>();
    kafkaParams.put("metadata.broker.list",
            ConfigurationManager.getProperty(Constants.KAFKA_METADATA_BROKER_LIST));

    // build the topic set
    String kafkaTopics = ConfigurationManager.getProperty(Constants.KAFKA_TOPICS);
    String[] kafkaTopicsSplited = kafkaTopics.split(",");

    Set<String> topics = new HashSet<String>();
    for (String kafkaTopic : kafkaTopicsSplited) {
        topics.add(kafkaTopic);
    }

    // Using the Kafka direct API, build an input DStream for the specified topics of the Kafka cluster.
    // Each element is a pair (val1, val2); val1 carries no special meaning, while val2 contains the
    // individual real-time log lines from the Kafka topic.
    JavaPairInputDStream<String, String> adRealTimeLogDStream = KafkaUtils.createDirectStream(
            jssc,
            String.class,
            String.class,
            StringDecoder.class,
            StringDecoder.class,
            kafkaParams,
            topics);

    // adRealTimeLogDStream.repartition(1000);

    // filter the data against the dynamic blacklist
    JavaPairDStream<String, String> filteredAdRealTimeLogDStream = filterByBlacklist(adRealTimeLogDStream);

    // generate the dynamic blacklist
    generateDynamicBlacklist(filteredAdRealTimeLogDStream);

    // Business function 1: compute real-time ad click statistics
    // (yyyyMMdd_province_city_adid, clickCount) -- the coarsest granularity
    JavaPairDStream<String, Long> adRealTimeStatDStream = calculateRealTimeStat(
            filteredAdRealTimeLogDStream);

    // Business function 2: compute, in real time, the daily top 3 ads per province -- a finer granularity
    calculateProvinceTop3Ad(adRealTimeStatDStream);

    // Business function 3: compute, in real time, each ad's click trend over a sliding one-hour window
    // (clicks per minute) -- the finest granularity: for every ad we can see its per-minute click
    // counts over the last hour.
    calculateAdClickCountByWindow(adRealTimeLogDStream);

    // after building the Spark Streaming context, remember to start it, wait for it to finish, and close it
    jssc.start();
    jssc.awaitTermination();
    jssc.close();
}
Example #13
Source File: JavaKafkaDirectWordCount.java From SparkDemo with MIT License
/**
 * 1. One-to-one mapping between Kafka partitions and RDD partitions
 * 2. Efficient
 * 3. Exactly-once processing
 *
 * @param args
 */
public static void main(String[] args) {
    StreamingExamples.setStreamingLogLevels();

    SparkConf sparkConf = new SparkConf().setAppName("JavaKafkaDirectWordCount").setMaster("local[1]");
    JavaStreamingContext jssc = new JavaStreamingContext(sparkConf, Durations.seconds(6));

    Map<String, String> kafkaParams = new HashMap<String, String>();
    kafkaParams.put("metadata.broker.list", "master:9092,slave1:9092,slave2:9092"); // where the brokers are

    HashSet<String> topicsSet = new HashSet<String>();
    topicsSet.add("2017-7-26"); // the topic to consume

    // Create direct kafka stream with brokers and topics createDirectStream()
    JavaPairInputDStream<String, String> messages = KafkaUtils.createDirectStream(
            jssc,
            String.class,
            String.class,
            StringDecoder.class,
            StringDecoder.class,
            kafkaParams,
            topicsSet
    );

    JavaDStream<String> lines = messages.map(new Function<Tuple2<String, String>, String>() {
        @Override
        public String call(Tuple2<String, String> tuple2) {
            return tuple2._2();
        }
    });

    JavaDStream<String> words = lines.flatMap(new FlatMapFunction<String, String>() {
        @Override
        public Iterator<String> call(String x) {
            return Lists.newArrayList(SPACE.split(x)).iterator();
        }
    });

    JavaPairDStream<String, Integer> wordCounts = words.mapToPair(new PairFunction<String, String, Integer>() {
        @Override
        public Tuple2<String, Integer> call(String s) {
            return new Tuple2<String, Integer>(s, 1);
        }
    }).reduceByKey(new Function2<Integer, Integer, Integer>() {
        @Override
        public Integer call(Integer i1, Integer i2) {
            return i1 + i2;
        }
    });

    wordCounts.print();

    jssc.start();
    try {
        jssc.awaitTermination();
    } catch (Exception e) {
        e.printStackTrace();
    }
}
Example #14
Source File: JavaKafkaReceiverWordCount.java From SparkDemo with MIT License
public static void main(String[] args) {
    StreamingExamples.setStreamingLogLevels();

    SparkConf sparkConf = new SparkConf().setAppName("JavaKafkaReceiverWordCount").setMaster("local[4]");
    JavaStreamingContext jssc = new JavaStreamingContext(sparkConf, Durations.seconds(6));

    Map<String, Integer> topicMap = new HashMap<String, Integer>(); // key is the topic name, value is the number of threads
    topicMap.put("2017-7-26", 1);

    String zookeeperList = "master:2181,slave1:2181,slave2:2181";

    JavaPairReceiverInputDStream<String, String> messages = KafkaUtils.createStream(jssc, zookeeperList,
            "JavaKafkaReceiverWordCount", topicMap);

    JavaDStream<String> lines = messages.map(new Function<Tuple2<String, String>, String>() {
        @Override
        public String call(Tuple2<String, String> tuple2) {
            return tuple2._2();
        }
    });

    JavaDStream<String> words = lines.flatMap(new FlatMapFunction<String, String>() {
        @Override
        public Iterator<String> call(String x) {
            return Lists.newArrayList(SPACE.split(x)).iterator();
        }
    });

    JavaPairDStream<String, Integer> wordCounts = words.mapToPair(new PairFunction<String, String, Integer>() {
        @Override
        public Tuple2<String, Integer> call(String s) {
            return new Tuple2<String, Integer>(s, 1);
        }
    }).reduceByKey(new Function2<Integer, Integer, Integer>() {
        @Override
        public Integer call(Integer i1, Integer i2) {
            return i1 + i2;
        }
    });

    wordCounts.print();

    jssc.start();
    try {
        jssc.awaitTermination();
    } catch (Exception e) {
        e.printStackTrace();
    }
}
Example #15
Source File: JavaKafkaWordCount.java From SparkDemo with MIT License
public static void main(String[] args) throws Exception {
    if (args.length < 4) {
        System.err.println("Usage: JavaKafkaWordCount <zkQuorum> <group> <topics> <numThreads>");
        System.exit(1);
    }

    StreamingExamples.setStreamingLogLevels();
    SparkConf sparkConf = new SparkConf().setAppName("JavaKafkaWordCount");
    // Create the context with 2 seconds batch size
    JavaStreamingContext jssc = new JavaStreamingContext(sparkConf, new Duration(2000));

    int numThreads = Integer.parseInt(args[3]);
    Map<String, Integer> topicMap = new HashMap<>();
    String[] topics = args[2].split(",");
    for (String topic : topics) {
        topicMap.put(topic, numThreads);
    }

    JavaPairReceiverInputDStream<String, String> messages =
            KafkaUtils.createStream(jssc, args[0], args[1], topicMap);

    JavaDStream<String> lines = messages.map(new Function<Tuple2<String, String>, String>() {
        @Override
        public String call(Tuple2<String, String> tuple2) {
            return tuple2._2();
        }
    });

    JavaDStream<String> words = lines.flatMap(new FlatMapFunction<String, String>() {
        @Override
        public Iterator<String> call(String x) {
            return Arrays.asList(SPACE.split(x)).iterator();
        }
    });

    JavaPairDStream<String, Integer> wordCounts = words.mapToPair(
            new PairFunction<String, String, Integer>() {
                @Override
                public Tuple2<String, Integer> call(String s) {
                    return new Tuple2<>(s, 1);
                }
            }).reduceByKey(new Function2<Integer, Integer, Integer>() {
                @Override
                public Integer call(Integer i1, Integer i2) {
                    return i1 + i2;
                }
            });

    wordCounts.print();
    jssc.start();
    jssc.awaitTermination();
}
Example #16
Source File: JavaDirectKafkaWordCount.java From SparkDemo with MIT License
public static void main(String[] args) throws Exception {
    if (args.length < 2) {
        System.err.println("Usage: JavaDirectKafkaWordCount <brokers> <topics>\n" +
                " <brokers> is a list of one or more Kafka brokers\n" +
                " <topics> is a list of one or more kafka topics to consume from\n\n");
        System.exit(1);
    }

    StreamingExamples.setStreamingLogLevels();

    String brokers = args[0];
    String topics = args[1];

    // Create context with a 2 seconds batch interval
    SparkConf sparkConf = new SparkConf().setAppName("JavaDirectKafkaWordCount");
    JavaStreamingContext jssc = new JavaStreamingContext(sparkConf, Durations.seconds(2));

    Set<String> topicsSet = new HashSet<>(Arrays.asList(topics.split(",")));
    Map<String, String> kafkaParams = new HashMap<>();
    kafkaParams.put("metadata.broker.list", brokers);

    // Create direct kafka stream with brokers and topics
    JavaPairInputDStream<String, String> messages = KafkaUtils.createDirectStream(
            jssc,
            String.class,
            String.class,
            StringDecoder.class,
            StringDecoder.class,
            kafkaParams,
            topicsSet
    );

    // Get the lines, split them into words, count the words and print
    JavaDStream<String> lines = messages.map(new Function<Tuple2<String, String>, String>() {
        @Override
        public String call(Tuple2<String, String> tuple2) {
            return tuple2._2();
        }
    });
    JavaDStream<String> words = lines.flatMap(new FlatMapFunction<String, String>() {
        @Override
        public Iterator<String> call(String x) {
            return Arrays.asList(SPACE.split(x)).iterator();
        }
    });
    JavaPairDStream<String, Integer> wordCounts = words.mapToPair(
            new PairFunction<String, String, Integer>() {
                @Override
                public Tuple2<String, Integer> call(String s) {
                    return new Tuple2<>(s, 1);
                }
            }).reduceByKey(
            new Function2<Integer, Integer, Integer>() {
                @Override
                public Integer call(Integer i1, Integer i2) {
                    return i1 + i2;
                }
            });
    wordCounts.print();

    // Start the computation
    jssc.start();
    jssc.awaitTermination();
}
Example #17
Source File: KafkaSource08.java From sylph with Apache License 2.0
public JavaDStream<Row> createSource(JavaStreamingContext ssc, KafkaSourceConfig08 config, SourceContext context) {
    String topics = requireNonNull(config.getTopics(), "topics not setting");
    String brokers = requireNonNull(config.getBrokers(), "brokers not setting"); // the cluster hosts must be resolvable from the machine running this program
    String groupId = requireNonNull(config.getGroupid(), "group.id not setting"); // the consumer group name
    String offsetMode = requireNonNull(config.getOffsetMode(), "offsetMode not setting");

    Map<String, String> otherConfig = config.getOtherConfig().entrySet()
            .stream()
            .filter(x -> x.getValue() != null)
            .collect(Collectors.toMap(Map.Entry::getKey, v -> v.getValue().toString()));

    Map<String, String> kafkaParams = new HashMap<>(otherConfig);
    kafkaParams.put(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, brokers);
    //kafkaParams.put("auto.commit.enable", true); // do not auto-commit offsets
    // "fetch.message.max.bytes" ->
    // "session.timeout.ms" -> "30000",   // session defaults to 30 seconds
    // "heartbeat.interval.ms" -> "5000", // heartbeat period
    kafkaParams.put(ConsumerConfig.GROUP_ID_CONFIG, groupId); // note: different streams must use different group.id values, otherwise offset commits will fail
    kafkaParams.put(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, offsetMode); // largest / smallest

    //----get fromOffsets
    @SuppressWarnings("unchecked")
    scala.collection.immutable.Map<String, String> map =
            (scala.collection.immutable.Map<String, String>) Map$.MODULE$.apply(
                    JavaConverters.mapAsScalaMapConverter(kafkaParams).asScala().toSeq());
    final KafkaCluster kafkaCluster = new KafkaCluster(map);
    Map<TopicAndPartition, Long> fromOffsets = getFromOffset(kafkaCluster, topics, groupId);

    //--- createDirectStream  DirectKafkaInputDStream.class
    org.apache.spark.api.java.function.Function<MessageAndMetadata<byte[], byte[]>, ConsumerRecord<byte[], byte[]>> messageHandler =
            mmd -> new ConsumerRecord<>(mmd.topic(), mmd.partition(), mmd.key(), mmd.message(), mmd.offset());
    @SuppressWarnings("unchecked")
    Class<ConsumerRecord<byte[], byte[]>> recordClass =
            (Class<ConsumerRecord<byte[], byte[]>>) ClassTag$.MODULE$.<ConsumerRecord<byte[], byte[]>>apply(ConsumerRecord.class).runtimeClass();
    JavaInputDStream<ConsumerRecord<byte[], byte[]>> inputStream = KafkaUtils.createDirectStream(ssc,
            byte[].class, byte[].class, DefaultDecoder.class, DefaultDecoder.class, recordClass,
            kafkaParams, fromOffsets,
            messageHandler
    );
    JavaDStream<ConsumerRecord<byte[], byte[]>> dStream = settingCommit(inputStream, kafkaParams, kafkaCluster, groupId);

    if ("json".equalsIgnoreCase(config.getValueType())) {
        JsonSchema jsonParser = new JsonSchema(context.getSchema());
        return dStream
                .map(record -> {
                    return jsonParser.deserialize(record.key(), record.value(), record.topic(), record.partition(), record.offset());
                });
    }
    else {
        StructType structType = schemaToSparkType(context.getSchema());
        return dStream
                .map(record -> {
                    String[] names = structType.names();
                    Object[] values = new Object[names.length];
                    for (int i = 0; i < names.length; i++) {
                        switch (names[i]) {
                            case "_topic":
                                values[i] = record.topic();
                                continue;
                            case "_message":
                                values[i] = new String(record.value(), UTF_8);
                                continue;
                            case "_key":
                                values[i] = new String(record.key(), UTF_8);
                                continue;
                            case "_partition":
                                values[i] = record.partition();
                                continue;
                            case "_offset":
                                values[i] = record.offset();
                            default:
                                values[i] = null;
                        }
                    }
                    return (Row) new GenericRowWithSchema(values, structType);
                }); //.window(Duration(10 * 1000))
    }
}
Example #18
Source File: SparkKafkaTest.java From BigDataPlatform with GNU General Public License v3.0
public static void main(String[] args) throws Exception {
    if (args.length < 4) {
        System.err.println("Usage: JavaKafkaWordCount <zkQuorum> <group> <topics> <numThreads>");
        System.exit(1);
    }

    SparkConf conf = new SparkConf()
            .setMaster("local[2]")
            .setAppName("JavaKafkaWordCount");

    // In this project we use a 5-second batch interval: every 5 seconds the Spark Streaming job
    // processes the data received from the source during the last 5 seconds.
    JavaStreamingContext jssc = new JavaStreamingContext(conf, Durations.seconds(5));
    jssc.checkpoint("hdfs://Master:9000/streaming_checkpoint");

    // Now the actual code: implement the real-time computation business logic.
    // Create the input DStream for the Kafka data source (a DStream is an abstraction over a
    // continuous, unbounded source of data). Use the Kafka direct API (it has many advantages,
    // including internally adapting the amount of data received per batch).

    // Build the Kafka parameter map; the main entry is the address list of the Kafka broker
    // cluster to connect to.
    Map<String, String> kafkaParams = new HashMap<String, String>();
    kafkaParams.put("metadata.broker.list",
            ConfigurationManager.getProperty(Constants.KAFKA_METADATA_BROKER_LIST));

    // build the topic set
    String kafkaTopics = "streamingtopic"; //ConfigurationManager.getProperty("streamingtopic");
    String[] kafkaTopicsSplited = kafkaTopics.split(",");

    Set<String> topics = new HashSet<String>();
    for (String kafkaTopic : kafkaTopicsSplited) {
        topics.add(kafkaTopic);
    }

    // Using the Kafka direct API, build an input DStream for the specified topics of the Kafka cluster.
    // Each element is a pair; the first value carries no special meaning, while the second value
    // contains the individual real-time log lines from the Kafka topic.
    JavaPairInputDStream<String, String> adRealTimeLogDStream = KafkaUtils.createDirectStream(
            jssc,
            String.class,
            String.class,
            StringDecoder.class,
            StringDecoder.class,
            kafkaParams,
            topics);

    JavaDStream<String> lines = adRealTimeLogDStream.map(Tuple2::_2);

    JavaDStream<String> words = lines.flatMap(x -> Arrays.asList(SPACE.split(x)).iterator());

    JavaPairDStream<String, Integer> wordCounts = words.mapToPair(s -> new Tuple2<>(s, 1))
            .reduceByKey((i1, i2) -> i1 + i2);

    wordCounts.print();

    jssc.start();
    jssc.awaitTermination();
}
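Note that org.apache.spark.streaming.kafka is the old Kafka 0.8 integration; it was deprecated in the Spark 2.3 line and removed in Spark 3.x. For comparison only, a minimal sketch of the equivalent call with the 0.10 integration (org.apache.spark.streaming.kafka010) might look like the following; the broker address, group id, and topic name are placeholders, and this class is not taken from any of the projects above.

import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;

import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.kafka.common.serialization.StringDeserializer;
import org.apache.spark.SparkConf;
import org.apache.spark.streaming.Durations;
import org.apache.spark.streaming.api.java.JavaDStream;
import org.apache.spark.streaming.api.java.JavaInputDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
import org.apache.spark.streaming.kafka010.ConsumerStrategies;
import org.apache.spark.streaming.kafka010.KafkaUtils;
import org.apache.spark.streaming.kafka010.LocationStrategies;

public class Kafka010Sketch {
    public static void main(String[] args) throws InterruptedException {
        SparkConf conf = new SparkConf().setAppName("Kafka010Sketch").setMaster("local[2]");
        JavaStreamingContext jssc = new JavaStreamingContext(conf, Durations.seconds(5));

        Map<String, Object> kafkaParams = new HashMap<>();
        kafkaParams.put("bootstrap.servers", "localhost:9092");
        kafkaParams.put("key.deserializer", StringDeserializer.class);
        kafkaParams.put("value.deserializer", StringDeserializer.class);
        kafkaParams.put("group.id", "example-group");
        kafkaParams.put("auto.offset.reset", "latest");

        JavaInputDStream<ConsumerRecord<String, String>> stream = KafkaUtils.createDirectStream(
                jssc,
                LocationStrategies.PreferConsistent(),
                ConsumerStrategies.<String, String>Subscribe(Arrays.asList("test"), kafkaParams));

        // Elements are ConsumerRecord objects rather than Tuple2 pairs
        JavaDStream<String> lines = stream.map(ConsumerRecord::value);
        lines.print();

        jssc.start();
        jssc.awaitTermination();
    }
}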