org.apache.spark.streaming.kafka010.CanCommitOffsets Java Examples
The following examples show how to use
org.apache.spark.streaming.kafka010.CanCommitOffsets.
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: StreamingRsvpsDStreamCountWindow.java From -Data-Stream-Development-with-Apache-Spark-Kafka-and-Spring-Boot with MIT License | 6 votes |
public static void main(String[] args) throws InterruptedException { System.setProperty("hadoop.home.dir", HADOOP_HOME_DIR_VALUE); final SparkConf conf = new SparkConf() .setMaster(RUN_LOCAL_WITH_AVAILABLE_CORES) .setAppName(APPLICATION_NAME) .set("spark.mongodb.output.uri", MONGODB_OUTPUT_URI) .set("spark.streaming.kafka.consumer.cache.enabled", "false"); final JavaStreamingContext streamingContext = new JavaStreamingContext(conf, new Duration(BATCH_DURATION_INTERVAL_MS)); streamingContext.checkpoint(CHECKPOINT_FOLDER); final JavaInputDStream<ConsumerRecord<String, String>> meetupStream = KafkaUtils.createDirectStream( streamingContext, LocationStrategies.PreferConsistent(), ConsumerStrategies.<String, String>Subscribe(TOPICS, KAFKA_CONSUMER_PROPERTIES) ); // transformations, streaming algorithms, etc JavaDStream<Long> countStream = meetupStream.countByWindow( new Duration(WINDOW_LENGTH_MS), new Duration(SLIDING_INTERVAL_MS)); countStream.foreachRDD((JavaRDD<Long> countRDD) -> { MongoSpark.save( countRDD.map( r -> Document.parse("{\"rsvps_count\":\"" + String.valueOf(r) + "\"}") ) ); }); // some time later, after outputs have completed meetupStream.foreachRDD((JavaRDD<ConsumerRecord<String, String>> meetupRDD) -> { OffsetRange[] offsetRanges = ((HasOffsetRanges) meetupRDD.rdd()).offsetRanges(); ((CanCommitOffsets) meetupStream.inputDStream()) .commitAsync(offsetRanges, new MeetupOffsetCommitCallback()); }); streamingContext.start(); streamingContext.awaitTermination(); }
Example #2
Source File: DStreamUtil.java From sylph with Apache License 2.0 | 6 votes |
public static void dstreamAction(JavaDStream<Row> stream, Sink<JavaRDD<Row>> sink) { DStream<?> fristDStream = getFristDStream(stream.dstream()); logger.info("数据源驱动:{}", fristDStream.getClass().getName()); if ("DirectKafkaInputDStream".equals(fristDStream.getClass().getSimpleName())) { logger.info("发现job 数据源是kafka,将开启空job优化 且 自动上报offect"); stream.foreachRDD(rdd -> { RDD<?> kafkaRdd = getFristRdd(rdd.rdd()); //rdd.dependencies(0).rdd OffsetRange[] offsetRanges = ((HasOffsetRanges) kafkaRdd).offsetRanges(); if (kafkaRdd.count() > 0) { sink.run(rdd); //执行业务操作 } ((CanCommitOffsets) fristDStream).commitAsync(offsetRanges); }); } else { //非kafka数据源 暂时无法做任何优化 stream.foreachRDD(sink::run); } }
Example #3
Source File: SparkMLTrainingAndScoringOnline.java From -Data-Stream-Development-with-Apache-Spark-Kafka-and-Spring-Boot with MIT License | 4 votes |
public static void main(String[] args) throws InterruptedException { System.setProperty("hadoop.home.dir", HADOOP_HOME_DIR_VALUE); final SparkConf conf = new SparkConf() .setMaster(RUN_LOCAL_WITH_AVAILABLE_CORES) .setAppName(APPLICATION_NAME) .set("spark.sql.caseSensitive", CASE_SENSITIVE); JavaStreamingContext streamingContext = new JavaStreamingContext(conf, new Duration(BATCH_DURATION_INTERVAL_MS)); JavaInputDStream<ConsumerRecord<String, String>> meetupStream = KafkaUtils.createDirectStream( streamingContext, LocationStrategies.PreferConsistent(), ConsumerStrategies.<String, String>Subscribe(TOPICS, KAFKA_CONSUMER_PROPERTIES) ); JavaDStream<String> meetupStreamValues = meetupStream.map(v -> { return v.value(); }); // Prepare the training data as strings of type: (y,[x1,x2,x3,...,xn]) // Where n is the number of features, y is a binary label, // and n must be the same for train and test. // e.g. "(response, [group_lat, group_long])"; JavaDStream<String> trainData = meetupStreamValues.map(e -> { JSONParser jsonParser = new JSONParser(); JSONObject json = (JSONObject)jsonParser.parse(e); String result = "(" + (String.valueOf(json.get("response")).equals("yes") ? "1.0,[":"0.0,[") + ((JSONObject)json.get("group")).get("group_lat") + "," + ((JSONObject)json.get("group")).get("group_lon") + "])"; return result; }); trainData.print(); JavaDStream<LabeledPoint> labeledPoints = trainData.map(LabeledPoint::parse); StreamingLogisticRegressionWithSGD streamingLogisticRegressionWithSGD = new StreamingLogisticRegressionWithSGD() .setInitialWeights(Vectors.zeros(2)); streamingLogisticRegressionWithSGD.trainOn(labeledPoints); JavaPairDStream<Double, Vector> values = labeledPoints.mapToPair(f -> new Tuple2<>(f.label(), f.features())); streamingLogisticRegressionWithSGD.predictOnValues(values).print(); // some time later, after outputs have completed meetupStream.foreachRDD((JavaRDD<ConsumerRecord<String, String>> meetupRDD) -> { OffsetRange[] offsetRanges = ((HasOffsetRanges) meetupRDD.rdd()).offsetRanges(); ((CanCommitOffsets) meetupStream.inputDStream()) .commitAsync(offsetRanges, new MeetupOffsetCommitCallback()); }); streamingContext.start(); streamingContext.awaitTermination(); }
Example #4
Source File: StreamingRsvpsDStream.java From -Data-Stream-Development-with-Apache-Spark-Kafka-and-Spring-Boot with MIT License | 4 votes |
public static void main(String[] args) throws InterruptedException { System.setProperty("hadoop.home.dir", HADOOP_HOME_DIR_VALUE); final SparkConf conf = new SparkConf() .setMaster(RUN_LOCAL_WITH_AVAILABLE_CORES) .setAppName(APPLICATION_NAME) .set("spark.mongodb.output.uri", MONGODB_OUTPUT_URI); final JavaStreamingContext streamingContext = new JavaStreamingContext(conf, new Duration(BATCH_DURATION_INTERVAL_MS)); final JavaInputDStream<ConsumerRecord<String, String>> meetupStream = KafkaUtils.createDirectStream( streamingContext, LocationStrategies.PreferConsistent(), ConsumerStrategies.<String, String>Subscribe(TOPICS, KAFKA_CONSUMER_PROPERTIES) ); // transformations, streaming algorithms, etc JavaDStream<ConsumerRecord<String, String>> rsvpsWithGuestsStream = meetupStream.filter(f -> !f.value().contains("\"guests\":0")); rsvpsWithGuestsStream.foreachRDD((JavaRDD<ConsumerRecord<String, String>> r) -> { MongoSpark.save( r.map( e -> Document.parse(e.value()) ) ); }); // some time later, after outputs have completed meetupStream.foreachRDD((JavaRDD<ConsumerRecord<String, String>> meetupRDD) -> { OffsetRange[] offsetRanges = ((HasOffsetRanges) meetupRDD.rdd()).offsetRanges(); ((CanCommitOffsets) meetupStream.inputDStream()) .commitAsync(offsetRanges, new MeetupOffsetCommitCallback()); }); streamingContext.start(); streamingContext.awaitTermination(); }
Example #5
Source File: KafkaSource.java From sylph with Apache License 2.0 | 4 votes |
public JavaDStream<Row> createSource(JavaStreamingContext ssc, KafkaSourceConfig config, SourceContext context) { String topics = config.getTopics(); String brokers = config.getBrokers(); //需要把集群的host 配置到程序所在机器 String groupId = config.getGroupid(); //消费者的名字 String offsetMode = config.getOffsetMode(); Map<String, Object> kafkaParams = new HashMap<>(config.getOtherConfig()); kafkaParams.put("bootstrap.servers", brokers); kafkaParams.put("key.deserializer", ByteArrayDeserializer.class); //StringDeserializer kafkaParams.put("value.deserializer", ByteArrayDeserializer.class); //StringDeserializer kafkaParams.put("enable.auto.commit", false); //不自动提交偏移量 // "fetch.message.max.bytes" -> // "session.timeout.ms" -> "30000", //session默认是30秒 // "heartbeat.interval.ms" -> "5000", //10秒提交一次 心跳周期 kafkaParams.put("group.id", groupId); //注意不同的流 group.id必须要不同 否则会出现offect commit提交失败的错误 kafkaParams.put("auto.offset.reset", offsetMode); //latest earliest List<String> topicSets = Arrays.asList(topics.split(",")); JavaInputDStream<ConsumerRecord<byte[], byte[]>> inputStream = KafkaUtils.createDirectStream( ssc, LocationStrategies.PreferConsistent(), ConsumerStrategies.Subscribe(topicSets, kafkaParams)); DStream<ConsumerRecord<byte[], byte[]>> sylphKafkaOffset = new SylphKafkaOffset<ConsumerRecord<byte[], byte[]>>(inputStream.inputDStream()) { @Override public void commitOffsets(RDD<?> kafkaRdd) { OffsetRange[] offsetRanges = ((HasOffsetRanges) kafkaRdd).offsetRanges(); log().info("commitKafkaOffsets {}", (Object) offsetRanges); DStream<?> firstDStream = DStreamUtil.getFirstDStream(inputStream.dstream()); ((CanCommitOffsets) firstDStream).commitAsync(offsetRanges); } }; JavaDStream<ConsumerRecord<byte[], byte[]>> javaDStream = new JavaDStream<>(sylphKafkaOffset, ClassTag$.MODULE$.apply(ConsumerRecord.class)); if ("json".equalsIgnoreCase(config.getValueType())) { JsonSchema jsonParser = new JsonSchema(context.getSchema()); return javaDStream .map(record -> jsonParser.deserialize(record.key(), record.value(), record.topic(), record.partition(), record.offset())); } else { List<String> names = context.getSchema().getFieldNames(); return javaDStream .map(record -> { Object[] values = new Object[names.size()]; for (int i = 0; i < names.size(); i++) { switch (names.get(i)) { case "_topic": values[i] = record.topic(); continue; case "_message": values[i] = new String(record.value(), UTF_8); continue; case "_key": values[i] = record.key() == null ? null : new String(record.key(), UTF_8); continue; case "_partition": values[i] = record.partition(); continue; case "_offset": values[i] = record.offset(); case "_timestamp": values[i] = record.timestamp(); case "_timestampType": values[i] = record.timestampType().id; default: values[i] = null; } } return new GenericRow(values); //GenericRowWithSchema }); //.window(Duration(10 * 1000)) } }