org.apache.spark.streaming.kafka010.OffsetRange Java Examples
The following examples show how to use
org.apache.spark.streaming.kafka010.OffsetRange.
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: StreamingRsvpsDStreamCountWindow.java From -Data-Stream-Development-with-Apache-Spark-Kafka-and-Spring-Boot with MIT License | 6 votes |
public static void main(String[] args) throws InterruptedException { System.setProperty("hadoop.home.dir", HADOOP_HOME_DIR_VALUE); final SparkConf conf = new SparkConf() .setMaster(RUN_LOCAL_WITH_AVAILABLE_CORES) .setAppName(APPLICATION_NAME) .set("spark.mongodb.output.uri", MONGODB_OUTPUT_URI) .set("spark.streaming.kafka.consumer.cache.enabled", "false"); final JavaStreamingContext streamingContext = new JavaStreamingContext(conf, new Duration(BATCH_DURATION_INTERVAL_MS)); streamingContext.checkpoint(CHECKPOINT_FOLDER); final JavaInputDStream<ConsumerRecord<String, String>> meetupStream = KafkaUtils.createDirectStream( streamingContext, LocationStrategies.PreferConsistent(), ConsumerStrategies.<String, String>Subscribe(TOPICS, KAFKA_CONSUMER_PROPERTIES) ); // transformations, streaming algorithms, etc JavaDStream<Long> countStream = meetupStream.countByWindow( new Duration(WINDOW_LENGTH_MS), new Duration(SLIDING_INTERVAL_MS)); countStream.foreachRDD((JavaRDD<Long> countRDD) -> { MongoSpark.save( countRDD.map( r -> Document.parse("{\"rsvps_count\":\"" + String.valueOf(r) + "\"}") ) ); }); // some time later, after outputs have completed meetupStream.foreachRDD((JavaRDD<ConsumerRecord<String, String>> meetupRDD) -> { OffsetRange[] offsetRanges = ((HasOffsetRanges) meetupRDD.rdd()).offsetRanges(); ((CanCommitOffsets) meetupStream.inputDStream()) .commitAsync(offsetRanges, new MeetupOffsetCommitCallback()); }); streamingContext.start(); streamingContext.awaitTermination(); }
Example #2
Source File: DStreamUtil.java From sylph with Apache License 2.0 | 6 votes |
public static void dstreamAction(JavaDStream<Row> stream, Sink<JavaRDD<Row>> sink) { DStream<?> fristDStream = getFristDStream(stream.dstream()); logger.info("数据源驱动:{}", fristDStream.getClass().getName()); if ("DirectKafkaInputDStream".equals(fristDStream.getClass().getSimpleName())) { logger.info("发现job 数据源是kafka,将开启空job优化 且 自动上报offect"); stream.foreachRDD(rdd -> { RDD<?> kafkaRdd = getFristRdd(rdd.rdd()); //rdd.dependencies(0).rdd OffsetRange[] offsetRanges = ((HasOffsetRanges) kafkaRdd).offsetRanges(); if (kafkaRdd.count() > 0) { sink.run(rdd); //执行业务操作 } ((CanCommitOffsets) fristDStream).commitAsync(offsetRanges); }); } else { //非kafka数据源 暂时无法做任何优化 stream.foreachRDD(sink::run); } }
Example #3
Source File: UpdateOffsetsFn.java From oryx with Apache License 2.0 | 5 votes |
/**
 * Records, for every topic/partition covered by the given RDD, the offset up to which the
 * RDD reads, and stores those offsets through {@code KafkaUtils.setOffsets}.
 *
 * @param javaRDD RDD whose underlying RDD must be an instance of {@code HasOffsetRanges},
 *  such as {@code KafkaRDD}
 */
@Override
public void call(JavaRDD<T> javaRDD) {
  HasOffsetRanges withRanges = (HasOffsetRanges) javaRDD.rdd();
  OffsetRange[] offsetRanges = withRanges.offsetRanges();

  Map<Pair<String,Integer>,Long> untilByTopicPartition = new HashMap<>(offsetRanges.length);
  for (int i = 0; i < offsetRanges.length; i++) {
    OffsetRange current = offsetRanges[i];
    Pair<String,Integer> topicAndPartition = new Pair<>(current.topic(), current.partition());
    untilByTopicPartition.put(topicAndPartition, current.untilOffset());
  }

  log.info("Updating offsets: {}", untilByTopicPartition);
  KafkaUtils.setOffsets(inputTopicLockMaster, group, untilByTopicPartition);
}
Example #4
Source File: KafkaOffsetManagerImpl.java From datacollector with Apache License 2.0 | 5 votes |
private Map<Integer, Long> getOffsetToSave(OffsetRange[] offsetRanges) { Map<Integer, Long> partitionToOffset = new LinkedHashMap<>(); for (int i = 0; i < offsetRanges.length; i++) { //Until offset is the offset till which the current SparkDriverFunction //Will read partitionToOffset.put(offsetRanges[i].partition(), offsetRanges[i].untilOffset()); LOG.info( "Offset Range From RDD - Partition:{}, From Offset:{}, Until Offset:{}", offsetRanges[i].partition(), offsetRanges[i].fromOffset(), offsetRanges[i].untilOffset() ); } return partitionToOffset; }
Example #5
Source File: TestKafkaInput.java From envelope with Apache License 2.0 | 5 votes |
/**
 * Checks that, when a group id is configured and no offsets output is configured,
 * {@code recordProgress} results in exactly one {@code commitAsync} call on the mocked
 * direct Kafka input DStream.
 */
@Test
public void testKafkaManagedOffsets() throws Exception {
  // Replace KafkaInput.getDStream() so the input under test hands back the mocked stream.
  new MockUp<KafkaInput>() {
    @Mock
    public JavaDStream<?> getDStream() {
      return jDStream;
    }
  };

  // Recorded expectations: configuration lookups and the RDD -> offset ranges -> DStream chain.
  new Expectations() {
    {
      config.getStringList(KafkaInput.TOPICS_CONFIG); returns(Lists.newArrayList("foo"));
      // A group id is present ...
      config.hasPath(KafkaInput.GROUP_ID_CONFIG); returns(true);
      // ... and no offsets output is configured.
      config.hasPath(KafkaInput.OFFSETS_OUTPUT_CONFIG); returns(false);
      javaRDD.rdd(); returns(kafkaRDD);
      // The mocked Kafka RDD reports an empty set of offset ranges.
      kafkaRDD.offsetRanges(); returns(new OffsetRange[]{});
      jDStream.dstream(); returns(dkiDStream);
    }
  };

  mockedKafkaInput.configure(config);
  mockedKafkaInput.recordProgress(javaRDD);

  // The offset ranges must have been committed on the underlying DStream exactly once.
  new Verifications() {
    {
      dkiDStream.commitAsync(new OffsetRange[]{});
      times = 1;
    }
  };
}
Example #6
Source File: AvroKafkaSource.java From hudi with Apache License 2.0 | 5 votes |
/**
 * Fetches the next batch of Avro records from Kafka.
 *
 * @param lastCheckpointStr checkpoint string of the previous batch, if any
 * @param sourceLimit       limit passed through to the offset generator
 * @return a batch holding the new records and the checkpoint string of the computed
 *         offset ranges; the batch is empty when the ranges contain no new messages
 */
@Override
protected InputBatch<JavaRDD<GenericRecord>> fetchNewData(Option<String> lastCheckpointStr, long sourceLimit) {
  final OffsetRange[] nextRanges = offsetGen.getNextOffsetRanges(lastCheckpointStr, sourceLimit);
  final long newMessageCount = CheckpointUtils.totalNewMessages(nextRanges);
  final String topic = offsetGen.getTopicName();
  LOG.info("About to read " + newMessageCount + " from Kafka for topic :" + topic);

  if (newMessageCount > 0) {
    final JavaRDD<GenericRecord> records = toRDD(nextRanges);
    return new InputBatch<>(Option.of(records), KafkaOffsetGen.CheckpointUtils.offsetsToStr(nextRanges));
  }
  // Nothing new to read: still report the checkpoint for the (empty) ranges.
  return new InputBatch<>(Option.empty(), CheckpointUtils.offsetsToStr(nextRanges));
}
Example #7
Source File: JsonKafkaSource.java From hudi with Apache License 2.0 | 5 votes |
/**
 * Fetches the next batch of JSON strings from Kafka.
 *
 * @param lastCheckpointStr checkpoint string of the previous batch, if any
 * @param sourceLimit       limit passed through to the offset generator
 * @return a batch holding the new records and the checkpoint string of the computed
 *         offset ranges; the batch is empty when the ranges contain no new messages
 */
@Override
protected InputBatch<JavaRDD<String>> fetchNewData(Option<String> lastCheckpointStr, long sourceLimit) {
  final OffsetRange[] nextRanges = offsetGen.getNextOffsetRanges(lastCheckpointStr, sourceLimit);
  final long newMessageCount = CheckpointUtils.totalNewMessages(nextRanges);
  final String topic = offsetGen.getTopicName();
  LOG.info("About to read " + newMessageCount + " from Kafka for topic :" + topic);

  if (newMessageCount > 0) {
    final JavaRDD<String> records = toRDD(nextRanges);
    return new InputBatch<>(Option.of(records), CheckpointUtils.offsetsToStr(nextRanges));
  }
  // Nothing new to read: still report the checkpoint for the (empty) ranges.
  return new InputBatch<>(Option.empty(), CheckpointUtils.offsetsToStr(nextRanges));
}
Example #8
Source File: KafkaOffsetGen.java From hudi with Apache License 2.0 | 5 votes |
/** * String representation of checkpoint * <p> * Format: topic1,0:offset0,1:offset1,2:offset2, ..... */ public static String offsetsToStr(OffsetRange[] ranges) { StringBuilder sb = new StringBuilder(); // at least 1 partition will be present. sb.append(ranges[0].topic() + ","); sb.append(Arrays.stream(ranges).map(r -> String.format("%s:%d", r.partition(), r.untilOffset())) .collect(Collectors.joining(","))); return sb.toString(); }
Example #9
Source File: KafkaOffsetGen.java From hudi with Apache License 2.0 | 4 votes |
/**
 * Sums the message counts of all given offset ranges.
 *
 * @param ranges offset ranges to total
 * @return sum of {@code OffsetRange.count()} over {@code ranges}
 */
public static long totalNewMessages(OffsetRange[] ranges) {
  long total = 0L;
  for (OffsetRange range : ranges) {
    total += range.count();
  }
  return total;
}
Example #10
Source File: KafkaOffsetGen.java From hudi with Apache License 2.0 | 4 votes |
public OffsetRange[] getNextOffsetRanges(Option<String> lastCheckpointStr, long sourceLimit) { // Obtain current metadata for the topic Map<TopicPartition, Long> fromOffsets; Map<TopicPartition, Long> toOffsets; try (KafkaConsumer consumer = new KafkaConsumer(kafkaParams)) { List<PartitionInfo> partitionInfoList; partitionInfoList = consumer.partitionsFor(topicName); Set<TopicPartition> topicPartitions = partitionInfoList.stream() .map(x -> new TopicPartition(x.topic(), x.partition())).collect(Collectors.toSet()); // Determine the offset ranges to read from if (lastCheckpointStr.isPresent() && !lastCheckpointStr.get().isEmpty()) { fromOffsets = checkupValidOffsets(consumer, lastCheckpointStr, topicPartitions); } else { KafkaResetOffsetStrategies autoResetValue = KafkaResetOffsetStrategies .valueOf(props.getString("auto.offset.reset", Config.DEFAULT_AUTO_RESET_OFFSET.toString()).toUpperCase()); switch (autoResetValue) { case EARLIEST: fromOffsets = consumer.beginningOffsets(topicPartitions); break; case LATEST: fromOffsets = consumer.endOffsets(topicPartitions); break; default: throw new HoodieNotSupportedException("Auto reset value must be one of 'earliest' or 'latest' "); } } // Obtain the latest offsets. toOffsets = consumer.endOffsets(topicPartitions); } // Come up with final set of OffsetRanges to read (account for new partitions, limit number of events) long maxEventsToReadFromKafka = props.getLong(Config.MAX_EVENTS_FROM_KAFKA_SOURCE_PROP, Config.maxEventsFromKafkaSource); maxEventsToReadFromKafka = (maxEventsToReadFromKafka == Long.MAX_VALUE || maxEventsToReadFromKafka == Integer.MAX_VALUE) ? Config.maxEventsFromKafkaSource : maxEventsToReadFromKafka; long numEvents = sourceLimit == Long.MAX_VALUE ? maxEventsToReadFromKafka : sourceLimit; if (numEvents < toOffsets.size()) { throw new HoodieException("sourceLimit should not be less than the number of kafka partitions"); } return CheckpointUtils.computeOffsetRanges(fromOffsets, toOffsets, numEvents); }
Example #11
Source File: KafkaOffsetGen.java From hudi with Apache License 2.0 | 4 votes |
/**
 * Compute the offset ranges to read from Kafka, while handling newly added partitions, skews, event limits.
 *
 * <p>Every partition in {@code toOffsetMap} gets one range. Its start is the entry in
 * {@code fromOffsetMap}, or 0 when the partition has no entry there. Events are then handed
 * out in passes: each pass splits the remaining budget evenly (rounded up) across the
 * partitions that have not yet reached their 'to' offset.
 *
 * @param fromOffsetMap offsets where we left off last time
 * @param toOffsetMap offsets of where each partitions is currently at
 * @param numEvents maximum number of events to read.
 * @return one range per partition of {@code toOffsetMap}, ordered by partition number
 */
public static OffsetRange[] computeOffsetRanges(Map<TopicPartition, Long> fromOffsetMap,
    Map<TopicPartition, Long> toOffsetMap, long numEvents) {

  Comparator<OffsetRange> byPartition = Comparator.comparing(OffsetRange::partition);

  // Create initial offset ranges for each 'to' partition, with from = to offsets.
  OffsetRange[] ranges = new OffsetRange[toOffsetMap.size()];
  toOffsetMap.keySet().stream().map(tp -> {
    long fromOffset = fromOffsetMap.getOrDefault(tp, 0L);
    return OffsetRange.create(tp, fromOffset, fromOffset);
  }).sorted(byPartition).collect(Collectors.toList()).toArray(ranges);

  long allocedEvents = 0;
  // Tracked by partition number only (not topic + partition).
  // NOTE(review): presumably all partitions belong to a single topic - confirm against callers.
  Set<Integer> exhaustedPartitions = new HashSet<>();
  // keep going until we have events to allocate and partitions still not exhausted.
  while (allocedEvents < numEvents && exhaustedPartitions.size() < toOffsetMap.size()) {
    long remainingEvents = numEvents - allocedEvents;
    // Even share of the remaining budget per non-exhausted partition, rounded up.
    long eventsPerPartition =
        (long) Math.ceil((1.0 * remainingEvents) / (toOffsetMap.size() - exhaustedPartitions.size()));

    // Allocate the remaining events to non-exhausted partitions, in round robin fashion
    for (int i = 0; i < ranges.length; i++) {
      OffsetRange range = ranges[i];
      if (!exhaustedPartitions.contains(range.partition())) {
        long toOffsetMax = toOffsetMap.get(range.topicPartition());
        // Extend by the per-partition share, capped at the partition's current end.
        long toOffset = Math.min(toOffsetMax, range.untilOffset() + eventsPerPartition);
        if (toOffset == toOffsetMax) {
          exhaustedPartitions.add(range.partition());
        }
        allocedEvents += toOffset - range.untilOffset();
        // We need recompute toOffset if allocedEvents larger than numEvents.
        if (allocedEvents > numEvents) {
          // numEvents - allocedEvents is negative here, so this pulls toOffset back by the
          // overshoot. allocedEvents itself is not reduced afterwards.
          long offsetsToAdd = Math.min(eventsPerPartition, (numEvents - allocedEvents));
          toOffset = Math.min(toOffsetMax, toOffset + offsetsToAdd);
        }
        ranges[i] = OffsetRange.create(range.topicPartition(), range.fromOffset(), toOffset);
      }
    }
  }

  return ranges;
}
Example #12
Source File: JsonKafkaSource.java From hudi with Apache License 2.0 | 4 votes |
/**
 * Reads the given offset ranges from Kafka and exposes each record's value as a string.
 *
 * @param offsetRanges ranges to read
 * @return RDD of record values cast to {@code String}
 */
private JavaRDD<String> toRDD(OffsetRange[] offsetRanges) {
  return KafkaUtils
      .createRDD(sparkContext, offsetGen.getKafkaParams(), offsetRanges, LocationStrategies.PreferConsistent())
      .map(record -> {
        Object payload = record.value();
        return (String) payload;
      });
}
Example #13
Source File: AvroKafkaSource.java From hudi with Apache License 2.0 | 4 votes |
/**
 * Reads the given offset ranges from Kafka and exposes each record's value as an Avro record.
 *
 * @param offsetRanges ranges to read
 * @return RDD of record values cast to {@code GenericRecord}
 */
private JavaRDD<GenericRecord> toRDD(OffsetRange[] offsetRanges) {
  return KafkaUtils
      .createRDD(sparkContext, offsetGen.getKafkaParams(), offsetRanges, LocationStrategies.PreferConsistent())
      .map(record -> {
        Object payload = record.value();
        return (GenericRecord) payload;
      });
}
Example #14
Source File: TestKafkaSource.java From hudi with Apache License 2.0 | 4 votes |
/**
 * Exercises {@code CheckpointUtils.totalNewMessages} and
 * {@code CheckpointUtils.computeOffsetRanges} across several from/to offset layouts and
 * event limits.
 */
@Test
public void testComputeOffsetRanges() {
  // test totalNewMessages()
  // Two ranges of 100 messages each must total 200.
  long totalMsgs = CheckpointUtils.totalNewMessages(new OffsetRange[] {OffsetRange.apply(TEST_TOPIC_NAME, 0, 0, 100),
      OffsetRange.apply(TEST_TOPIC_NAME, 0, 100, 200)});
  assertEquals(200, totalMsgs);

  // should consume all the full data
  // Limit (1,000,000) exceeds what is available (2 x 100,000).
  OffsetRange[] ranges =
      CheckpointUtils.computeOffsetRanges(makeOffsetMap(new int[] {0, 1}, new long[] {200000, 250000}),
          makeOffsetMap(new int[] {0, 1}, new long[] {300000, 350000}), 1000000L);
  assertEquals(200000, CheckpointUtils.totalNewMessages(ranges));

  // should only consume upto limit
  // 10,000 events split evenly: 5,000 per partition.
  ranges = CheckpointUtils.computeOffsetRanges(makeOffsetMap(new int[] {0, 1}, new long[] {200000, 250000}),
      makeOffsetMap(new int[] {0, 1}, new long[] {300000, 350000}), 10000);
  assertEquals(10000, CheckpointUtils.totalNewMessages(ranges));
  assertEquals(200000, ranges[0].fromOffset());
  assertEquals(205000, ranges[0].untilOffset());
  assertEquals(250000, ranges[1].fromOffset());
  assertEquals(255000, ranges[1].untilOffset());

  // should also consume from new partitions.
  // Partition 2 has no 'from' entry and contributes 100,000 messages.
  ranges = CheckpointUtils.computeOffsetRanges(makeOffsetMap(new int[] {0, 1}, new long[] {200000, 250000}),
      makeOffsetMap(new int[] {0, 1, 2}, new long[] {300000, 350000, 100000}), 1000000L);
  assertEquals(300000, CheckpointUtils.totalNewMessages(ranges));
  assertEquals(3, ranges.length);

  // for skewed offsets, does not starve any partition & can catch up
  // Budget of 100,000: the small partitions are drained and partition 1 receives the rest.
  ranges = CheckpointUtils.computeOffsetRanges(makeOffsetMap(new int[] {0, 1}, new long[] {200000, 250000}),
      makeOffsetMap(new int[] {0, 1, 2}, new long[] {200010, 350000, 10000}), 100000);
  assertEquals(100000, CheckpointUtils.totalNewMessages(ranges));
  assertEquals(10, ranges[0].count());
  assertEquals(89990, ranges[1].count());
  assertEquals(10000, ranges[2].count());

  // Same layout with a budget larger than the backlog: every partition is fully read.
  ranges = CheckpointUtils.computeOffsetRanges(makeOffsetMap(new int[] {0, 1}, new long[] {200000, 250000}),
      makeOffsetMap(new int[] {0, 1, 2}, new long[] {200010, 350000, 10000}), 1000000);
  assertEquals(110010, CheckpointUtils.totalNewMessages(ranges));
  assertEquals(10, ranges[0].count());
  assertEquals(100000, ranges[1].count());
  assertEquals(10000, ranges[2].count());

  // not all partitions consume same entries.
  // Budget of 1001 over five partitions; the last partition receives fewer events (223).
  ranges = CheckpointUtils.computeOffsetRanges(makeOffsetMap(new int[] {0, 1, 2, 3, 4}, new long[] {0, 0, 0, 0, 0}),
      makeOffsetMap(new int[] {0, 1, 2, 3, 4}, new long[] {100, 1000, 1000, 1000, 1000}), 1001);
  assertEquals(1001, CheckpointUtils.totalNewMessages(ranges));
  assertEquals(100, ranges[0].count());
  assertEquals(226, ranges[1].count());
  assertEquals(226, ranges[2].count());
  assertEquals(226, ranges[3].count());
  assertEquals(223, ranges[4].count());
}
Example #15
Source File: KafkaSource.java From sylph with Apache License 2.0 | 4 votes |
public JavaDStream<Row> createSource(JavaStreamingContext ssc, KafkaSourceConfig config, SourceContext context) { String topics = config.getTopics(); String brokers = config.getBrokers(); //需要把集群的host 配置到程序所在机器 String groupId = config.getGroupid(); //消费者的名字 String offsetMode = config.getOffsetMode(); Map<String, Object> kafkaParams = new HashMap<>(config.getOtherConfig()); kafkaParams.put("bootstrap.servers", brokers); kafkaParams.put("key.deserializer", ByteArrayDeserializer.class); //StringDeserializer kafkaParams.put("value.deserializer", ByteArrayDeserializer.class); //StringDeserializer kafkaParams.put("enable.auto.commit", false); //不自动提交偏移量 // "fetch.message.max.bytes" -> // "session.timeout.ms" -> "30000", //session默认是30秒 // "heartbeat.interval.ms" -> "5000", //10秒提交一次 心跳周期 kafkaParams.put("group.id", groupId); //注意不同的流 group.id必须要不同 否则会出现offect commit提交失败的错误 kafkaParams.put("auto.offset.reset", offsetMode); //latest earliest List<String> topicSets = Arrays.asList(topics.split(",")); JavaInputDStream<ConsumerRecord<byte[], byte[]>> inputStream = KafkaUtils.createDirectStream( ssc, LocationStrategies.PreferConsistent(), ConsumerStrategies.Subscribe(topicSets, kafkaParams)); DStream<ConsumerRecord<byte[], byte[]>> sylphKafkaOffset = new SylphKafkaOffset<ConsumerRecord<byte[], byte[]>>(inputStream.inputDStream()) { @Override public void commitOffsets(RDD<?> kafkaRdd) { OffsetRange[] offsetRanges = ((HasOffsetRanges) kafkaRdd).offsetRanges(); log().info("commitKafkaOffsets {}", (Object) offsetRanges); DStream<?> firstDStream = DStreamUtil.getFirstDStream(inputStream.dstream()); ((CanCommitOffsets) firstDStream).commitAsync(offsetRanges); } }; JavaDStream<ConsumerRecord<byte[], byte[]>> javaDStream = new JavaDStream<>(sylphKafkaOffset, ClassTag$.MODULE$.apply(ConsumerRecord.class)); if ("json".equalsIgnoreCase(config.getValueType())) { JsonSchema jsonParser = new JsonSchema(context.getSchema()); return javaDStream .map(record -> 
jsonParser.deserialize(record.key(), record.value(), record.topic(), record.partition(), record.offset())); } else { List<String> names = context.getSchema().getFieldNames(); return javaDStream .map(record -> { Object[] values = new Object[names.size()]; for (int i = 0; i < names.size(); i++) { switch (names.get(i)) { case "_topic": values[i] = record.topic(); continue; case "_message": values[i] = new String(record.value(), UTF_8); continue; case "_key": values[i] = record.key() == null ? null : new String(record.key(), UTF_8); continue; case "_partition": values[i] = record.partition(); continue; case "_offset": values[i] = record.offset(); case "_timestamp": values[i] = record.timestamp(); case "_timestampType": values[i] = record.timestampType().id; default: values[i] = null; } } return new GenericRow(values); //GenericRowWithSchema }); //.window(Duration(10 * 1000)) } }
Example #16
Source File: StreamingRsvpsDStream.java From -Data-Stream-Development-with-Apache-Spark-Kafka-and-Spring-Boot with MIT License | 4 votes |
public static void main(String[] args) throws InterruptedException { System.setProperty("hadoop.home.dir", HADOOP_HOME_DIR_VALUE); final SparkConf conf = new SparkConf() .setMaster(RUN_LOCAL_WITH_AVAILABLE_CORES) .setAppName(APPLICATION_NAME) .set("spark.mongodb.output.uri", MONGODB_OUTPUT_URI); final JavaStreamingContext streamingContext = new JavaStreamingContext(conf, new Duration(BATCH_DURATION_INTERVAL_MS)); final JavaInputDStream<ConsumerRecord<String, String>> meetupStream = KafkaUtils.createDirectStream( streamingContext, LocationStrategies.PreferConsistent(), ConsumerStrategies.<String, String>Subscribe(TOPICS, KAFKA_CONSUMER_PROPERTIES) ); // transformations, streaming algorithms, etc JavaDStream<ConsumerRecord<String, String>> rsvpsWithGuestsStream = meetupStream.filter(f -> !f.value().contains("\"guests\":0")); rsvpsWithGuestsStream.foreachRDD((JavaRDD<ConsumerRecord<String, String>> r) -> { MongoSpark.save( r.map( e -> Document.parse(e.value()) ) ); }); // some time later, after outputs have completed meetupStream.foreachRDD((JavaRDD<ConsumerRecord<String, String>> meetupRDD) -> { OffsetRange[] offsetRanges = ((HasOffsetRanges) meetupRDD.rdd()).offsetRanges(); ((CanCommitOffsets) meetupStream.inputDStream()) .commitAsync(offsetRanges, new MeetupOffsetCommitCallback()); }); streamingContext.start(); streamingContext.awaitTermination(); }
Example #17
Source File: SparkMLTrainingAndScoringOnline.java From -Data-Stream-Development-with-Apache-Spark-Kafka-and-Spring-Boot with MIT License | 4 votes |
public static void main(String[] args) throws InterruptedException { System.setProperty("hadoop.home.dir", HADOOP_HOME_DIR_VALUE); final SparkConf conf = new SparkConf() .setMaster(RUN_LOCAL_WITH_AVAILABLE_CORES) .setAppName(APPLICATION_NAME) .set("spark.sql.caseSensitive", CASE_SENSITIVE); JavaStreamingContext streamingContext = new JavaStreamingContext(conf, new Duration(BATCH_DURATION_INTERVAL_MS)); JavaInputDStream<ConsumerRecord<String, String>> meetupStream = KafkaUtils.createDirectStream( streamingContext, LocationStrategies.PreferConsistent(), ConsumerStrategies.<String, String>Subscribe(TOPICS, KAFKA_CONSUMER_PROPERTIES) ); JavaDStream<String> meetupStreamValues = meetupStream.map(v -> { return v.value(); }); // Prepare the training data as strings of type: (y,[x1,x2,x3,...,xn]) // Where n is the number of features, y is a binary label, // and n must be the same for train and test. // e.g. "(response, [group_lat, group_long])"; JavaDStream<String> trainData = meetupStreamValues.map(e -> { JSONParser jsonParser = new JSONParser(); JSONObject json = (JSONObject)jsonParser.parse(e); String result = "(" + (String.valueOf(json.get("response")).equals("yes") ? 
"1.0,[":"0.0,[") + ((JSONObject)json.get("group")).get("group_lat") + "," + ((JSONObject)json.get("group")).get("group_lon") + "])"; return result; }); trainData.print(); JavaDStream<LabeledPoint> labeledPoints = trainData.map(LabeledPoint::parse); StreamingLogisticRegressionWithSGD streamingLogisticRegressionWithSGD = new StreamingLogisticRegressionWithSGD() .setInitialWeights(Vectors.zeros(2)); streamingLogisticRegressionWithSGD.trainOn(labeledPoints); JavaPairDStream<Double, Vector> values = labeledPoints.mapToPair(f -> new Tuple2<>(f.label(), f.features())); streamingLogisticRegressionWithSGD.predictOnValues(values).print(); // some time later, after outputs have completed meetupStream.foreachRDD((JavaRDD<ConsumerRecord<String, String>> meetupRDD) -> { OffsetRange[] offsetRanges = ((HasOffsetRanges) meetupRDD.rdd()).offsetRanges(); ((CanCommitOffsets) meetupStream.inputDStream()) .commitAsync(offsetRanges, new MeetupOffsetCommitCallback()); }); streamingContext.start(); streamingContext.awaitTermination(); }