org.apache.spark.streaming.dstream.DStream Java Examples

The following examples show how to use org.apache.spark.streaming.dstream.DStream. Each example comes from an open-source project; the source file, project, and license are noted above the code.
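Most of the examples below move back and forth between the Java-facing JavaDStream and the underlying Scala DStream. As a quick orientation, here is a minimal, self-contained sketch of that round trip: JavaDStream.dstream() unwraps the Scala stream, and wrapping it back requires a ClassTag. This sketch assumes Spark's DStream API is on the classpath; the class name, socket source, host, and port are placeholders for illustration and are not taken from any of the projects below.

import org.apache.spark.api.java.JavaSparkContext$;
import org.apache.spark.streaming.Durations;
import org.apache.spark.streaming.api.java.JavaDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
import org.apache.spark.streaming.dstream.DStream;

public class DStreamRoundTrip
{
    public static void main(String[] args)
            throws Exception
    {
        // Local two-thread context with a 2-second batch interval.
        JavaStreamingContext jssc = new JavaStreamingContext("local[2]", "DStreamRoundTrip", Durations.seconds(2));

        // Placeholder source; any JavaDStream behaves the same way.
        JavaDStream<String> lines = jssc.socketTextStream("localhost", 9999);

        // Unwrap the Scala DStream that backs the Java wrapper.
        DStream<String> scalaStream = lines.dstream();

        // Wrap it back into a JavaDStream; the constructor needs a ClassTag.
        JavaDStream<String> wrapped =
                new JavaDStream<>(scalaStream, JavaSparkContext$.MODULE$.<String>fakeClassTag());

        wrapped.print();
        jssc.start();
        jssc.awaitTermination();
    }
}
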
Example #1
Source File: DStreamUtil.java    From sylph with Apache License 2.0
public static void dstreamAction(JavaDStream<Row> stream, Sink<JavaRDD<Row>> sink)
{
    DStream<?> fristDStream = getFristDStream(stream.dstream());
    logger.info("数据源驱动:{}", fristDStream.getClass().getName());

    if ("DirectKafkaInputDStream".equals(fristDStream.getClass().getSimpleName())) {
        logger.info("发现job 数据源是kafka,将开启空job优化 且 自动上报offect");
        stream.foreachRDD(rdd -> {
            RDD<?> kafkaRdd = getFristRdd(rdd.rdd()); //rdd.dependencies(0).rdd
            OffsetRange[] offsetRanges = ((HasOffsetRanges) kafkaRdd).offsetRanges();
            if (kafkaRdd.count() > 0) {
                sink.run(rdd); // run the sink (business logic)
            }
            ((CanCommitOffsets) fristDStream).commitAsync(offsetRanges);
        });
    }
    else { // non-Kafka data source: no optimization possible for now
        stream.foreachRDD(sink::run);
    }
}
 
Example #2
Source File: SparkStreamingSqlAnalyse.java    From sylph with Apache License 2.0
public void build()
{
    JavaDStream<Row> inputStream = source.apply(null);
    SparkSession spark = SparkSession.builder().config(inputStream.context().sparkContext().getConf()).getOrCreate();

    if (isCompile) {
        logger.info("isCompile mode will checkDStream()");
        checkDStream(spark, sourceTableName, schema, handlers);
    }

    DStream<?> firstDStream = DStreamUtil.getFirstDStream(inputStream.dstream(), SylphKafkaOffset.class);
    logger.info("source table {}, firstDStream is {}", sourceTableName, firstDStream);
    inputStream.foreachRDD(rdd -> {
        Dataset<Row> df = spark.createDataFrame(rdd, schema);
        df.createOrReplaceTempView(sourceTableName);
        //df.show()
        //if kafka0.10+ if("DirectKafkaInputDStream".equals(firstDStream.getClass().getSimpleName())) {}
        if (firstDStream instanceof SylphKafkaOffset) {
            RDD<?> kafkaRdd = DStreamUtil.getFirstRdd(rdd.rdd()); //rdd.dependencies(0).rdd
            if (kafkaRdd.count() > 0) {
                handlers.forEach(x -> x.accept(spark)); // run the business handlers
            }
            //val offsetRanges = kafkaRdd.asInstanceOf[HasOffsetRanges].offsetRanges
            //firstDStream.asInstanceOf[CanCommitOffsets].commitAsync(offsetRanges)
            ((SylphKafkaOffset<?>) firstDStream).commitOffsets(kafkaRdd);
        }
        else {
            handlers.forEach(x -> x.accept(spark));
        }
    });
}
 
Example #3
Source File: DStreamUtil.java    From sylph with Apache License 2.0
public static DStream<?> getFirstDStream(DStream<?> stream, Class<? extends DStream> first)
{
    if (first != null && first.isInstance(stream)) {
        return stream;
    }
    if (stream.dependencies().isEmpty()) {
        return stream;
    }
    else {
        return getFirstDStream(stream.dependencies().head(), first);
    }
}
 
Example #4
Source File: SylphKafkaOffset.java    From sylph with Apache License 2.0
@Override
public List<DStream<?>> dependencies()
{
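    // Build a Scala immutable List containing only the parent stream (List$ is the List companion object; $plus$eq is Scala's +=).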
    return List$.MODULE$.<DStream<?>>newBuilder()
            .$plus$eq(parent)
            .result();
}
 
Example #5
Source File: ProcessedOffsetManager.java    From kafka-spark-consumer with Apache License 2.0
@SuppressWarnings("deprecation")
public static void persists(DStream<Tuple2<String, Iterable<Long>>> partitonOffset, Properties props) {
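  // Wrap the Scala DStream in a JavaDStream (a ClassTag is required) so foreachRDD can take a Java VoidFunction.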
  ClassTag<Tuple2<String, Iterable<Long>>> tuple2ClassTag =
      ScalaUtil.<String, Iterable<Long>>getTuple2ClassTag();
  JavaDStream<Tuple2<String, Iterable<Long>>> jpartitonOffset =
      new JavaDStream<Tuple2<String, Iterable<Long>>>(partitonOffset, tuple2ClassTag);
  jpartitonOffset.foreachRDD(new VoidFunction<JavaRDD<Tuple2<String, Iterable<Long>>>>() {
    @Override
    public void call(JavaRDD<Tuple2<String, Iterable<Long>>> po) throws Exception {
      List<Tuple2<String, Iterable<Long>>> poList = po.collect();
      doPersists(poList, props);
    }
  });
}
 
Example #6
Source File: DStreamUtil.java    From sylph with Apache License 2.0
public static DStream<?> getFristDStream(DStream<?> stream)
{
    if (stream.dependencies().isEmpty()) {
        return stream;
    }
    else {
        return getFristDStream(stream.dependencies().head());
    }
}
 
Example #7
Source File: ProcessedOffsetManager.java    From kafka-spark-consumer with Apache License 2.0
public static <T> DStream<Tuple2<String, Iterable<Long>>>  getPartitionOffset(
    DStream<MessageAndMetadata<T>> unionStreams, Properties props) {
  ClassTag<MessageAndMetadata<T>> messageMetaClassTag =
      ScalaUtil.<T>getMessageAndMetadataClassTag();
  JavaDStream<MessageAndMetadata<T>> javaDStream =
      new JavaDStream<MessageAndMetadata<T>>(unionStreams, messageMetaClassTag);
  JavaPairDStream<String, Iterable<Long>> partitonOffset = getPartitionOffset(javaDStream, props);
  return partitonOffset.dstream();
}
 
Example #8
Source File: SparkGroupAlsoByWindowViaWindowSet.java    From beam with Apache License 2.0
private static void checkpointIfNeeded(
    final DStream<Tuple2<ByteArray, Tuple2<StateAndTimers, List<byte[]>>>> firedStream,
    final SerializablePipelineOptions options) {

  final Long checkpointDurationMillis = getBatchDuration(options);

  if (checkpointDurationMillis > 0) {
    firedStream.checkpoint(new Duration(checkpointDurationMillis));
  }
}
 
Example #9
Source File: SparkGroupAlsoByWindowViaWindowSet.java    From beam with Apache License 2.0
private static <K, InputT> PairDStreamFunctions<ByteArray, byte[]> buildPairDStream(
    final JavaDStream<WindowedValue<KV<K, InputT>>> inputDStream,
    final Coder<K> keyCoder,
    final Coder<WindowedValue<InputT>> wvCoder) {

  // we have to switch to Scala API to avoid Optional in the Java API, see: SPARK-4819.
  // we also have a broader API for Scala (access to the actual key and entire iterator).
  // we use coders to convert objects in the PCollection to byte arrays, so they
  // can be transferred over the network for the shuffle and be in serialized form
  // for checkpointing.
  // for readability, we add comments with actual type next to byte[].
  // to shorten line length, we use:
  // ---- WV: WindowedValue
  // ---- Iterable: Itr
  // ---- AccumT: A
  // ---- InputT: I
  final DStream<Tuple2<ByteArray, byte[]>> tupleDStream =
      inputDStream
          .map(new ReifyTimestampsAndWindowsFunction<>())
          .mapToPair(TranslationUtils.toPairFunction())
          .mapToPair(CoderHelpers.toByteFunction(keyCoder, wvCoder))
          .dstream();

  return DStream.toPairDStreamFunctions(
      tupleDStream,
      JavaSparkContext$.MODULE$.fakeClassTag(),
      JavaSparkContext$.MODULE$.fakeClassTag(),
      null);
}
 
Example #10
Source File: SparkGroupAlsoByWindowViaWindowSet.java    From beam with Apache License 2.0
public static <K, InputT, W extends BoundedWindow>
    JavaDStream<WindowedValue<KV<K, Iterable<InputT>>>> groupByKeyAndWindow(
        final JavaDStream<WindowedValue<KV<K, InputT>>> inputDStream,
        final Coder<K> keyCoder,
        final Coder<WindowedValue<InputT>> wvCoder,
        final WindowingStrategy<?, W> windowingStrategy,
        final SerializablePipelineOptions options,
        final List<Integer> sourceIds,
        final String transformFullName) {

  final PairDStreamFunctions<ByteArray, byte[]> pairDStream =
      buildPairDStream(inputDStream, keyCoder, wvCoder);

  // use updateStateByKey to scan through the state and update elements and timers.
  final UpdateStateByKeyFunction<K, InputT, W> updateFunc =
      new UpdateStateByKeyFunction<>(
          sourceIds,
          windowingStrategy,
          (FullWindowedValueCoder<InputT>) wvCoder,
          keyCoder,
          options,
          transformFullName);

  final DStream<
          Tuple2</*K*/ ByteArray, Tuple2<StateAndTimers, /*WV<KV<K, Itr<I>>>*/ List<byte[]>>>>
      firedStream =
          pairDStream.updateStateByKey(
              updateFunc,
              pairDStream.defaultPartitioner(pairDStream.defaultPartitioner$default$1()),
              true,
              JavaSparkContext$.MODULE$.fakeClassTag());

  checkpointIfNeeded(firedStream, options);

  // filter state-only output (nothing to fire) and remove the state from the output.
  return stripStateValues(firedStream, keyCoder, (FullWindowedValueCoder<InputT>) wvCoder);
}
 
Example #11
Source File: SparkUnboundedSource.java    From beam with Apache License 2.0
ReadReportDStream(
    DStream<Metadata> parent, int inputDStreamId, String sourceName, String stepName) {
  super(parent.ssc(), JavaSparkContext$.MODULE$.fakeClassTag());
  this.parent = parent;
  this.inputDStreamId = inputDStreamId;
  this.sourceName = sourceName;
  this.stepName = stepName;
}
 
Example #12
Source File: ReceiverLauncher.java    From kafka-spark-consumer with Apache License 2.0
public static DStream<MessageAndMetadata<byte[]>> launch(
    StreamingContext ssc, Properties pros, int numberOfReceivers, StorageLevel storageLevel) {
  JavaStreamingContext jsc = new JavaStreamingContext(ssc);
  return createStream(jsc, pros, numberOfReceivers, storageLevel, new IdentityMessageHandler()).dstream();
}
 
Example #13
Source File: ReceiverLauncher.java    From kafka-spark-consumer with Apache License 2.0
public static <E> DStream<MessageAndMetadata<E>> launch(
        StreamingContext ssc, Properties pros, int numberOfReceivers,
        StorageLevel storageLevel, KafkaMessageHandler<E> messageHandler) {
    JavaStreamingContext jsc = new JavaStreamingContext(ssc);
    return createStream(jsc, pros, numberOfReceivers, storageLevel, messageHandler).dstream();
}
 
Example #14
Source File: SparkUnboundedSource.java    From beam with Apache License 2.0
@Override
public scala.collection.immutable.List<DStream<?>> dependencies() {
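  // Convert a single-element Java list into the Scala immutable List that DStream.dependencies() must return.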
  return scala.collection.JavaConversions.asScalaBuffer(
          Collections.<DStream<?>>singletonList(parent))
      .toList();
}
 
Example #15
Source File: FraudDetectionApp.java    From Building-Data-Streaming-Applications-with-Apache-Kafka with MIT License
public static void main(String[] args) throws Exception {

        String brokers = "localhost:9092";
        String topics = "iplog";
        CacheIPLookup cacheIPLookup = new CacheIPLookup();
        SparkConf sparkConf = new SparkConf().setAppName("IP_FRAUD");
        JavaStreamingContext javaStreamingContext = new JavaStreamingContext(sparkConf, Durations.seconds(2));

        Set<String> topicsSet = new HashSet<>(Arrays.asList(topics.split(",")));
        Map<String, String> kafkaConfiguration = new HashMap<>();
        kafkaConfiguration.put("metadata.broker.list", brokers);
        kafkaConfiguration.put("group.id", "ipfraud");
        kafkaConfiguration.put("auto.offset.reset", "smallest");

        JavaPairInputDStream<String, String> messages = KafkaUtils.createDirectStream(
                javaStreamingContext,
                String.class,
                String.class,
                StringDecoder.class,
                StringDecoder.class,
                kafkaConfiguration,
                topicsSet
        );

        JavaDStream<String> lines = messages.map(Tuple2::_2);

        JavaDStream<String> fraudIPs = lines.filter(new Function<String, Boolean>() {
            @Override
            public Boolean call(String s) throws Exception {
                String IP = s.split(" ")[0];
                String[] ranges = IP.split("\\.");
                String range = null;
                try {
                    range = ranges[0] + "." + ranges[1];
                } catch (ArrayIndexOutOfBoundsException ex) {
                    // malformed IP address: leave range as null
                }
                return cacheIPLookup.isFraudIP(range);

            }
        });

        DStream<String> fraudDstream = fraudIPs.dstream();
        fraudDstream.saveAsTextFiles("FraudRecord", "");

        javaStreamingContext.start();
        javaStreamingContext.awaitTermination();
    }
 
Example #16
Source File: KafkaSource.java    From sylph with Apache License 2.0
public JavaDStream<Row> createSource(JavaStreamingContext ssc, KafkaSourceConfig config, SourceContext context)
{
    String topics = config.getTopics();
    String brokers = config.getBrokers(); // the cluster's broker hostnames must be resolvable from the machine running this program
    String groupId = config.getGroupid(); // consumer group id
    String offsetMode = config.getOffsetMode();

    Map<String, Object> kafkaParams = new HashMap<>(config.getOtherConfig());
    kafkaParams.put("bootstrap.servers", brokers);
    kafkaParams.put("key.deserializer", ByteArrayDeserializer.class); //StringDeserializer
    kafkaParams.put("value.deserializer", ByteArrayDeserializer.class); //StringDeserializer
    kafkaParams.put("enable.auto.commit", false); //不自动提交偏移量
    //      "fetch.message.max.bytes" ->
    //      "session.timeout.ms" -> "30000", //session默认是30秒
    //      "heartbeat.interval.ms" -> "5000", //10秒提交一次 心跳周期
    kafkaParams.put("group.id", groupId); //注意不同的流 group.id必须要不同 否则会出现offect commit提交失败的错误
    kafkaParams.put("auto.offset.reset", offsetMode); //latest   earliest

    List<String> topicSets = Arrays.asList(topics.split(","));
    JavaInputDStream<ConsumerRecord<byte[], byte[]>> inputStream = KafkaUtils.createDirectStream(
            ssc, LocationStrategies.PreferConsistent(), ConsumerStrategies.Subscribe(topicSets, kafkaParams));

    DStream<ConsumerRecord<byte[], byte[]>> sylphKafkaOffset = new SylphKafkaOffset<ConsumerRecord<byte[], byte[]>>(inputStream.inputDStream())
    {
        @Override
        public void commitOffsets(RDD<?> kafkaRdd)
        {
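            // Look up the underlying Kafka input DStream (which implements CanCommitOffsets) and commit the processed ranges asynchronously.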
            OffsetRange[] offsetRanges = ((HasOffsetRanges) kafkaRdd).offsetRanges();
            log().info("commitKafkaOffsets {}", (Object) offsetRanges);
            DStream<?> firstDStream = DStreamUtil.getFirstDStream(inputStream.dstream());
            ((CanCommitOffsets) firstDStream).commitAsync(offsetRanges);
        }
    };

    JavaDStream<ConsumerRecord<byte[], byte[]>> javaDStream = new JavaDStream<>(sylphKafkaOffset, ClassTag$.MODULE$.apply(ConsumerRecord.class));
    if ("json".equalsIgnoreCase(config.getValueType())) {
        JsonSchema jsonParser = new JsonSchema(context.getSchema());
        return javaDStream
                .map(record -> jsonParser.deserialize(record.key(), record.value(), record.topic(), record.partition(), record.offset()));
    }
    else {
        List<String> names = context.getSchema().getFieldNames();
        return javaDStream
                .map(record -> {
                    Object[] values = new Object[names.size()];
                    for (int i = 0; i < names.size(); i++) {
                        switch (names.get(i)) {
                            case "_topic":
                                values[i] = record.topic();
                                continue;
                            case "_message":
                                values[i] = new String(record.value(), UTF_8);
                                continue;
                            case "_key":
                                values[i] = record.key() == null ? null : new String(record.key(), UTF_8);
                                continue;
                            case "_partition":
                                values[i] = record.partition();
                                continue;
                            case "_offset":
                                values[i] = record.offset();
                            case "_timestamp":
                                values[i] = record.timestamp();
                            case "_timestampType":
                                values[i] = record.timestampType().id;
                            default:
                                values[i] = null;
                        }
                    }
                    return new GenericRow(values);  //GenericRowWithSchema
                });  //.window(Duration(10 * 1000))
    }
}
 
Example #17
Source File: KafkaSource08.java    From sylph with Apache License 2.0
private static JavaDStream<ConsumerRecord<byte[], byte[]>> settingCommit(
            JavaInputDStream<ConsumerRecord<byte[], byte[]>> inputStream,
            Map<String, String> kafkaParams,
            KafkaCluster kafkaCluster,
            String groupId)
    {
        if (kafkaParams.getOrDefault("auto.commit.enable", "true").equals("false")) {
            return inputStream;
        }

        int commitInterval = Integer.parseInt(kafkaParams.getOrDefault(ConsumerConfig.AUTO_COMMIT_INTERVAL_MS_CONFIG, "90000"));

        DStream<ConsumerRecord<byte[], byte[]>> sylphKafkaOffset = new SylphKafkaOffset<ConsumerRecord<byte[], byte[]>>(inputStream.inputDStream())
        {
            private final KafkaOffsetCommitter kafkaOffsetCommitter = new KafkaOffsetCommitter(
                    kafkaCluster,
                    groupId,
                    commitInterval);

            @Override
            public void initialize(Time time)
            {
                super.initialize(time);
                kafkaOffsetCommitter.setName("Kafka_Offset_Committer");
                kafkaOffsetCommitter.start();
            }

            @Override
            public void commitOffsets(RDD<?> kafkaRdd)
            {
                OffsetRange[] offsets = ((HasOffsetRanges) kafkaRdd).offsetRanges();
//                Map<TopicAndPartition, Long> internalOffsets = Arrays.stream(offsets)
//                        .collect(Collectors.toMap(k -> k.topicAndPartition(), v -> v.fromOffset()));
                //log().info("commit Kafka Offsets {}", internalOffsets);
                kafkaOffsetCommitter.addAll(offsets);
            }
        };
        JavaDStream<ConsumerRecord<byte[], byte[]>> dStream = new JavaDStream<>(
                sylphKafkaOffset, ClassTag$.MODULE$.<ConsumerRecord<byte[], byte[]>>apply(ConsumerRecord.class));
        return dStream;
//        inputStream = inputStream.transform(rdd -> {
//            OffsetRange[] offsets = ((HasOffsetRanges) rdd.rdd()).offsetRanges();
//            Map<TopicAndPartition, Long> internalOffsets = Arrays.stream(offsets)
//                    .collect(Collectors.toMap(k -> k.topicAndPartition(), v -> v.fromOffset()));
//            commitKafkaOffsets(kafkaCluster, groupId, internalOffsets);
//            return rdd;
//        });
    }
 
Example #18
Source File: DStreamUtil.java    From sylph with Apache License 2.0
public static DStream<?> getFirstDStream(DStream<?> stream)
{
    return getFirstDStream(stream, null);
}