org.apache.spark.streaming.api.java.JavaInputDStream Java Examples

The following examples show how to use org.apache.spark.streaming.api.java.JavaInputDStream. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.

Example #1

Source File: StreamingRsvpsDStreamCountWindow.java From -Data-Stream-Development-with-Apache-Spark-Kafka-and-Spring-Boot with MIT License

6 votes

/**
 * Streams Meetup RSVP records from Kafka, counts them over a sliding window,
 * stores every window count in MongoDB and commits the consumed offset ranges
 * back to Kafka.
 *
 * @param args command-line arguments; not read by this method
 * @throws InterruptedException if the thread is interrupted while awaiting termination
 */
public static void main(String[] args) throws InterruptedException {
    System.setProperty("hadoop.home.dir", HADOOP_HOME_DIR_VALUE);

    // Same settings as a fluent chain; each setter is applied to the one SparkConf instance.
    final SparkConf sparkSettings = new SparkConf();
    sparkSettings.setMaster(RUN_LOCAL_WITH_AVAILABLE_CORES);
    sparkSettings.setAppName(APPLICATION_NAME);
    sparkSettings.set("spark.mongodb.output.uri", MONGODB_OUTPUT_URI);
    sparkSettings.set("spark.streaming.kafka.consumer.cache.enabled", "false");

    final Duration batchInterval = new Duration(BATCH_DURATION_INTERVAL_MS);
    final JavaStreamingContext streamingContext = new JavaStreamingContext(sparkSettings, batchInterval);
    streamingContext.checkpoint(CHECKPOINT_FOLDER);

    final JavaInputDStream<ConsumerRecord<String, String>> meetupStream =
            KafkaUtils.createDirectStream(
                    streamingContext,
                    LocationStrategies.PreferConsistent(),
                    ConsumerStrategies.<String, String>Subscribe(TOPICS, KAFKA_CONSUMER_PROPERTIES));

    // Number of records seen in each window of WINDOW_LENGTH_MS, recomputed every SLIDING_INTERVAL_MS.
    final Duration windowLength = new Duration(WINDOW_LENGTH_MS);
    final Duration slideInterval = new Duration(SLIDING_INTERVAL_MS);
    final JavaDStream<Long> rsvpCounts = meetupStream.countByWindow(windowLength, slideInterval);

    // Each count becomes a one-field JSON document saved through MongoSpark.
    rsvpCounts.foreachRDD((JavaRDD<Long> windowCounts) -> {
        JavaRDD<Document> countDocuments =
                windowCounts.map(count -> Document.parse("{\"rsvps_count\":\"" + count + "\"}"));
        MongoSpark.save(countDocuments);
    });

    // Registered after the save step: commit the offset ranges of every batch asynchronously.
    meetupStream.foreachRDD((JavaRDD<ConsumerRecord<String, String>> batch) -> {
        HasOffsetRanges rangedBatch = (HasOffsetRanges) batch.rdd();
        OffsetRange[] consumedRanges = rangedBatch.offsetRanges();

        CanCommitOffsets committer = (CanCommitOffsets) meetupStream.inputDStream();
        committer.commitAsync(consumedRanges, new MeetupOffsetCommitCallback());
    });

    streamingContext.start();
    streamingContext.awaitTermination();
}

Example #2

Source File: StreamingEngine.java From spark-streaming-direct-kafka with Apache License 2.0

6 votes

/**
 * Creates the streaming context, wires the Kafka input stream to the
 * processing and offset-update functions, and starts streaming.
 */
public void start() {
    SparkConf conf = getSparkConf();
    long batchIntervalSec = Long.parseLong(config.getStreamingBatchIntervalInSec());
    streamingContext = new JavaStreamingContext(conf, Durations.seconds(batchIntervalSec));

    JavaInputDStream<MessageAndMetadata<String, byte[]>> kafkaStream =
            buildInputDStream(streamingContext);
    JavaPairDStream<String, byte[]> keyedMessages =
            kafkaStream.mapToPair(record -> new Tuple2<>(record.key(), record.message()));

    // Process the key/message pairs of every batch.
    keyedMessages.foreachRDD(new ProcessStreamingData<>(config));
    // The offset update is registered on the raw Kafka stream, after the processing step.
    kafkaStream.foreachRDD(
            new UpdateOffsetsFn<>(config.getKafkaGroupId(), config.getZkOffsetManager()));

    streamingContext.start();
}

Example #3

Source File: AbstractJavaEsSparkStreamingTest.java From elasticsearch-hadoop with Apache License 2.0

5 votes

/**
 * Writes two documents through a queue-backed stream to an index resource with
 * index auto-creation set to "no", then asserts that the target does not exist
 * and that the expected {@code EsHadoopIllegalArgumentException} was observed.
 */
@Test
public void testEsRDDWriteIndexCreationDisabled() throws Exception {
    ExpectingToThrow expecting = expectingToThrow(EsHadoopIllegalArgumentException.class).from(ssc);

    // First document: a null value, a set value and a plain string.
    Set<String> twoValues = new HashSet<>();
    twoValues.add("2");
    Map<String, Object> firstDoc = new HashMap<>();
    firstDoc.put("one", null);
    firstDoc.put("two", twoValues);
    firstDoc.put("three", ".");

    // Second document: two plain string fields.
    Map<String, Object> secondDoc = new HashMap<>();
    secondDoc.put("OTP", "Otopeni");
    secondDoc.put("SFO", "San Fran");

    List<Map<String, Object>> documents = new ArrayList<>();
    documents.add(firstDoc);
    documents.add(secondDoc);

    String target = wrapIndex(resource("spark-test-nonexisting-scala-basic-write", "data", version));

    // Copy the shared configuration and switch index auto-creation off for this test only.
    Map<String, String> settings = new HashMap<>(cfg);
    settings.put(ES_INDEX_AUTO_CREATE, "no");

    Queue<JavaRDD<Map<String, Object>>> pendingBatches = new LinkedList<>();
    pendingBatches.add(sc.parallelize(documents));
    JavaInputDStream<Map<String, Object>> dstream = ssc.queueStream(pendingBatches, true);

    // Register the save, then run the stream for a short while.
    JavaEsSparkStreaming.saveToEs(dstream, target, settings);
    ssc.start();
    TimeUnit.SECONDS.sleep(2); // give the batch time to be processed
    ssc.stop(false, true);

    assertTrue(!RestUtils.exists(target));
    expecting.assertExceptionFound();
}

Example #4

Source File: AbstractJavaEsSparkStreamingTest.java From elasticsearch-hadoop with Apache License 2.0

5 votes

/**
 * Streams a single batch of two documents to a target whose index
 * auto-creation is configured as "no", and checks both that the target was not
 * created and that an {@code EsHadoopIllegalArgumentException} was recorded.
 */
@Test
public void testEsRDDWriteIndexCreationDisabled() throws Exception {
    ExpectingToThrow expecting = expectingToThrow(EsHadoopIllegalArgumentException.class).from(ssc);

    Map<String, Object> mixedDoc = new HashMap<>();
    mixedDoc.put("one", null);
    Set<String> singleton = new HashSet<>();
    singleton.add("2");
    mixedDoc.put("two", singleton);
    mixedDoc.put("three", ".");

    Map<String, Object> airportDoc = new HashMap<>();
    airportDoc.put("OTP", "Otopeni");
    airportDoc.put("SFO", "San Fran");

    List<Map<String, Object>> payload = new ArrayList<>();
    payload.add(mixedDoc);
    payload.add(airportDoc);

    String target = wrapIndex(resource("spark-test-nonexisting-scala-basic-write", "data", version));

    // Per-test copy of the configuration with auto-creation turned off.
    Map<String, String> writeConf = new HashMap<>(cfg);
    writeConf.put(ES_INDEX_AUTO_CREATE, "no");

    JavaRDD<Map<String, Object>> onlyBatch = sc.parallelize(payload);
    Queue<JavaRDD<Map<String, Object>>> batches = new LinkedList<>();
    batches.add(onlyBatch);
    JavaInputDStream<Map<String, Object>> dstream = ssc.queueStream(batches, true);

    JavaEsSparkStreaming.saveToEs(dstream, target, writeConf);

    // Run the stream for two seconds, then stop it.
    ssc.start();
    TimeUnit.SECONDS.sleep(2);
    ssc.stop(false, true);

    assertTrue(!RestUtils.exists(target));
    expecting.assertExceptionFound();
}

Example #5

Source File: AbstractSparkLayer.java From oryx with Apache License 2.0

5 votes

/**
 * Builds the direct Kafka input stream for the configured input topic.
 *
 * <p>Fails with {@code IllegalArgumentException} (via {@code Preconditions})
 * when the input topic does not exist, or when an update topic and its lock
 * master are both configured and that topic does not exist.
 *
 * @param streamingContext context the stream is attached to
 * @return stream of consumer records read from the input topic
 */
protected final JavaInputDStream<ConsumerRecord<K,M>> buildInputDStream(
    JavaStreamingContext streamingContext) {

  boolean inputTopicPresent = KafkaUtils.topicExists(inputTopicLockMaster, inputTopic);
  Preconditions.checkArgument(
      inputTopicPresent, "Topic %s does not exist; did you create it?", inputTopic);

  // The update topic is only verified when both it and its lock master are set.
  boolean updateTopicConfigured = updateTopic != null && updateTopicLockMaster != null;
  if (updateTopicConfigured) {
    boolean updateTopicPresent = KafkaUtils.topicExists(updateTopicLockMaster, updateTopic);
    Preconditions.checkArgument(
        updateTopicPresent, "Topic %s does not exist; did you create it?", updateTopic);
  }

  String consumerGroup = getGroupID();

  Map<String,Object> consumerConfig = new HashMap<>();
  consumerConfig.put("group.id", consumerGroup);
  // By default, do not re-read old input messages.
  // (Original note: ignored by the Kafka 0.10 Spark integration.)
  consumerConfig.put("auto.offset.reset", "latest");
  consumerConfig.put("bootstrap.servers", inputBroker);
  consumerConfig.put("key.deserializer", keyDecoderClass.getName());
  consumerConfig.put("value.deserializer", messageDecoderClass.getName());

  LocationStrategy placement = LocationStrategies.PreferConsistent();
  ConsumerStrategy<K,M> subscription = ConsumerStrategies.Subscribe(
      Collections.singleton(inputTopic), consumerConfig, Collections.emptyMap());

  // Fully qualified: the unqualified KafkaUtils above is a different class.
  return org.apache.spark.streaming.kafka010.KafkaUtils.createDirectStream(
      streamingContext, placement, subscription);
}

Example #6

Source File: MapRStreaming22Binding.java From datacollector with Apache License 2.0

5 votes

/**
 * Creates the direct input stream for {@code topic} on the given context and
 * hands it, with the MapR Streams offset manager, to {@code Driver.foreach}.
 *
 * <p>{@code props} is modified in place: the auto-offset-reset value (when
 * non-empty) and all extra Kafka configs are added to it.
 *
 * @param result streaming context the stream is created on
 * @param props  consumer properties; mutated by this method
 * @return the same {@code result} context that was passed in
 */
@Override
public JavaStreamingContext createDStream(JavaStreamingContext result, Map<String, Object> props) {
  List<String> topics = ImmutableList.of(topic);

  boolean hasAutoOffset = !autoOffsetValue.isEmpty();
  if (hasAutoOffset) {
    props.put(SparkStreamingBinding.AUTO_OFFSET_RESET, autoOffsetValue);
  }
  props.putAll(extraKafkaConfigs);

  JavaInputDStream<ConsumerRecord<byte[], byte[]>> stream;
  if (!offsetHelper.isSDCCheckPointing()) {
    // No SDC checkpointing: plain topic subscription.
    stream = KafkaUtils.createDirectStream(
        result,
        LocationStrategies.PreferConsistent(),
        ConsumerStrategies.<byte[], byte[]>Subscribe(topics, props));
  } else {
    // SDC checkpointing: assign partitions explicitly, starting from the stored offsets.
    Map<TopicPartition, Long> storedOffsets =
        MaprStreamsOffsetManagerImpl.get().getOffsetForDStream(topic, numberOfPartitions);
    stream = KafkaUtils.createDirectStream(
        result,
        LocationStrategies.PreferConsistent(),
        ConsumerStrategies.<byte[], byte[]>Assign(
            new ArrayList<TopicPartition>(storedOffsets.keySet()), props, storedOffsets));
  }

  Driver$.MODULE$.foreach(stream.dstream(), MaprStreamsOffsetManagerImpl.get());
  return result;
}

Example #7

Source File: Kafka010SparkStreamingBinding.java From datacollector with Apache License 2.0

5 votes

/**
 * Creates the Kafka direct input stream for {@code topic} on the given context
 * and hands it, with the Kafka offset manager, to {@code Driver.foreach}.
 *
 * <p>{@code props} is modified in place: the broker list, the (converted)
 * auto-offset-reset value when non-empty, and all extra Kafka configs are
 * added. The {@code autoOffsetValue} field is also overwritten with the
 * converted value when it is non-empty.
 *
 * @param result streaming context the stream is created on
 * @param props  consumer properties; mutated by this method
 * @return the same {@code result} context that was passed in
 */
@Override
public JavaStreamingContext createDStream(JavaStreamingContext result, Map<String, Object> props) {
  props.put("bootstrap.servers", metaDataBrokerList);

  boolean hasAutoOffset = !autoOffsetValue.isEmpty();
  if (hasAutoOffset) {
    autoOffsetValue = getConfigurableAutoOffsetResetIfNonEmpty(autoOffsetValue);
    props.put(AUTO_OFFSET_RESET, autoOffsetValue);
  }
  props.putAll(extraKafkaConfigs);

  List<String> topics = ImmutableList.of(topic);

  JavaInputDStream<ConsumerRecord<byte[], byte[]>> stream;
  if (!offsetHelper.isSDCCheckPointing()) {
    // No SDC checkpointing: plain topic subscription.
    stream = KafkaUtils.createDirectStream(
        result,
        LocationStrategies.PreferConsistent(),
        ConsumerStrategies.<byte[], byte[]>Subscribe(topics, props));
  } else {
    // SDC checkpointing: assign partitions explicitly, starting from the stored offsets.
    Map<TopicPartition, Long> storedOffsets =
        KafkaOffsetManagerImpl.get().getOffsetForDStream(topic, numberOfPartitions);
    stream = KafkaUtils.createDirectStream(
        result,
        LocationStrategies.PreferConsistent(),
        ConsumerStrategies.<byte[], byte[]>Assign(
            new ArrayList<TopicPartition>(storedOffsets.keySet()), props, storedOffsets));
  }

  Driver$.MODULE$.foreach(stream.dstream(), KafkaOffsetManagerImpl.get());
  return result;
}

Example #8

Source File: AbstractSparkLayer.java From spark-streaming-direct-kafka with Apache License 2.0

5 votes

/**
 * Builds a direct Kafka stream of {@code MessageAndMetadata<String, byte[]>}
 * that starts from the offsets returned by {@code KafkaManager.getOffsets}.
 *
 * @param streamingContext context the stream is attached to
 * @return the created input stream
 * @throws RuntimeException if the configured topic does not exist
 */
public JavaInputDStream<MessageAndMetadata<String,byte[]>> buildInputDStream(
        JavaStreamingContext streamingContext) {

    Map<String, String> consumerParams = config.getKafkaParams();

    // Double cast through Class<?> to obtain a Class token carrying the generic parameters.
    @SuppressWarnings("unchecked")
    Class<MessageAndMetadata<String, byte[]>> recordClass =
            (Class<MessageAndMetadata<String, byte[]>>) (Class<?>) MessageAndMetadata.class;

    boolean topicPresent = KafkaManager.topicExists(config.getZkKafka(), config.getTopic());
    if (!topicPresent) {
        throw new RuntimeException("Topic does not exist on server");
    }

    Map<TopicAndPartition, Long> startingOffsets = KafkaManager.getOffsets(
            config.getZkKafka(),
            config.getZkOffsetManager(),
            config.getKafkaGroupId(),
            config.getTopic(),
            config.getKafkaParams());

    // TODO: try generics, instead of hardcoded values
    // Key/value types and decoders are fixed to String/byte[]; change as necessary.
    return org.apache.spark.streaming.kafka.KafkaUtils.createDirectStream(
            streamingContext,
            String.class,
            byte[].class,
            StringDecoder.class,
            DefaultDecoder.class,
            recordClass,
            consumerParams,
            startingOffsets,
            Functions.<MessageAndMetadata<String, byte[]>>identity());
}

Example #9

Source File: MapRStreamingBinding.java From datacollector with Apache License 2.0

5 votes

/**
 * Creates the direct input stream for {@code topic} and passes it, together
 * with the MapR Streams offset manager, to {@code Driver.foreach}.
 *
 * <p>The supplied {@code props} map is updated in place with the
 * auto-offset-reset value (only when non-empty) and every extra Kafka config.
 *
 * @param result streaming context the stream is created on
 * @param props  consumer properties; mutated by this method
 * @return the {@code result} context, unchanged
 */
@Override
public JavaStreamingContext createDStream(JavaStreamingContext result, Map<String, Object> props) {
  List<String> subscribedTopics = ImmutableList.of(topic);

  if (!autoOffsetValue.isEmpty()) {
    props.put(SparkStreamingBinding.AUTO_OFFSET_RESET, autoOffsetValue);
  }
  props.putAll(extraKafkaConfigs);

  final boolean checkpointing = offsetHelper.isSDCCheckPointing();
  JavaInputDStream<ConsumerRecord<byte[], byte[]>> stream;
  if (checkpointing) {
    // Resume from the offsets kept by the offset manager, one entry per partition.
    Map<TopicPartition, Long> resumeOffsets =
        MaprStreamsOffsetManagerImpl.get().getOffsetForDStream(topic, numberOfPartitions);
    stream = KafkaUtils.createDirectStream(
        result,
        LocationStrategies.PreferConsistent(),
        ConsumerStrategies.<byte[], byte[]>Assign(
            new ArrayList<TopicPartition>(resumeOffsets.keySet()), props, resumeOffsets));
  } else {
    // Otherwise subscribe to the topic by name.
    stream = KafkaUtils.createDirectStream(
        result,
        LocationStrategies.PreferConsistent(),
        ConsumerStrategies.<byte[], byte[]>Subscribe(subscribedTopics, props));
  }

  Driver$.MODULE$.foreach(stream.dstream(), MaprStreamsOffsetManagerImpl.get());
  return result;
}

Example #10

Source File: WordCountingAppWithCheckpoint.java From tutorials with MIT License

4 votes

/**
 * Reads text messages from the Kafka topic {@code messages}, keeps a
 * cumulative per-word count with {@code mapWithState}, and writes the updated
 * counts of every batch to the Cassandra table {@code vocabulary.words}.
 *
 * <p>Each batch is written with a single distributed save. The updated
 * (word, count) pairs are mapped to {@code Word} on the executors instead of
 * being collected to the driver and saved one single-element RDD at a time.
 *
 * @param args command-line arguments; not read by this method
 * @throws InterruptedException if the thread is interrupted while awaiting termination
 */
public static void main(String[] args) throws InterruptedException {

    Logger.getLogger("org")
        .setLevel(Level.OFF);
    Logger.getLogger("akka")
        .setLevel(Level.OFF);

    Map<String, Object> kafkaParams = new HashMap<>();
    kafkaParams.put("bootstrap.servers", "localhost:9092");
    kafkaParams.put("key.deserializer", StringDeserializer.class);
    kafkaParams.put("value.deserializer", StringDeserializer.class);
    kafkaParams.put("group.id", "use_a_separate_group_id_for_each_stream");
    kafkaParams.put("auto.offset.reset", "latest");
    kafkaParams.put("enable.auto.commit", false);

    Collection<String> topics = Arrays.asList("messages");

    SparkConf sparkConf = new SparkConf();
    sparkConf.setMaster("local[2]");
    sparkConf.setAppName("WordCountingAppWithCheckpoint");
    sparkConf.set("spark.cassandra.connection.host", "127.0.0.1");

    JavaStreamingContext streamingContext = new JavaStreamingContext(sparkConf, Durations.seconds(1));

    // Still assigned: the field is declared outside this method and may be read elsewhere.
    sparkContext = streamingContext.sparkContext();

    streamingContext.checkpoint("./.checkpoint");

    JavaInputDStream<ConsumerRecord<String, String>> messages = KafkaUtils.createDirectStream(
        streamingContext,
        LocationStrategies.PreferConsistent(),
        ConsumerStrategies.<String, String> Subscribe(topics, kafkaParams));

    JavaPairDStream<String, String> results =
        messages.mapToPair(record -> new Tuple2<>(record.key(), record.value()));

    JavaDStream<String> lines = results.map(tuple2 -> tuple2._2());

    // Split every message on runs of whitespace.
    JavaDStream<String> words = lines.flatMap(x -> Arrays.asList(x.split("\\s+"))
        .iterator());

    // Per-batch count of each word.
    JavaPairDStream<String, Integer> wordCounts = words.mapToPair(s -> new Tuple2<>(s, 1))
        .reduceByKey((Function2<Integer, Integer, Integer>) (i1, i2) -> i1 + i2);

    // Add the batch count to the stored running total and emit the new total.
    JavaMapWithStateDStream<String, Integer, Integer, Tuple2<String, Integer>> cumulativeWordCounts =
        wordCounts.mapWithState(StateSpec.function((word, one, state) -> {
            int sum = one.orElse(0) + (state.exists() ? state.get() : 0);
            Tuple2<String, Integer> output = new Tuple2<>(word, sum);
            state.update(sum);
            return output;
        }));

    // One Cassandra save per batch, executed where the data already lives.
    cumulativeWordCounts.foreachRDD(javaRdd -> {
        JavaRDD<Word> wordsToSave = javaRdd.map(tuple -> new Word(tuple._1, tuple._2));
        javaFunctions(wordsToSave).writerBuilder("vocabulary", "words", mapToRow(Word.class))
            .saveToCassandra();
    });

    streamingContext.start();
    streamingContext.awaitTermination();
}

Example #11

Source File: SparkRunner.java From jaeger-analytics-java with Apache License 2.0

4 votes

/**
 * Starts a Prometheus HTTP server, reads spans from a Kafka topic, groups them
 * into traces by trace id and runs every configured model on the graph built
 * from each trace.
 *
 * <p>All settings are read through {@code getPropOrEnv} with the defaults
 * given inline.
 *
 * @param args command-line arguments; not read by this method
 * @throws InterruptedException if the thread is interrupted while awaiting termination
 * @throws IOException declared for the HTTP server construction
 */
public static void main(String []args) throws InterruptedException, IOException {
  int prometheusPort = Integer.parseInt(getPropOrEnv("PROMETHEUS_PORT", "9111"));
  HTTPServer metricsServer = new HTTPServer(prometheusPort);

  SparkConf sparkConf = new SparkConf();
  sparkConf.setAppName("Trace DSL");
  sparkConf.setMaster(getPropOrEnv("SPARK_MASTER","local[*]"));

  JavaSparkContext sc = new JavaSparkContext(sparkConf);
  int batchMillis = Integer.parseInt(getPropOrEnv("SPARK_STREAMING_BATCH_DURATION", "5000"));
  JavaStreamingContext ssc = new JavaStreamingContext(sc, new Duration(batchMillis));

  Set<String> topics = Collections.singleton(getPropOrEnv("KAFKA_JAEGER_TOPIC", "jaeger-spans"));

  Map<String, Object> kafkaParams = new HashMap<>();
  kafkaParams.put("bootstrap.servers", getPropOrEnv("KAFKA_BOOTSTRAP_SERVER", "localhost:9092"));
  kafkaParams.put("key.deserializer", StringDeserializer.class);
  kafkaParams.put("value.deserializer", ProtoSpanDeserializer.class);
  // Hack to always start from the beginning: a new group id on every run.
  kafkaParams.put("group.id", "jaeger-trace-aggregation-" + System.currentTimeMillis());

  boolean startFromBeginning =
      Boolean.parseBoolean(getPropOrEnv("KAFKA_START_FROM_BEGINNING", "true"));
  if (startFromBeginning) {
    kafkaParams.put("auto.offset.reset", "earliest");
    kafkaParams.put("enable.auto.commit", false);
    kafkaParams.put("startingOffsets", "earliest");
  }

  JavaInputDStream<ConsumerRecord<String, Span>> messages =
      KafkaUtils.createDirectStream(
          ssc,
          LocationStrategies.PreferConsistent(),
          ConsumerStrategies.Subscribe(topics, kafkaParams));

  // Key every span by the id of the trace it belongs to.
  JavaPairDStream<String, Span> spansByTraceId =
      messages.mapToPair(record -> new Tuple2<>(record.value().traceId, record.value()));

  // Collect the spans of each trace id into one Trace object.
  JavaDStream<Trace> tracesStream = spansByTraceId.groupByKey().map(grouped -> {
    System.out.printf("traceID: %s\n", grouped._1);
    Iterable<Span> groupedSpans = grouped._2();
    Trace assembled = new Trace();
    assembled.traceId = grouped._1();
    assembled.spans = StreamSupport.stream(groupedSpans.spliterator(), false)
        .collect(Collectors.toList());
    return assembled;
  });

  MinimumClientVersion minimumClientVersion = MinimumClientVersion.builder()
      .withJavaVersion(getPropOrEnv("TRACE_QUALITY_JAVA_VERSION", "1.0.0"))
      .withGoVersion(getPropOrEnv("TRACE_QUALITY_GO_VERSION", "2.22.0"))
      .withNodeVersion(getPropOrEnv("TRACE_QUALITY_NODE_VERSION", "3.17.1"))
      .withPythonVersion(getPropOrEnv("TRACE_QUALITY_PYTHON_VERSION", "4.0.0"))
      .build();

  // Models run on every trace graph; the last three are the trace-quality checks.
  List<ModelRunner> modelRunner = Arrays.asList(
      new TraceHeight(),
      new ServiceDepth(),
      new ServiceHeight(),
      new NetworkLatency(),
      new NumberOfErrors(),
      new DirectDependencies(),
      minimumClientVersion,
      new HasClientServerSpans(),
      new UniqueSpanId());

  tracesStream.foreachRDD((traceRDD, time) -> {
    traceRDD.foreach(trace -> {
      Graph traceGraph = GraphCreator.create(trace);
      for (ModelRunner runner : modelRunner) {
        runner.runWithMetrics(traceGraph);
      }
    });
  });

  ssc.start();
  ssc.awaitTermination();
}

Example #12

Source File: BatchLayer.java From oryx with Apache License 2.0

4 votes

/**
 * Builds the streaming context and Kafka input stream for the Batch Layer,
 * registers the per-batch functions (model update, save to HDFS, offset
 * update, optional old data/model deletion) and starts streaming.
 *
 * <p>The per-batch functions are registered in the same order as before.
 */
public synchronized void start() {
  String layerId = getID();
  if (layerId != null) {
    log.info("Starting Batch Layer {}", layerId);
  }

  streamingContext = buildStreamingContext();
  JavaSparkContext sparkContext = streamingContext.sparkContext();
  Configuration hadoopConf = sparkContext.hadoopConfiguration();

  // Checkpoints go to a ".checkpoint" directory under the model directory.
  Path modelDir = new Path(modelDirString);
  Path checkpointPath = new Path(modelDir, ".checkpoint");
  log.info("Setting checkpoint dir to {}", checkpointPath);
  sparkContext.setCheckpointDir(checkpointPath.toString());

  log.info("Creating message stream from topic");
  JavaInputDStream<ConsumerRecord<K,M>> kafkaDStream = buildInputDStream(streamingContext);
  JavaPairDStream<K,M> pairDStream =
      kafkaDStream.mapToPair(record -> new Tuple2<>(record.key(), record.value()));

  Class<K> keyClass = getKeyClass();
  Class<M> messageClass = getMessageClass();

  // 1. Batch update function.
  pairDStream.foreachRDD(
      new BatchUpdateFunction<>(
          getConfig(),
          keyClass,
          messageClass,
          keyWritableClass,
          messageWritableClass,
          dataDirString,
          modelDirString,
          loadUpdateInstance(),
          streamingContext));

  // 2. "Inline" saveAsNewAPIHadoopFiles, so that saving empty RDDs can be skipped.
  pairDStream.foreachRDD(
      new SaveToHDFSFunction<>(
          dataDirString + "/oryx",
          "data",
          keyClass,
          messageClass,
          keyWritableClass,
          messageWritableClass,
          hadoopConf));

  // 3. Offsets must be read from the raw Kafka stream.
  kafkaDStream.foreachRDD(new UpdateOffsetsFn<>(getGroupID(), getInputTopicLockMaster()));

  // 4. Optional deletion of old data and old models.
  boolean dataExpires = maxDataAgeHours != NO_MAX_AGE;
  if (dataExpires) {
    pairDStream.foreachRDD(
        new DeleteOldDataFn<>(
            hadoopConf, dataDirString, Pattern.compile("-(\\d+)\\."), maxDataAgeHours));
  }
  boolean modelsExpire = maxModelAgeHours != NO_MAX_AGE;
  if (modelsExpire) {
    pairDStream.foreachRDD(
        new DeleteOldDataFn<>(
            hadoopConf, modelDirString, Pattern.compile("(\\d+)"), maxModelAgeHours));
  }

  log.info("Starting Spark Streaming");

  streamingContext.start();
}

Example #13

Source File: SpeedLayer.java From oryx with Apache License 2.0

4 votes

/**
 * Builds the streaming context and Kafka input stream for the Speed Layer,
 * starts a thread that feeds update-topic messages to the model manager,
 * registers the per-batch functions and starts streaming.
 */
public synchronized void start() {
  String layerId = getID();
  if (layerId != null) {
    log.info("Starting Speed Layer {}", layerId);
  }

  streamingContext = buildStreamingContext();
  log.info("Creating message stream from topic");
  JavaInputDStream<ConsumerRecord<K,M>> kafkaDStream = buildInputDStream(streamingContext);
  JavaPairDStream<K,M> pairDStream =
      kafkaDStream.mapToPair(record -> new Tuple2<>(record.key(), record.value()));

  // A random suffix gives this consumer its own group on every start.
  String updateGroupId = "OryxGroup-" + getLayerName() + '-' + UUID.randomUUID();
  KafkaConsumer<String,U> updateConsumer = new KafkaConsumer<>(
      ConfigUtils.keyValueToProperties(
          "group.id", updateGroupId,
          "bootstrap.servers", updateBroker,
          "max.partition.fetch.bytes", maxMessageSize,
          "key.deserializer", "org.apache.kafka.common.serialization.StringDeserializer",
          "value.deserializer", updateDecoderClass.getName(),
          // Do start from the beginning of the update queue
          "auto.offset.reset", "earliest"
      ));
  updateConsumer.subscribe(Collections.singletonList(updateTopic));
  consumerIterator = new ConsumeDataIterator<>(updateConsumer);

  modelManager = loadManagerInstance();
  Configuration hadoopConf = streamingContext.sparkContext().hadoopConfiguration();

  // Updates are consumed on a separate thread; any failure is logged and closes this layer.
  Thread updateThread = new Thread(LoggingCallable.log(() -> {
    try {
      modelManager.consume(consumerIterator, hadoopConf);
    } catch (Throwable t) {
      log.error("Error while consuming updates", t);
      close();
    }
  }).asRunnable(), "OryxSpeedLayerUpdateConsumerThread");
  updateThread.start();

  pairDStream.foreachRDD(new SpeedLayerUpdate<>(modelManager, updateBroker, updateTopic));

  // Offsets must be read from the raw Kafka stream.
  kafkaDStream.foreachRDD(new UpdateOffsetsFn<>(getGroupID(), getInputTopicLockMaster()));

  log.info("Starting Spark Streaming");

  streamingContext.start();
}

Example #14

Source File: SimpleSparkStructuredKafkaStreamingCounter.java From jMetalSP with MIT License

4 votes

/**
 * Subscribes to the configured Kafka topic and, for every non-empty
 * micro-batch, notifies the observers with one integer value taken from that
 * batch.
 *
 * <p>Empty batches are skipped: {@code JavaRDD.reduce} throws
 * {@code UnsupportedOperationException} on an empty RDD, which previously
 * failed the output operation whenever no record arrived in a batch interval.
 */
@Override
public void run() {
    ConsumerStrategy<Integer, Integer> consumerStrategy =
            ConsumerStrategies.Subscribe(topic, kafkaParams);
    LocationStrategy locationStrategy = LocationStrategies.PreferConsistent();

    JavaInputDStream<ConsumerRecord<Integer, Integer>> stream =
            KafkaUtils.createDirectStream(streamingContext, locationStrategy, consumerStrategy);

    // Keep only the record values.
    JavaDStream<Integer> time = stream.map(value -> value.value());

    time.foreachRDD(numbers -> {
        if (numbers.isEmpty()) {
            // Nothing arrived in this batch; reduce() would throw on an empty RDD.
            return;
        }
        // The reducer always keeps its second argument.
        // NOTE(review): which element that ends up being presumably depends on
        // partitioning and evaluation order - confirm if a specific record is required.
        Integer cont = numbers.reduce((key, value) -> value);
        observable.setChanged();
        observable.notifyObservers(new ObservedValue<Integer>(cont));
    });
}

Example #15

Source File: SimpleSparkStructuredKafkaStreamingCounterAVRO.java From jMetalSP with MIT License

4 votes

/**
 * Subscribes to the configured Kafka topic, decodes every record value as an
 * Avro {@code Counter} using the {@code avsc/Counter.avsc} schema, and, for
 * every non-empty micro-batch, prints and publishes one integer value taken
 * from that batch to the observers.
 *
 * <p>Empty batches are skipped: {@code JavaRDD.reduce} throws
 * {@code UnsupportedOperationException} on an empty RDD, which previously
 * failed the output operation whenever no record arrived in a batch interval.
 */
@Override
public void run() {
    ConsumerStrategy<Integer, byte[]> consumerStrategy =
            ConsumerStrategies.Subscribe(topic, kafkaParams);
    LocationStrategy locationStrategy = LocationStrategies.PreferConsistent();

    JavaInputDStream<ConsumerRecord<Integer, byte[]>> stream =
            KafkaUtils.createDirectStream(streamingContext, locationStrategy, consumerStrategy);

    // Decode each payload and keep field 0 of the resulting Counter.
    JavaDStream<Integer> time = stream.map(value -> {
        DataDeserializer<Counter> dataDeserializer = new DataDeserializer<>();
        Counter counter = dataDeserializer.deserialize(value.value(), "avsc/Counter.avsc");
        return (Integer) counter.get(0);
    });

    time.foreachRDD(numbers -> {
        if (numbers.isEmpty()) {
            // Nothing arrived in this batch; reduce() would throw on an empty RDD.
            return;
        }
        // The reducer always keeps its second argument.
        // NOTE(review): which element that ends up being presumably depends on
        // partitioning and evaluation order - confirm if a specific record is required.
        Integer cont = numbers.reduce((key, value) -> value);
        System.out.println("Pruebas----> " + cont);
        observable.setChanged();
        observable.notifyObservers(new ObservedValue<Integer>(cont));
    });
}

Example #16

Source File: KafkaExample.java From Apache-Spark-2x-for-Java-Developers with MIT License

4 votes

/**
 * Reads messages from two Kafka topics, extracts tweet text and hashtags, and
 * prints per-batch and windowed hashtag counts to standard output.
 *
 * <p>The batch interval is two minutes, and every window and slide duration
 * used below is a multiple of it. If the thread is interrupted while waiting
 * for termination, the interrupt flag is restored and the exception is logged
 * on the root logger.
 *
 * @param args command-line arguments; not read by this method
 */
public static void main(String[] args) {
    // Windows-specific property, needed if Hadoop is not installed or HADOOP_HOME is not set
    System.setProperty("hadoop.home.dir", "E:\\hadoop");

    SparkConf conf = new SparkConf().setAppName("KafkaExample").setMaster("local[*]");
    JavaSparkContext sc = new JavaSparkContext(conf);
    JavaStreamingContext streamingContext = new JavaStreamingContext(sc, Durations.minutes(2));
    streamingContext.checkpoint("E:\\hadoop\\checkpoint");

    Logger rootLogger = LogManager.getRootLogger();
    rootLogger.setLevel(Level.WARN);

    Map<String, Object> kafkaParams = new HashMap<>();
    kafkaParams.put("bootstrap.servers", "10.0.75.1:9092");
    kafkaParams.put("key.deserializer", StringDeserializer.class);
    kafkaParams.put("value.deserializer", StringDeserializer.class);
    kafkaParams.put("group.id", "use_a_separate_group_id_for_each_strea");
    kafkaParams.put("auto.offset.reset", "latest");

    Collection<String> topics = Arrays.asList("mytopic", "anothertopic");

    final JavaInputDStream<ConsumerRecord<String, String>> stream = KafkaUtils.createDirectStream(
        streamingContext,
        LocationStrategies.PreferConsistent(),
        ConsumerStrategies.<String, String>Subscribe(topics, kafkaParams));

    JavaPairDStream<String, String> pairRDD = stream.mapToPair(record-> new Tuple2<>(record.key(), record.value()));

    pairRDD.foreachRDD(pRDD-> { pRDD.foreach(tuple-> System.out.println(new Date()+" :: Kafka msg key ::"+tuple._1() +" the val is ::"+tuple._2()));});

    // Message value -> tweet text.
    JavaDStream<String> tweetRDD = pairRDD.map(x-> x._2()).map(new TweetText());

    tweetRDD.foreachRDD(tRDD -> tRDD.foreach(x->System.out.println(new Date()+" :: "+x)));

    // Space-separated tokens of the tweet text that contain '#'.
    JavaDStream<String> hashtagRDD = tweetRDD.flatMap(twt-> Arrays.stream(twt.split(" ")).filter(str-> str.contains("#")).collect(Collectors.toList()).iterator() );

    hashtagRDD.foreachRDD(tRDD -> tRDD.foreach(x->System.out.println(x)));

    // Per-batch hashtag counts.
    JavaPairDStream<String, Long> cntByVal = hashtagRDD.countByValue();

    cntByVal.foreachRDD(tRDD -> tRDD.foreach(x->System.out.println(new Date()+" ::The count tag is ::"+x._1() +" and the val is ::"+x._2())));

    // Windowed hashtag counts with several window/slide combinations.
    hashtagRDD.window(Durations.minutes(8)).countByValue()
        .foreachRDD(tRDD -> tRDD.foreach(x->System.out.println(new Date()+" ::The window count tag is ::"+x._1() +" and the val is ::"+x._2())));
    hashtagRDD.window(Durations.minutes(8),Durations.minutes(2)).countByValue()
        .foreachRDD(tRDD -> tRDD.foreach(x->System.out.println(new Date()+" ::The window count tag is ::"+x._1() +" and the val is ::"+x._2())));
    hashtagRDD.window(Durations.minutes(12),Durations.minutes(8)).countByValue()
        .foreachRDD(tRDD -> tRDD.foreach(x->System.out.println(new Date()+" ::The window count tag is ::"+x._1() +" and the val is ::"+x._2())));
    hashtagRDD.window(Durations.minutes(2),Durations.minutes(2)).countByValue()
        .foreachRDD(tRDD -> tRDD.foreach(x->System.out.println(new Date()+" ::The window count tag is ::"+x._1() +" and the val is ::"+x._2())));
    hashtagRDD.window(Durations.minutes(12),Durations.minutes(12)).countByValue()
        .foreachRDD(tRDD -> tRDD.foreach(x->System.out.println(new Date()+" ::The window count tag is ::"+x._1() +" and the val is ::"+x._2())));

    streamingContext.start();
    try {
        streamingContext.awaitTermination();
    } catch (InterruptedException e) {
        // Restore the interrupt flag so code further up the stack can still observe it.
        Thread.currentThread().interrupt();
        rootLogger.warn("Interrupted while awaiting streaming termination", e);
    }
}

Example #17

Source File: KafkaSource.java From sylph with Apache License 2.0

4 votes

/**
 * Creates a Kafka direct stream for the configured topics and maps every record to a {@link Row}.
 *
 * <p>Keys and values are consumed as raw bytes and Kafka auto-commit is switched off; consumed
 * offset ranges are committed asynchronously from the {@code commitOffsets} hook of the
 * {@code SylphKafkaOffset} wrapper created here.
 *
 * <p>When {@code config.getValueType()} is {@code "json"} (case-insensitive) each record is
 * decoded by a {@link JsonSchema} built from the context schema. Otherwise one value is produced
 * per schema field name: {@code _topic}, {@code _message}, {@code _key}, {@code _partition},
 * {@code _offset}, {@code _timestamp} and {@code _timestampType} are filled from the record and
 * every other field is {@code null}.
 *
 * @param ssc streaming context the direct stream is attached to
 * @param config supplies topics (comma separated), brokers, group id, offset mode, value type
 *         and any additional Kafka consumer properties
 * @param context supplies the output schema
 * @return a stream of rows derived from the Kafka records
 */
public JavaDStream<Row> createSource(JavaStreamingContext ssc, KafkaSourceConfig config, SourceContext context)
{
    String topics = config.getTopics();
    String brokers = config.getBrokers(); // the cluster hosts must be resolvable on the machine running this program
    String groupId = config.getGroupid(); // consumer group name
    String offsetMode = config.getOffsetMode();

    Map<String, Object> kafkaParams = new HashMap<>(config.getOtherConfig());
    kafkaParams.put("bootstrap.servers", brokers);
    kafkaParams.put("key.deserializer", ByteArrayDeserializer.class); //StringDeserializer
    kafkaParams.put("value.deserializer", ByteArrayDeserializer.class); //StringDeserializer
    kafkaParams.put("enable.auto.commit", false); // do not auto-commit offsets
    //      "fetch.message.max.bytes" ->
    //      "session.timeout.ms" -> "30000", // session default is 30 seconds
    //      "heartbeat.interval.ms" -> "5000", // heartbeat period
    kafkaParams.put("group.id", groupId); // note: different streams must use different group.id values, otherwise offset commits fail
    kafkaParams.put("auto.offset.reset", offsetMode); //latest   earliest

    List<String> topicSets = Arrays.asList(topics.split(","));
    JavaInputDStream<ConsumerRecord<byte[], byte[]>> inputStream = KafkaUtils.createDirectStream(
            ssc, LocationStrategies.PreferConsistent(), ConsumerStrategies.Subscribe(topicSets, kafkaParams));

    DStream<ConsumerRecord<byte[], byte[]>> sylphKafkaOffset = new SylphKafkaOffset<ConsumerRecord<byte[], byte[]>>(inputStream.inputDStream())
    {
        @Override
        public void commitOffsets(RDD<?> kafkaRdd)
        {
            OffsetRange[] offsetRanges = ((HasOffsetRanges) kafkaRdd).offsetRanges();
            log().info("commitKafkaOffsets {}", (Object) offsetRanges);
            // Commit through the first DStream of the lineage, which is cast to CanCommitOffsets.
            DStream<?> firstDStream = DStreamUtil.getFirstDStream(inputStream.dstream());
            ((CanCommitOffsets) firstDStream).commitAsync(offsetRanges);
        }
    };

    JavaDStream<ConsumerRecord<byte[], byte[]>> javaDStream = new JavaDStream<>(sylphKafkaOffset, ClassTag$.MODULE$.apply(ConsumerRecord.class));
    if ("json".equalsIgnoreCase(config.getValueType())) {
        JsonSchema jsonParser = new JsonSchema(context.getSchema());
        return javaDStream
                .map(record -> jsonParser.deserialize(record.key(), record.value(), record.topic(), record.partition(), record.offset()));
    }
    else {
        List<String> names = context.getSchema().getFieldNames();
        return javaDStream
                .map(record -> {
                    Object[] values = new Object[names.size()];
                    for (int i = 0; i < names.size(); i++) {
                        // Every case must end with `continue`: falling through would reach
                        // `default` and overwrite the value that was just stored with null.
                        switch (names.get(i)) {
                            case "_topic":
                                values[i] = record.topic();
                                continue;
                            case "_message":
                                values[i] = new String(record.value(), UTF_8);
                                continue;
                            case "_key":
                                values[i] = record.key() == null ? null : new String(record.key(), UTF_8);
                                continue;
                            case "_partition":
                                values[i] = record.partition();
                                continue;
                            case "_offset":
                                values[i] = record.offset();
                                continue;
                            case "_timestamp":
                                values[i] = record.timestamp();
                                continue;
                            case "_timestampType":
                                values[i] = record.timestampType().id;
                                continue;
                            default:
                                values[i] = null;
                        }
                    }
                    return new GenericRow(values);  //GenericRowWithSchema
                });  //.window(Duration(10 * 1000))
    }
}

Example #18

Source File: KafkaSource08.java From sylph with Apache License 2.0

4 votes

/**
 * Optionally wraps the direct stream so that the offset ranges of processed RDDs are handed to
 * a {@code KafkaOffsetCommitter}.
 *
 * <p>If {@code auto.commit.enable} is explicitly {@code "false"} the input stream is returned
 * unchanged. Otherwise a {@code SylphKafkaOffset} wrapper is returned whose committer is named
 * and started when the stream is initialized, and which receives every offset range passed to
 * {@code commitOffsets}. The commit interval is read from
 * {@code ConsumerConfig.AUTO_COMMIT_INTERVAL_MS_CONFIG} and defaults to {@code 90000}.
 *
 * @param inputStream the Kafka direct stream
 * @param kafkaParams consumer properties consulted for the auto-commit settings
 * @param kafkaCluster passed through to the offset committer
 * @param groupId consumer group passed through to the offset committer
 * @return the original stream, or the offset-tracking wrapper around it
 */
private static JavaDStream<ConsumerRecord<byte[], byte[]>> settingCommit(
            JavaInputDStream<ConsumerRecord<byte[], byte[]>> inputStream,
            Map<String, String> kafkaParams,
            KafkaCluster kafkaCluster,
            String groupId)
    {
        final String autoCommitFlag = kafkaParams.getOrDefault("auto.commit.enable", "true");
        if (autoCommitFlag.equals("false")) {
            // Auto-commit was switched off by configuration: no committer is attached.
            return inputStream;
        }

        final String intervalSetting = kafkaParams.getOrDefault(ConsumerConfig.AUTO_COMMIT_INTERVAL_MS_CONFIG, "90000");
        final int intervalMs = Integer.parseInt(intervalSetting);

        final DStream<ConsumerRecord<byte[], byte[]>> offsetTrackingStream =
                new SylphKafkaOffset<ConsumerRecord<byte[], byte[]>>(inputStream.inputDStream())
                {
                    private final KafkaOffsetCommitter committer =
                            new KafkaOffsetCommitter(kafkaCluster, groupId, intervalMs);

                    @Override
                    public void initialize(Time time)
                    {
                        super.initialize(time);
                        // The committer is started only once the stream itself is initialized.
                        committer.setName("Kafka_Offset_Committer");
                        committer.start();
                    }

                    @Override
                    public void commitOffsets(RDD<?> kafkaRdd)
                    {
                        // Queue the RDD's offset ranges with the committer.
                        final HasOffsetRanges ranged = (HasOffsetRanges) kafkaRdd;
                        committer.addAll(ranged.offsetRanges());
                    }
                };

        return new JavaDStream<>(
                offsetTrackingStream,
                ClassTag$.MODULE$.<ConsumerRecord<byte[], byte[]>>apply(ConsumerRecord.class));
    }

Example #19

Source File: KafkaSource08.java From sylph with Apache License 2.0

4 votes

/**
 * Creates a Kafka (0.8 API) direct stream for the configured topics and maps every record to a
 * {@link Row}.
 *
 * <p>Topics, brokers, group id and offset mode are mandatory; a missing one raises
 * {@link NullPointerException} with a descriptive message. Additional consumer properties with
 * a {@code null} value are dropped and the rest are converted to strings. Starting offsets come
 * from {@code getFromOffset}, and the resulting stream is passed through {@code settingCommit}.
 *
 * <p>When {@code config.getValueType()} is {@code "json"} (case-insensitive) each record is
 * decoded by a {@link JsonSchema} built from the context schema. Otherwise one value is produced
 * per schema field name: {@code _topic}, {@code _message}, {@code _key}, {@code _partition} and
 * {@code _offset} are filled from the record and every other field is {@code null}.
 *
 * @param ssc streaming context the direct stream is attached to
 * @param config supplies topics, brokers, group id, offset mode, value type and extra properties
 * @param context supplies the output schema
 * @return a stream of rows derived from the Kafka records
 */
public JavaDStream<Row> createSource(JavaStreamingContext ssc, KafkaSourceConfig08 config, SourceContext context)
{
    String topics = requireNonNull(config.getTopics(), "topics not setting");
    String brokers = requireNonNull(config.getBrokers(), "brokers not setting"); // the cluster hosts must be resolvable on the machine running this program
    String groupId = requireNonNull(config.getGroupid(), "group.id not setting"); // consumer group name
    String offsetMode = requireNonNull(config.getOffsetMode(), "offsetMode not setting");

    Map<String, String> otherConfig = config.getOtherConfig().entrySet()
            .stream()
            .filter(x -> x.getValue() != null)
            .collect(Collectors.toMap(Map.Entry::getKey, v -> v.getValue().toString()));

    Map<String, String> kafkaParams = new HashMap<>(otherConfig);
    kafkaParams.put(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, brokers);
    //kafkaParams.put("auto.commit.enable", true); // do not auto-commit offsets
    //      "fetch.message.max.bytes" ->
    //      "session.timeout.ms" -> "30000", // session default is 30 seconds
    //      "heartbeat.interval.ms" -> "5000", // heartbeat period
    kafkaParams.put(ConsumerConfig.GROUP_ID_CONFIG, groupId); // note: different streams must use different group.id values, otherwise offset commits fail
    kafkaParams.put(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, offsetMode); //largest   smallest

    //----get fromOffsets
    @SuppressWarnings("unchecked")
    scala.collection.immutable.Map<String, String> map = (scala.collection.immutable.Map<String, String>) Map$.MODULE$.apply(JavaConverters.mapAsScalaMapConverter(kafkaParams).asScala().toSeq());
    final KafkaCluster kafkaCluster = new KafkaCluster(map);
    Map<TopicAndPartition, Long> fromOffsets = getFromOffset(kafkaCluster, topics, groupId);

    //--- createDirectStream  DirectKafkaInputDStream.class
    org.apache.spark.api.java.function.Function<MessageAndMetadata<byte[], byte[]>, ConsumerRecord<byte[], byte[]>> messageHandler =
            mmd -> new ConsumerRecord<>(mmd.topic(), mmd.partition(), mmd.key(), mmd.message(), mmd.offset());
    @SuppressWarnings("unchecked")
    Class<ConsumerRecord<byte[], byte[]>> recordClass = (Class<ConsumerRecord<byte[], byte[]>>) ClassTag$.MODULE$.<ConsumerRecord<byte[], byte[]>>apply(ConsumerRecord.class).runtimeClass();
    JavaInputDStream<ConsumerRecord<byte[], byte[]>> inputStream = KafkaUtils.createDirectStream(ssc,
            byte[].class, byte[].class, DefaultDecoder.class, DefaultDecoder.class, recordClass,
            kafkaParams, fromOffsets,
            messageHandler
    );
    JavaDStream<ConsumerRecord<byte[], byte[]>> dStream = settingCommit(inputStream, kafkaParams, kafkaCluster, groupId);

    if ("json".equalsIgnoreCase(config.getValueType())) {
        JsonSchema jsonParser = new JsonSchema(context.getSchema());
        return dStream
                .map(record -> {
                    return jsonParser.deserialize(record.key(), record.value(), record.topic(), record.partition(), record.offset());
                });
    }
    else {
        StructType structType = schemaToSparkType(context.getSchema());
        return dStream
                .map(record -> {
                    String[] names = structType.names();
                    Object[] values = new Object[names.length];
                    for (int i = 0; i < names.length; i++) {
                        // Every case must end with `continue`: falling through would reach
                        // `default` and overwrite the value that was just stored with null.
                        switch (names[i]) {
                            case "_topic":
                                values[i] = record.topic();
                                continue;
                            case "_message":
                                values[i] = new String(record.value(), UTF_8);
                                continue;
                            case "_key":
                                // The key is passed through from the message handler unchecked;
                                // guard against null so keyless records do not throw.
                                values[i] = record.key() == null ? null : new String(record.key(), UTF_8);
                                continue;
                            case "_partition":
                                values[i] = record.partition();
                                continue;
                            case "_offset":
                                values[i] = record.offset();
                                continue;
                            default:
                                values[i] = null;
                        }
                    }
                    return (Row) new GenericRowWithSchema(values, structType);
                });  //.window(Duration(10 * 1000))
    }
}

Example #20

Source File: StreamingRsvpsDStream.java From -Data-Stream-Development-with-Apache-Spark-Kafka-and-Spring-Boot with MIT License

4 votes

/**
 * Consumes the Kafka topics as a direct stream, saves every record whose value does not contain
 * {@code "guests":0} to MongoDB as a parsed document, and asynchronously commits the offset
 * ranges of each batch back through the input stream.
 *
 * @param args unused
 * @throws InterruptedException if interrupted while awaiting termination of the streaming context
 */
public static void main(String[] args) throws InterruptedException {

        System.setProperty("hadoop.home.dir", HADOOP_HOME_DIR_VALUE);

        final SparkConf conf = new SparkConf()
                .setMaster(RUN_LOCAL_WITH_AVAILABLE_CORES)
                .setAppName(APPLICATION_NAME)
                .set("spark.mongodb.output.uri", MONGODB_OUTPUT_URI);

        final Duration batchInterval = new Duration(BATCH_DURATION_INTERVAL_MS);
        final JavaStreamingContext streamingContext = new JavaStreamingContext(conf, batchInterval);

        final JavaInputDStream<ConsumerRecord<String, String>> meetupStream =
                KafkaUtils.createDirectStream(
                        streamingContext,
                        LocationStrategies.PreferConsistent(),
                        ConsumerStrategies.<String, String>Subscribe(TOPICS, KAFKA_CONSUMER_PROPERTIES));

        // Keep only the records whose payload does not contain the literal "guests":0.
        final JavaDStream<ConsumerRecord<String, String>> rsvpsWithGuestsStream =
                meetupStream.filter(rsvp -> !rsvp.value().contains("\"guests\":0"));

        // Persist every remaining record of the batch as a parsed document.
        rsvpsWithGuestsStream.foreachRDD(batch -> {
            final JavaRDD<Document> documents = batch.map(rsvp -> Document.parse(rsvp.value()));
            MongoSpark.save(documents);
        });

        // Registered after the save output: commit the offset ranges of each batch.
        meetupStream.foreachRDD(batch -> {
            final OffsetRange[] consumedRanges = ((HasOffsetRanges) batch.rdd()).offsetRanges();
            final CanCommitOffsets committer = (CanCommitOffsets) meetupStream.inputDStream();
            committer.commitAsync(consumedRanges, new MeetupOffsetCommitCallback());
        });

        streamingContext.start();
        streamingContext.awaitTermination();
    }

Example #21

Source File: SparkMLTrainingAndScoringOnline.java From -Data-Stream-Development-with-Apache-Spark-Kafka-and-Spring-Boot with MIT License

4 votes

/**
 * Trains a streaming logistic regression model on the Kafka stream and prints its predictions.
 *
 * <p>Each record value is parsed as JSON and turned into a labeled-point string of the form
 * {@code (label,[group_lat,group_lon])}, where the label is {@code 1.0} when {@code response}
 * equals {@code "yes"} and {@code 0.0} otherwise. The same labeled points are used for training
 * and for prediction. The offset ranges of each batch are committed asynchronously.
 *
 * @param args unused
 * @throws InterruptedException if interrupted while awaiting termination of the streaming context
 */
public static void main(String[] args) throws InterruptedException {

                System.setProperty("hadoop.home.dir", HADOOP_HOME_DIR_VALUE);

                final SparkConf conf = new SparkConf()
                    .setMaster(RUN_LOCAL_WITH_AVAILABLE_CORES)
                    .setAppName(APPLICATION_NAME)
                    .set("spark.sql.caseSensitive", CASE_SENSITIVE);

                final Duration batchInterval = new Duration(BATCH_DURATION_INTERVAL_MS);
                JavaStreamingContext streamingContext = new JavaStreamingContext(conf, batchInterval);

                JavaInputDStream<ConsumerRecord<String, String>> meetupStream =
                    KafkaUtils.createDirectStream(
                        streamingContext,
                        LocationStrategies.PreferConsistent(),
                        ConsumerStrategies.<String, String>Subscribe(TOPICS, KAFKA_CONSUMER_PROPERTIES));

                JavaDStream<String> meetupStreamValues = meetupStream.map(record -> record.value());

                // Prepare the training data as strings of type: (y,[x1,x2,x3,...,xn])
                // where n is the number of features, y is a binary label,
                // and n must be the same for train and test.
                // e.g. "(response, [group_lat, group_long])"
                JavaDStream<String> trainData = meetupStreamValues.map(payload -> {
                    final JSONObject json = (JSONObject) new JSONParser().parse(payload);
                    final String labelPrefix =
                        "yes".equals(String.valueOf(json.get("response"))) ? "1.0,[" : "0.0,[";
                    final JSONObject group = (JSONObject) json.get("group");

                    return "(" + labelPrefix + group.get("group_lat") + "," + group.get("group_lon") + "])";
                });

                trainData.print();

                JavaDStream<LabeledPoint> labeledPoints = trainData.map(text -> LabeledPoint.parse(text));

                // Two features (latitude, longitude), so the initial weight vector has two zeros.
                StreamingLogisticRegressionWithSGD model =
                    new StreamingLogisticRegressionWithSGD()
                        .setInitialWeights(Vectors.zeros(2));

                model.trainOn(labeledPoints);

                JavaPairDStream<Double, Vector> values =
                    labeledPoints.mapToPair(point -> new Tuple2<>(point.label(), point.features()));

                model.predictOnValues(values).print();

                // Registered after the other outputs: commit the offset ranges of each batch.
                meetupStream.foreachRDD(batch -> {
                    final OffsetRange[] consumedRanges = ((HasOffsetRanges) batch.rdd()).offsetRanges();
                    final CanCommitOffsets committer = (CanCommitOffsets) meetupStream.inputDStream();
                    committer.commitAsync(consumedRanges, new MeetupOffsetCommitCallback());
                });

                streamingContext.start();
                streamingContext.awaitTermination();
        }