Java Code Examples for org.apache.spark.streaming.api.java.JavaPairDStream#foreachRDD()

The following examples show how to use org.apache.spark.streaming.api.java.JavaPairDStream#foreachRDD(). Each example is taken from an open-source project; the heading above each example names the source file, project, and license.
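Before the project examples, here is a minimal, self-contained sketch of the call (not taken from any project below; the class name and data are illustrative). The function passed to foreachRDD() executes on the driver once per batch interval and receives that batch's data as a JavaPairRDD:

import java.util.Arrays;
import java.util.LinkedList;
import java.util.Queue;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.streaming.Durations;
import org.apache.spark.streaming.api.java.JavaPairDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;

import scala.Tuple2;

public class ForeachRDDSketch {
  public static void main(String[] args) throws InterruptedException {
    SparkConf conf = new SparkConf().setMaster("local[2]").setAppName("ForeachRDDSketch");
    JavaStreamingContext jssc = new JavaStreamingContext(conf, Durations.seconds(1));

    // A queue-backed stream keeps the sketch self-contained (no external source needed)
    Queue<JavaRDD<String>> queue = new LinkedList<>();
    queue.add(jssc.sparkContext().parallelize(Arrays.asList("a", "b", "a")));

    JavaPairDStream<String, Integer> counts = jssc.queueStream(queue)
        .mapToPair(w -> new Tuple2<>(w, 1))
        .reduceByKey(Integer::sum);

    // foreachRDD is an output operation: the lambda runs once per batch;
    // only calls that touch the RDD's elements execute on the cluster
    counts.foreachRDD(rdd -> rdd.collect().forEach(t ->
        System.out.println(t._1() + " -> " + t._2())));

    jssc.start();
    jssc.awaitTermination();
  }
}

Note that the body of the function runs on the driver; work is distributed only through RDD operations such as foreach, foreachPartition, or collect. This is why several examples below acquire resources like database connections inside foreachPartition rather than outside it.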
Example 1
Source File: SparkStreamingJob.java    From zipkin-sparkstreaming with Apache License 2.0
static void streamSpansToStorage(
    JavaDStream<byte[]> stream,
    ReadSpans readSpans,
    AdjustAndConsumeSpansSharingTraceId adjustAndConsumeSpansSharingTraceId
) {
  JavaDStream<Span> spans = stream.flatMap(readSpans);

  // TODO: plug in some filter to drop spans regardless of trace ID
  // spans = spans.filter(spanFilter);

  JavaPairDStream<String, Iterable<Span>> tracesById = spans
      .mapToPair(s -> new Tuple2<>(Util.toLowerHex(s.traceIdHigh, s.traceId), s))
      .groupByKey();

  tracesById.foreachRDD(rdd -> {
    rdd.values().foreachPartition(adjustAndConsumeSpansSharingTraceId);
  });
}
 
Example 2
Source File: StreamingEngine.java    From spark-streaming-direct-kafka with Apache License 2.0
public void start() {
    SparkConf sparkConf = getSparkConf();
    streamingContext = new JavaStreamingContext(sparkConf,
            Durations.seconds(Long.parseLong(config.getStreamingBatchIntervalInSec())));
    JavaInputDStream<MessageAndMetadata<String, byte[]>> dStream = buildInputDStream(streamingContext);
    JavaPairDStream<String, byte[]> pairDStream = dStream.mapToPair(km -> new Tuple2<>(km.key(), km.message()));

    pairDStream.foreachRDD(new ProcessStreamingData<>(config)); // process data
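    // then record the offsets consumed in this batch via the ZooKeeper offset manager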
    dStream.foreachRDD(new UpdateOffsetsFn<>(config.getKafkaGroupId(), config.getZkOffsetManager()));
    streamingContext.start();
}
 
Example 3
Source File: BlurLoadSparkProcessor.java    From incubator-retired-blur with Apache License 2.0
public void run() throws IOException {
  SparkConf conf = new SparkConf();
  conf.setAppName(getAppName());
  conf.set(SPARK_SERIALIZER, ORG_APACHE_SPARK_SERIALIZER_KRYO_SERIALIZER);
  JavaSparkUtil.packProjectJars(conf);
  setupSparkConf(conf);

  JavaStreamingContext ssc = new JavaStreamingContext(conf, getDuration());
  List<JavaDStream<T>> streamsList = getStreamsList(ssc);

  // Union all the streams if there is more than 1 stream
  JavaDStream<T> streams = unionStreams(ssc, streamsList);

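  // Key each record's converted RowMutation by its Blur row ID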
  JavaPairDStream<String, RowMutation> pairDStream = streams.mapToPair(new PairFunction<T, String, RowMutation>() {
    public Tuple2<String, RowMutation> call(T t) {
      RowMutation rowMutation = convert(t);
      return new Tuple2<String, RowMutation>(rowMutation.getRowId(), rowMutation);
    }
  });

  pairDStream.foreachRDD(getFunction());

  ssc.start();
  ssc.awaitTermination();
}
 
Example 4
Source File: StreamingService.java    From cxf with Apache License 2.0
private void processStreamOneWay(List<String> inputStrings) {
    try {
        SparkConf sparkConf = new SparkConf().setMaster("local[*]")
            .setAppName("JAX-RS Spark Connect OneWay " + SparkUtils.getRandomId());
        JavaStreamingContext jssc = new JavaStreamingContext(sparkConf, Durations.seconds(1));

        JavaDStream<String> receiverStream = null;
        if ("queue".equals(receiverType)) {
            Queue<JavaRDD<String>> rddQueue = new LinkedList<>();
            for (int i = 0; i < 30; i++) {
                rddQueue.add(jssc.sparkContext().parallelize(inputStrings));
            }
            receiverStream = jssc.queueStream(rddQueue);
        } else {
            receiverStream = jssc.receiverStream(new StringListReceiver(inputStrings));
        }

        JavaPairDStream<String, Integer> wordCounts = SparkUtils.createOutputDStream(receiverStream, false);
        wordCounts.foreachRDD(new PrintOutputFunction(jssc));
        jssc.start();
    } catch (Exception ex) {
        // ignore
    }
}
 
Example 5
Source File: StreamingService.java    From cxf with Apache License 2.0
private void processStream(AsyncResponse async, List<String> inputStrings) {
    try {
        SparkConf sparkConf = new SparkConf().setMaster("local[*]")
            .setAppName("JAX-RS Spark Connect " + SparkUtils.getRandomId());
        JavaStreamingContext jssc = new JavaStreamingContext(sparkConf, Durations.seconds(1));

        SparkStreamingOutput streamOut = new SparkStreamingOutput(jssc);
        SparkStreamingListener sparkListener = new SparkStreamingListener(streamOut);
        jssc.addStreamingListener(sparkListener);

        JavaDStream<String> receiverStream = null;
        if ("queue".equals(receiverType)) {
            Queue<JavaRDD<String>> rddQueue = new LinkedList<>();
            for (int i = 0; i < 30; i++) {
                rddQueue.add(jssc.sparkContext().parallelize(inputStrings));
            }
            receiverStream = jssc.queueStream(rddQueue);
        } else {
            receiverStream = jssc.receiverStream(new StringListReceiver(inputStrings));
        }

        JavaPairDStream<String, Integer> wordCounts = SparkUtils.createOutputDStream(receiverStream, false);
        wordCounts.foreachRDD(new OutputFunction(streamOut));
        jssc.start();

        executor.execute(new SparkJob(async, sparkListener));
    } catch (Exception ex) {
        // the compiler does not allow catching SparkException directly
        if (ex instanceof SparkException) {
            async.cancel(60);
        } else {
            async.resume(new WebApplicationException(ex));
        }
    }
}
 
Example 6
Source File: JavaDirectTalosStreamSuite.java    From galaxy-sdk-java with Apache License 2.0
@Test
public void testDirectStream() throws InterruptedException {
    HashSet<String> sentMessages = new HashSet<String>();
    for (int i = 1; i <= 100; i++)
        sentMessages.add(i + "");
    talosTestUtils.sendMessagesAndWaitForReceive(topic, sentMessages);
    JavaPairDStream<String, String> stream = TalosUtils.createDirectStream(
            ssc, talosTestUtils.javaTalosParams(), talosTestUtils.credential(), new HashSet<String>() {{
                add(topic);
            }}
    );
    final Set<String> result = Collections.synchronizedSet(new HashSet<String>());
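    // Older Spark versions: foreachRDD takes a Function that must return Void
    // (newer versions take a VoidFunction instead)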
    stream.foreachRDD(
            new Function<JavaPairRDD<String, String>, Void>() {
                public Void call(JavaPairRDD<String, String> rdd) throws Exception {
                    Iterator<Tuple2<String, String>> iterator = rdd.collect().iterator();
                    while (iterator.hasNext()) {
                        result.add(iterator.next()._2);
                    }
                    return null;
                }
            }
    );
    ssc.start();
    long startTime = System.currentTimeMillis();
    boolean matches = false;
    while (!matches && System.currentTimeMillis() - startTime < 20000) {
        matches = sentMessages.size() == result.size();
        Thread.sleep(50);
    }
    Assert.assertEquals(sentMessages, result);
    ssc.stop();
}
 
Example 7
Source File: ProcessedOffsetManager.java    From kafka-spark-consumer with Apache License 2.0
@SuppressWarnings("deprecation")
public static void persists(JavaPairDStream<String, Iterable<Long>> partitonOffset, Properties props) {
  partitonOffset.foreachRDD(new VoidFunction<JavaPairRDD<String,Iterable<Long>>>() {
    @Override
    public void call(JavaPairRDD<String, Iterable<Long>> po) throws Exception {
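      // One (partition, offsets) entry per partition, so collecting to the driver is cheap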
      List<Tuple2<String, Iterable<Long>>> poList = po.collect();
      doPersists(poList, props);
    }
  });
}
 
Example 8
Source File: StateLess.java    From sparkResearch with Apache License 2.0
public static void main(String[] args) {
    SparkConf sparkConf = new SparkConf().setMaster("local[2]").setAppName("StateLess");

    JavaStreamingContext streamingContext = new JavaStreamingContext(sparkConf, Durations.seconds(1));

    JavaReceiverInputDStream<String> inputDStream = streamingContext.socketTextStream("localhost", 8080);

    JavaDStream<String> dStream = inputDStream.flatMap((FlatMapFunction<String, String>) s -> Arrays.asList(SPACE.split(s)).iterator());

    JavaPairDStream<String, Integer> pairDStream = dStream.mapToPair(new LogTuple());

    JavaPairDStream<String, Integer> result = pairDStream.reduceByKey(new ReduceIsKey());

    // JOIN: build a second keyed stream and join it with the first
    JavaPairDStream<String, Integer> pairDStream1 = dStream.mapToPair(new LogTuple());

    JavaPairDStream<String, Integer> result1 = pairDStream1.reduceByKey(new ReduceIsKey());

    JavaPairDStream<String, Tuple2<Integer, Integer>> c = result.join(result1);


    result.foreachRDD(rdd -> {
        rdd.foreachPartition(partitionOfRecords -> {
            Connection connection = ConnectionPool.getConnection();
            Tuple2<String, Integer> wordCount;
            while (partitionOfRecords.hasNext()) {
                wordCount = partitionOfRecords.next();
                String sql = "insert into wordcount(word,count) " + "values('" + wordCount._1 + "',"
                        + wordCount._2 + ")";
                Statement stmt = connection.createStatement();
                stmt.executeUpdate(sql);
                stmt.close();
            }
            ConnectionPool.returnConnection(connection);
        });
    });

    try {
        streamingContext.start();
        streamingContext.awaitTermination();
        streamingContext.close();
    } catch (InterruptedException e) {
        e.printStackTrace();
    }

}
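The write loop above creates a new Statement for every record and splices values into the SQL string. A safer variant of the same output pattern (a sketch, reusing the example's ConnectionPool helper and wordcount table) prepares one statement per partition and binds parameters instead:

result.foreachRDD(rdd -> rdd.foreachPartition(records -> {
    Connection connection = ConnectionPool.getConnection();
    // One PreparedStatement per partition; binding parameters per record
    // avoids SQL injection and repeated statement parsing
    try (PreparedStatement stmt = connection.prepareStatement(
            "insert into wordcount(word,count) values (?, ?)")) {
        while (records.hasNext()) {
            Tuple2<String, Integer> wordCount = records.next();
            stmt.setString(1, wordCount._1);
            stmt.setInt(2, wordCount._2);
            stmt.executeUpdate();
        }
    } finally {
        ConnectionPool.returnConnection(connection);
    }
}));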
 
Example 9
Source File: KafkaExample.java    From Apache-Spark-2x-for-Java-Developers with MIT License
public static void main(String[] args) {
  // Windows-specific property, needed if Hadoop is not installed or HADOOP_HOME is not set
  System.setProperty("hadoop.home.dir", "E:\\hadoop");

  SparkConf conf = new SparkConf().setAppName("KafkaExample").setMaster("local[*]");
  JavaSparkContext sc = new JavaSparkContext(conf);
  JavaStreamingContext streamingContext = new JavaStreamingContext(sc, Durations.minutes(2));
  streamingContext.checkpoint("E:\\hadoop\\checkpoint");

  Logger rootLogger = LogManager.getRootLogger();
  rootLogger.setLevel(Level.WARN);

  Map<String, Object> kafkaParams = new HashMap<>();
  kafkaParams.put("bootstrap.servers", "10.0.75.1:9092");
  kafkaParams.put("key.deserializer", StringDeserializer.class);
  kafkaParams.put("value.deserializer", StringDeserializer.class);
  kafkaParams.put("group.id", "use_a_separate_group_id_for_each_strea");
  kafkaParams.put("auto.offset.reset", "latest");
  // kafkaParams.put("enable.auto.commit", false);

  Collection<String> topics = Arrays.asList("mytopic", "anothertopic");

  final JavaInputDStream<ConsumerRecord<String, String>> stream = KafkaUtils.createDirectStream(
      streamingContext,
      LocationStrategies.PreferConsistent(),
      ConsumerStrategies.<String, String>Subscribe(topics, kafkaParams));

  JavaPairDStream<String, String> pairRDD = stream.mapToPair(record -> new Tuple2<>(record.key(), record.value()));

  pairRDD.foreachRDD(pRDD -> pRDD.foreach(tuple ->
      System.out.println(new Date() + " :: Kafka msg key ::" + tuple._1() + " the val is ::" + tuple._2())));

  JavaDStream<String> tweetRDD = pairRDD.map(x -> x._2()).map(new TweetText());

  tweetRDD.foreachRDD(tRDD -> tRDD.foreach(x -> System.out.println(new Date() + " :: " + x)));

  JavaDStream<String> hashtagRDD = tweetRDD.flatMap(twt ->
      Arrays.stream(twt.split(" ")).filter(str -> str.contains("#")).collect(Collectors.toList()).iterator());

  hashtagRDD.foreachRDD(tRDD -> tRDD.foreach(x -> System.out.println(x)));

  JavaPairDStream<String, Long> cntByVal = hashtagRDD.countByValue();

  cntByVal.foreachRDD(tRDD -> tRDD.foreach(x ->
      System.out.println(new Date() + " ::The count tag is ::" + x._1() + " and the val is ::" + x._2())));

  // Windowed counts over several window/slide combinations (window and slide
  // durations must be multiples of the 2-minute batch interval)
  hashtagRDD.window(Durations.minutes(8)).countByValue()
      .foreachRDD(tRDD -> tRDD.foreach(x -> System.out.println(new Date() + " ::The window count tag is ::" + x._1() + " and the val is ::" + x._2())));
  hashtagRDD.window(Durations.minutes(8), Durations.minutes(2)).countByValue()
      .foreachRDD(tRDD -> tRDD.foreach(x -> System.out.println(new Date() + " ::The window count tag is ::" + x._1() + " and the val is ::" + x._2())));
  hashtagRDD.window(Durations.minutes(12), Durations.minutes(8)).countByValue()
      .foreachRDD(tRDD -> tRDD.foreach(x -> System.out.println(new Date() + " ::The window count tag is ::" + x._1() + " and the val is ::" + x._2())));
  hashtagRDD.window(Durations.minutes(2), Durations.minutes(2)).countByValue()
      .foreachRDD(tRDD -> tRDD.foreach(x -> System.out.println(new Date() + " ::The window count tag is ::" + x._1() + " and the val is ::" + x._2())));
  hashtagRDD.window(Durations.minutes(12), Durations.minutes(12)).countByValue()
      .foreachRDD(tRDD -> tRDD.foreach(x -> System.out.println(new Date() + " ::The window count tag is ::" + x._1() + " and the val is ::" + x._2())));

  streamingContext.start();
  try {
    streamingContext.awaitTermination();
  } catch (InterruptedException e) {
    e.printStackTrace();
  }
}
 
Example 10
Source File: JavaRecoverableNetworkWordCount.java    From SparkDemo with MIT License
private static JavaStreamingContext createContext(String ip,
                                                  int port,
                                                  String checkpointDirectory,
                                                  String outputPath) {

  // If you do not see this printed, that means the StreamingContext has been loaded
  // from the new checkpoint
  System.out.println("Creating new context");
  final File outputFile = new File(outputPath);
  if (outputFile.exists()) {
    outputFile.delete();
  }
  SparkConf sparkConf = new SparkConf().setAppName("JavaRecoverableNetworkWordCount");
  // Create the context with a 1 second batch size
  JavaStreamingContext ssc = new JavaStreamingContext(sparkConf, Durations.seconds(1));
  ssc.checkpoint(checkpointDirectory);

  // Create a socket stream on target ip:port and count the
  // words in the input stream of \n-delimited text (e.g. generated by 'nc')
  JavaReceiverInputDStream<String> lines = ssc.socketTextStream(ip, port);
  JavaDStream<String> words = lines.flatMap(new FlatMapFunction<String, String>() {
    @Override
    public Iterator<String> call(String x) {
      return Arrays.asList(SPACE.split(x)).iterator();
    }
  });
  JavaPairDStream<String, Integer> wordCounts = words.mapToPair(
    new PairFunction<String, String, Integer>() {
      @Override
      public Tuple2<String, Integer> call(String s) {
        return new Tuple2<>(s, 1);
      }
    }).reduceByKey(new Function2<Integer, Integer, Integer>() {
      @Override
      public Integer call(Integer i1, Integer i2) {
        return i1 + i2;
      }
    });

  wordCounts.foreachRDD(new VoidFunction2<JavaPairRDD<String, Integer>, Time>() {
    @Override
    public void call(JavaPairRDD<String, Integer> rdd, Time time) throws IOException {
      // Get or register the blacklist Broadcast
      final Broadcast<List<String>> blacklist =
          JavaWordBlacklist.getInstance(new JavaSparkContext(rdd.context()));
      // Get or register the droppedWordsCounter Accumulator
      final LongAccumulator droppedWordsCounter =
          JavaDroppedWordsCounter.getInstance(new JavaSparkContext(rdd.context()));
      // Use blacklist to drop words and use droppedWordsCounter to count them
      String counts = rdd.filter(new Function<Tuple2<String, Integer>, Boolean>() {
        @Override
        public Boolean call(Tuple2<String, Integer> wordCount) {
          if (blacklist.value().contains(wordCount._1())) {
            droppedWordsCounter.add(wordCount._2());
            return false;
          } else {
            return true;
          }
        }
      }).collect().toString();
      String output = "Counts at time " + time + " " + counts;
      System.out.println(output);
      System.out.println("Dropped " + droppedWordsCounter.value() + " word(s) totally");
      System.out.println("Appending to " + outputFile.getAbsolutePath());
      Files.append(output + "\n", outputFile, Charset.defaultCharset());
    }
  });

  return ssc;
}
 
Example 11
Source File: SpeedLayer.java    From oryx with Apache License 2.0
public synchronized void start() {
  String id = getID();
  if (id != null) {
    log.info("Starting Speed Layer {}", id);
  }

  streamingContext = buildStreamingContext();
  log.info("Creating message stream from topic");
  JavaInputDStream<ConsumerRecord<K,M>> kafkaDStream = buildInputDStream(streamingContext);
  JavaPairDStream<K,M> pairDStream =
      kafkaDStream.mapToPair(mAndM -> new Tuple2<>(mAndM.key(), mAndM.value()));

  KafkaConsumer<String,U> consumer = new KafkaConsumer<>(
      ConfigUtils.keyValueToProperties(
          "group.id", "OryxGroup-" + getLayerName() + '-' + UUID.randomUUID(),
          "bootstrap.servers", updateBroker,
          "max.partition.fetch.bytes", maxMessageSize,
          "key.deserializer", "org.apache.kafka.common.serialization.StringDeserializer",
          "value.deserializer", updateDecoderClass.getName(),
          // Do start from the beginning of the update queue
          "auto.offset.reset", "earliest"
      ));
  consumer.subscribe(Collections.singletonList(updateTopic));
  consumerIterator = new ConsumeDataIterator<>(consumer);

  modelManager = loadManagerInstance();
  Configuration hadoopConf = streamingContext.sparkContext().hadoopConfiguration();
  new Thread(LoggingCallable.log(() -> {
    try {
      modelManager.consume(consumerIterator, hadoopConf);
    } catch (Throwable t) {
      log.error("Error while consuming updates", t);
      close();
    }
  }).asRunnable(), "OryxSpeedLayerUpdateConsumerThread").start();

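  // Apply the speed layer's update to each batch of (key, message) pairs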
  pairDStream.foreachRDD(new SpeedLayerUpdate<>(modelManager, updateBroker, updateTopic));

  // Must use the raw Kafka stream to get offsets
  kafkaDStream.foreachRDD(new UpdateOffsetsFn<>(getGroupID(), getInputTopicLockMaster()));

  log.info("Starting Spark Streaming");

  streamingContext.start();
}
 
Example 12
Source File: BatchLayer.java    From oryx with Apache License 2.0
public synchronized void start() {
  String id = getID();
  if (id != null) {
    log.info("Starting Batch Layer {}", id);
  }

  streamingContext = buildStreamingContext();
  JavaSparkContext sparkContext = streamingContext.sparkContext();
  Configuration hadoopConf = sparkContext.hadoopConfiguration();

  Path checkpointPath = new Path(new Path(modelDirString), ".checkpoint");
  log.info("Setting checkpoint dir to {}", checkpointPath);
  sparkContext.setCheckpointDir(checkpointPath.toString());

  log.info("Creating message stream from topic");
  JavaInputDStream<ConsumerRecord<K,M>> kafkaDStream = buildInputDStream(streamingContext);
  JavaPairDStream<K,M> pairDStream =
      kafkaDStream.mapToPair(mAndM -> new Tuple2<>(mAndM.key(), mAndM.value()));

  Class<K> keyClass = getKeyClass();
  Class<M> messageClass = getMessageClass();
  pairDStream.foreachRDD(
      new BatchUpdateFunction<>(getConfig(),
                                keyClass,
                                messageClass,
                                keyWritableClass,
                                messageWritableClass,
                                dataDirString,
                                modelDirString,
                                loadUpdateInstance(),
                                streamingContext));

  // "Inline" saveAsNewAPIHadoopFiles to be able to skip saving empty RDDs
  pairDStream.foreachRDD(new SaveToHDFSFunction<>(
      dataDirString + "/oryx",
      "data",
      keyClass,
      messageClass,
      keyWritableClass,
      messageWritableClass,
      hadoopConf));

  // Must use the raw Kafka stream to get offsets
  kafkaDStream.foreachRDD(new UpdateOffsetsFn<>(getGroupID(), getInputTopicLockMaster()));

  if (maxDataAgeHours != NO_MAX_AGE) {
    pairDStream.foreachRDD(new DeleteOldDataFn<>(hadoopConf,
                                                 dataDirString,
                                                 Pattern.compile("-(\\d+)\\."),
                                                 maxDataAgeHours));
  }
  if (maxModelAgeHours != NO_MAX_AGE) {
    pairDStream.foreachRDD(new DeleteOldDataFn<>(hadoopConf,
                                                 modelDirString,
                                                 Pattern.compile("(\\d+)"),
                                                 maxModelAgeHours));
  }

  log.info("Starting Spark Streaming");

  streamingContext.start();
}
 
Example 13
Source File: Server.java    From cxf with Apache License 2.0
protected Server(String[] args) throws Exception {

        ServerSocket sparkServerSocket = new ServerSocket(9999);
        ServerSocket jaxrsResponseServerSocket = new ServerSocket(10000);
        Socket jaxrsResponseClientSocket = new Socket("localhost", 10000);


        SparkConf sparkConf = new SparkConf().setMaster("local[*]")
            .setAppName("JAX-RS Spark Socket Connect");
        JavaStreamingContext jssc = new JavaStreamingContext(sparkConf, Durations.seconds(1));

        SparkStreamingOutput streamOut = new SparkStreamingOutput(jssc);
        SparkStreamingListener sparkListener = new SparkStreamingListener(streamOut);
        jssc.addStreamingListener(sparkListener);

        JavaDStream<String> receiverStream = jssc.socketTextStream(
            "localhost", 9999, StorageLevels.MEMORY_ONLY);

        JavaPairDStream<String, Integer> wordCounts = SparkUtils.createOutputDStream(receiverStream, true);
        PrintStream sparkResponseOutputStream = new PrintStream(jaxrsResponseClientSocket.getOutputStream(), true);
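        // Stream each batch's word counts back over the JAX-RS response socket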
        wordCounts.foreachRDD(new SocketOutputFunction(sparkResponseOutputStream));

        jssc.start();

        Socket receiverClientSocket = sparkServerSocket.accept();
        PrintStream sparkOutputStream = new PrintStream(receiverClientSocket.getOutputStream(), true);
        BufferedReader sparkInputStream =
            new BufferedReader(new InputStreamReader(jaxrsResponseServerSocket.accept().getInputStream()));


        JAXRSServerFactoryBean sf = new JAXRSServerFactoryBean();

        sf.setResourceClasses(StreamingService.class);
        sf.setResourceProvider(StreamingService.class,
            new SingletonResourceProvider(new StreamingService(sparkInputStream,
                                                                     sparkOutputStream)));
        sf.setAddress("http://localhost:9000/spark");
        sf.create();

        jssc.awaitTermination();
        sparkServerSocket.close();
        jaxrsResponseServerSocket.close();
        jaxrsResponseClientSocket.close();

    }