org.apache.spark.streaming.api.java.JavaDStream Java Examples

The following examples show how to use org.apache.spark.streaming.api.java.JavaDStream. They are drawn from a variety of open-source projects; the originating project and source file are noted above each example.
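Before the project examples, here is a minimal, self-contained sketch of the common pattern most of them follow (the class name, host, and port are illustrative placeholders, not taken from any project below): build a JavaStreamingContext, derive a JavaDStream from a source, apply transformations, register an output operation, then start the context.

import java.util.Arrays;

import org.apache.spark.SparkConf;
import org.apache.spark.streaming.Durations;
import org.apache.spark.streaming.api.java.JavaDStream;
import org.apache.spark.streaming.api.java.JavaPairDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;

import scala.Tuple2;

public class JavaDStreamWordCountSketch {
    public static void main(String[] args) throws InterruptedException {
        // Local streaming context with two threads and a 1-second batch interval
        SparkConf conf = new SparkConf().setAppName("JavaDStreamWordCountSketch").setMaster("local[2]");
        JavaStreamingContext jssc = new JavaStreamingContext(conf, Durations.seconds(1));

        // JavaDStream of text lines from a socket (e.g. fed by `nc -lk 9999`)
        JavaDStream<String> lines = jssc.socketTextStream("localhost", 9999);

        // Split lines into words and count them per batch
        JavaDStream<String> words = lines.flatMap(line -> Arrays.asList(line.split(" ")).iterator());
        JavaPairDStream<String, Integer> counts = words
                .mapToPair(word -> new Tuple2<>(word, 1))
                .reduceByKey((a, b) -> a + b);

        counts.print(); // output operation; required before start()
        jssc.start();
        jssc.awaitTermination();
    }
}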
Example #1
Source File: KafkaStreaming.java    From sparkResearch with Apache License 2.0
public static void main(String[] args) {
    SparkConf sparkConf = new SparkConf().setAppName("KafkaWordCount").setMaster("local[2]");
    JavaStreamingContext streamingContext = new JavaStreamingContext(sparkConf, Durations.seconds(10000));
    // set the checkpoint directory
    streamingContext.checkpoint("HDFS URL");
    Map<String, Integer> topicThread = new HashMap<>(1);
    topicThread.put(TOPIC, THREAD);
    JavaPairInputDStream<String, String> dStream = KafkaUtils.createStream(streamingContext, HOST, GROP, topicThread);

    JavaDStream<String> words = dStream.flatMap((FlatMapFunction<Tuple2<String, String>, String>) stringStringTuple2 -> Arrays.asList(SPACE.split(stringStringTuple2._2)).iterator());

    // word count
    JavaPairDStream<String, Integer> result = words.mapToPair((PairFunction<String, String, Integer>) s -> new Tuple2<>(s, 1)).reduceByKey((Function2<Integer, Integer, Integer>) (v1, v2) -> v1 + v2);

    try {
        result.print();
        streamingContext.start();
        streamingContext.awaitTermination();
    } catch (InterruptedException e) {
        e.printStackTrace();
    }
}
 
Example #2
Source File: ComputeStreamingResponse.java    From incubator-retired-pirk with Apache License 2.0
/**
 * Method to read in data from an allowed input source/format and perform the query
 */
public void performQuery() throws IOException, PIRException
{
  logger.info("Performing query: ");

  JavaDStream<MapWritable> inputRDD = null;
  if (dataInputFormat.equals(InputFormatConst.BASE_FORMAT))
  {
    inputRDD = readData();
  }
  else if (dataInputFormat.equals(InputFormatConst.ES))
  {
    inputRDD = readDataES();
  }
  else
  {
    throw new PIRException("Unknown data input format " + dataInputFormat);
  }

  performQuery(inputRDD);
}
 
Example #3
Source File: DStreamUtil.java    From sylph with Apache License 2.0
public static void dstreamAction(JavaDStream<Row> stream, Sink<JavaRDD<Row>> sink)
{
    DStream<?> fristDStream = getFristDStream(stream.dstream());
    logger.info("数据源驱动:{}", fristDStream.getClass().getName());

    if ("DirectKafkaInputDStream".equals(fristDStream.getClass().getSimpleName())) {
        logger.info("发现job 数据源是kafka,将开启空job优化 且 自动上报offect");
        stream.foreachRDD(rdd -> {
            RDD<?> kafkaRdd = getFristRdd(rdd.rdd()); //rdd.dependencies(0).rdd
            OffsetRange[] offsetRanges = ((HasOffsetRanges) kafkaRdd).offsetRanges();
            if (kafkaRdd.count() > 0) {
                sink.run(rdd); // run the sink's business logic
            }
            ((CanCommitOffsets) fristDStream).commitAsync(offsetRanges);
        });
    }
    else { // non-Kafka data source: no optimization available for now
        stream.foreachRDD(sink::run);
    }
}
 
Example #4
Source File: SparkStreamingJob.java    From zipkin-sparkstreaming with Apache License 2.0
static void streamSpansToStorage(
    JavaDStream<byte[]> stream,
    ReadSpans readSpans,
    AdjustAndConsumeSpansSharingTraceId adjustAndConsumeSpansSharingTraceId
) {
  JavaDStream<Span> spans = stream.flatMap(readSpans);

  // TODO: plug in some filter to drop spans regardless of trace ID
  // spans = spans.filter(spanFilter);

  JavaPairDStream<String, Iterable<Span>> tracesById = spans
      .mapToPair(s -> new Tuple2<>(Util.toLowerHex(s.traceIdHigh, s.traceId), s))
      .groupByKey();

  tracesById.foreachRDD(rdd -> {
    rdd.values().foreachPartition(adjustAndConsumeSpansSharingTraceId);
  });
}
 
Example #5
Source File: StreamingIngestionFileSystemTextFileToDataframeMultipleClassesApp.java    From net.jgp.labs.spark with Apache License 2.0
private void start() {
  // Create a local StreamingContext with two working threads and a
  // batch interval of 5 seconds
  SparkConf conf = new SparkConf().setMaster("local[2]").setAppName(
      "Streaming Ingestion File System Text File to Dataframe");
  JavaStreamingContext jssc = new JavaStreamingContext(conf, Durations
      .seconds(5));

  JavaDStream<String> msgDataStream = jssc.textFileStream(StreamingUtils
      .getInputDirectory());

  msgDataStream.print();
  // Create JavaRDD<Row>
  msgDataStream.foreachRDD(new RowProcessor());

  jssc.start();
  try {
    jssc.awaitTermination();
  } catch (InterruptedException e) {
    // TODO Auto-generated catch block
    e.printStackTrace();
  }
}
 
Example #6
Source File: SparkStreamServiceImpl.java    From searchanalytics-bigdata with MIT License
@Override
	public void startFlumeStream() {
		JavaDStream<SparkFlumeEvent> flumeStream = FlumeUtils.createStream(
				jssc, "localhost", 41111, StorageLevels.MEMORY_AND_DISK);

		QueryStringJDStreams queryStringJDStreams = new QueryStringJDStreams();

		// Run top search query strings stream
		queryStringJDStreams
				.topQueryStringsCountInLastOneHourUsingSparkFlumeEvent(flumeStream);

		// Run top product view stream
		//TODO: uncomment to get both stats.
//		queryStringJDStreams
//				.topProductViewsCountInLastOneHourUsingSparkFlumeEvent(flumeStream);
		jssc.start();
	}
 
Example #7
Source File: StreamingRsvpsDStreamCountWindow.java    From -Data-Stream-Development-with-Apache-Spark-Kafka-and-Spring-Boot with MIT License
public static void main(String[] args) throws InterruptedException {

        System.setProperty("hadoop.home.dir", HADOOP_HOME_DIR_VALUE);

        final SparkConf conf = new SparkConf()
                .setMaster(RUN_LOCAL_WITH_AVAILABLE_CORES)
                .setAppName(APPLICATION_NAME)
                .set("spark.mongodb.output.uri", MONGODB_OUTPUT_URI)
                .set("spark.streaming.kafka.consumer.cache.enabled", "false");

        final JavaStreamingContext streamingContext
                = new JavaStreamingContext(conf, new Duration(BATCH_DURATION_INTERVAL_MS));

        streamingContext.checkpoint(CHECKPOINT_FOLDER);

        final JavaInputDStream<ConsumerRecord<String, String>> meetupStream =
                KafkaUtils.createDirectStream(
                        streamingContext,
                        LocationStrategies.PreferConsistent(),
                        ConsumerStrategies.<String, String>Subscribe(TOPICS, KAFKA_CONSUMER_PROPERTIES)
                );
                
        // transformations, streaming algorithms, etc
        JavaDStream<Long> countStream  
            = meetupStream.countByWindow(
                 new Duration(WINDOW_LENGTH_MS), 
                 new Duration(SLIDING_INTERVAL_MS));        

        countStream.foreachRDD((JavaRDD<Long> countRDD) -> {                
            MongoSpark.save(        
                    countRDD.map(
                        r -> Document.parse("{\"rsvps_count\":\"" + String.valueOf(r) + "\"}")
                    )
            );            
        });
        
        // some time later, after outputs have completed
        meetupStream.foreachRDD((JavaRDD<ConsumerRecord<String, String>> meetupRDD) -> {        
            OffsetRange[] offsetRanges = ((HasOffsetRanges) meetupRDD.rdd()).offsetRanges();            

            ((CanCommitOffsets) meetupStream.inputDStream())
                .commitAsync(offsetRanges, new MeetupOffsetCommitCallback());
        });
        
        streamingContext.start();
        streamingContext.awaitTermination();    
    }
 
Example #8
Source File: ReduceByKeyAndWindow.java    From sparkResearch with Apache License 2.0
public static void main(String[] args) {
    SparkConf sparkConf = new SparkConf().setAppName("reduceByKeyAndWindow").setMaster("local[2]");
    JavaStreamingContext streamingContext = new JavaStreamingContext(sparkConf, Durations.seconds(10));
    // set the checkpoint directory
    streamingContext.checkpoint("hdfs://localhost:9300");
    // data source: socket text stream
    JavaDStream<String> dStream = streamingContext.socketTextStream("localhost", 8080);

    JavaPairDStream<String, Long> ipPairDstream = dStream.mapToPair(new GetIp());

    JavaPairDStream<String, Long> result = ipPairDstream.reduceByKeyAndWindow(new AddLongs(),
            new SubtractLongs(), Durations.seconds(30), Durations.seconds(10));

    try {
        // an output operation is required before start(); print the windowed counts
        result.print();
        streamingContext.start();
        streamingContext.awaitTermination();
    } catch (InterruptedException e) {
        e.printStackTrace();
    }
}
 
Example #9
Source File: StreamingService.java    From cxf with Apache License 2.0
private void processStreamOneWay(List<String> inputStrings) {
    try {
        SparkConf sparkConf = new SparkConf().setMaster("local[*]")
            .setAppName("JAX-RS Spark Connect OneWay " + SparkUtils.getRandomId());
        JavaStreamingContext jssc = new JavaStreamingContext(sparkConf, Durations.seconds(1));

        JavaDStream<String> receiverStream = null;
        if ("queue".equals(receiverType)) {
            Queue<JavaRDD<String>> rddQueue = new LinkedList<>();
            for (int i = 0; i < 30; i++) {
                rddQueue.add(jssc.sparkContext().parallelize(inputStrings));
            }
            receiverStream = jssc.queueStream(rddQueue);
        } else {
            receiverStream = jssc.receiverStream(new StringListReceiver(inputStrings));
        }

        JavaPairDStream<String, Integer> wordCounts = SparkUtils.createOutputDStream(receiverStream, false);
        wordCounts.foreachRDD(new PrintOutputFunction(jssc));
        jssc.start();
    } catch (Exception ex) {
        // ignore
    }
}
 
Example #10
Source File: JavaCustomReceiver.java    From SparkDemo with MIT License
public static void main(String[] args) throws Exception {
  if (args.length < 2) {
    System.err.println("Usage: JavaCustomReceiver <hostname> <port>");
    System.exit(1);
  }

  StreamingExamples.setStreamingLogLevels();

  // Create the context with a 1 second batch size
  SparkConf sparkConf = new SparkConf().setAppName("JavaCustomReceiver");
  JavaStreamingContext ssc = new JavaStreamingContext(sparkConf, new Duration(1000));

  // Create an input stream with the custom receiver on target ip:port and count the
  // words in input stream of \n delimited text (eg. generated by 'nc')
  JavaReceiverInputDStream<String> lines = ssc.receiverStream(
    new JavaCustomReceiver(args[0], Integer.parseInt(args[1])));
  JavaDStream<String> words = lines.flatMap(new FlatMapFunction<String, String>() {
    @Override
    public Iterator<String> call(String x) {
      return Arrays.asList(SPACE.split(x)).iterator();
    }
  });
  JavaPairDStream<String, Integer> wordCounts = words.mapToPair(
    new PairFunction<String, String, Integer>() {
      @Override public Tuple2<String, Integer> call(String s) {
        return new Tuple2<>(s, 1);
      }
    }).reduceByKey(new Function2<Integer, Integer, Integer>() {
      @Override
      public Integer call(Integer i1, Integer i2) {
        return i1 + i2;
      }
    });

  wordCounts.print();
  ssc.start();
  ssc.awaitTermination();
}
 
Example #11
Source File: SparkStreamingFromNetworkExample.java    From SparkOnALog with Apache License 2.0
public static void main(String[] args) {
  if (args.length < 3) {
    System.err.println("Usage: NetworkWordCount <master> <hostname> <port>\n" +
        "In local mode, <master> should be 'local[n]' with n > 1");
    System.exit(1);
  }

  // Create the context with a 5 second batch size
  JavaStreamingContext ssc = new JavaStreamingContext(args[0], "NetworkWordCount",
          new Duration(5000), System.getenv("SPARK_HOME"), System.getenv("SPARK_EXAMPLES_JAR"));

  // Create a NetworkInputDStream on target ip:port and count the
  // words in input stream of \n delimited text (eg. generated by 'nc')
  JavaDStream<String> lines = ssc.socketTextStream(args[1], Integer.parseInt(args[2]));
  
  lines.map(new Function<String, String>() {
    @Override
    public String call(String arg0) throws Exception {
      System.out.println("arg0" + arg0);
      return arg0;
    }
  }).print();
  
  lines.print();
  ssc.start();


}
 
Example #12
Source File: RealtimeTrafficDataProcessor.java    From lambda-arch with Apache License 2.0
/**
 * Method to get windowed traffic counts of different types of vehicles for each route.
 * Window duration = 30 seconds and Slide interval = 10 seconds
 *
 * @param filteredIotDataStream IoT data stream
 */
public void processWindowTrafficData(JavaDStream<IoTData> filteredIotDataStream) {

    // reduce by key and window (30 sec window and 10 sec slide).
    JavaPairDStream<AggregateKey, Long> countDStreamPair = filteredIotDataStream
            .mapToPair(iot -> new Tuple2<>(
                    new AggregateKey(iot.getRouteId(), iot.getVehicleType()),
                    1L
            ))
            .reduceByKeyAndWindow((a, b) -> a + b,
                    Durations.seconds(30),
                    Durations.seconds(10)
            );

    // Transform to dstream of TrafficData
    JavaDStream<WindowTrafficData> trafficDStream = countDStreamPair.map(windowTrafficDataFunc);

    // Map Cassandra table column
    Map<String, String> columnNameMappings = new HashMap<String, String>();
    columnNameMappings.put("routeId", "routeid");
    columnNameMappings.put("vehicleType", "vehicletype");
    columnNameMappings.put("totalCount", "totalcount");
    columnNameMappings.put("timeStamp", "timestamp");
    columnNameMappings.put("recordDate", "recorddate");

    // call CassandraStreamingJavaUtil function to save in DB
    javaFunctions(trafficDStream).writerBuilder(
            "traffickeyspace",
            "window_traffic",
            CassandraJavaUtil.mapToRow(WindowTrafficData.class, columnNameMappings)
    ).saveToCassandra();
}
 
Example #13
Source File: AbstractJavaEsSparkStreamingTest.java    From elasticsearch-hadoop with Apache License 2.0
@Test
public void testEsRDDWriteWithMappingExclude() throws Exception {
    Map<String, Object> trip1 = new HashMap<>();
    trip1.put("reason", "business");
    trip1.put("airport", "SFO");

    Map<String, Object> trip2 = new HashMap<>();
    trip2.put("participants", 5);
    trip2.put("airport", "OTP");

    List<Map<String, Object>> docs = new ArrayList<>();
    docs.add(trip1);
    docs.add(trip2);

    String target = wrapIndex(resource("spark-test-scala-write-exclude", "data", version));

    Map<String, String> localConf = new HashMap<>(cfg);
    localConf.put(ES_MAPPING_EXCLUDE, "airport");

    JavaRDD<Map<String, Object>> batch = sc.parallelize(docs);
    Queue<JavaRDD<Map<String, Object>>> rddQueue = new LinkedList<>();
    rddQueue.add(batch);
    JavaDStream<Map<String, Object>> dstream = ssc.queueStream(rddQueue);
    JavaEsSparkStreaming.saveToEs(dstream, target, localConf);
    ssc.start();
    TimeUnit.SECONDS.sleep(2);
    ssc.stop(false, true);

    assertTrue(RestUtils.exists(target));
    assertThat(RestUtils.get(target + "/_search?"), containsString("business"));
    assertThat(RestUtils.get(target +  "/_search?"), containsString("participants"));
    assertThat(RestUtils.get(target +  "/_search?"), not(containsString("airport")));
}
 
Example #14
Source File: WordCountSocketStateful.java    From Apache-Spark-2x-for-Java-Developers with MIT License
public static void main(String[] args) throws Exception {
 System.setProperty("hadoop.home.dir", "E:\\hadoop");

   SparkConf sparkConf = new SparkConf().setAppName("WordCountSocketEx").setMaster("local[*]");
   JavaStreamingContext streamingContext = new JavaStreamingContext(sparkConf, Durations.seconds(1));
   streamingContext.checkpoint("E:\\hadoop\\checkpoint");
// Initial state RDD input to mapWithState
   @SuppressWarnings("unchecked")
   List<Tuple2<String, Integer>> tuples =Arrays.asList(new Tuple2<>("hello", 1), new Tuple2<>("world", 1));
   JavaPairRDD<String, Integer> initialRDD = streamingContext.sparkContext().parallelizePairs(tuples);
   
   JavaReceiverInputDStream<String> StreamingLines = streamingContext.socketTextStream( "10.0.75.1", Integer.parseInt("9000"), StorageLevels.MEMORY_AND_DISK_SER);
   
   JavaDStream<String> words = StreamingLines.flatMap( str -> Arrays.asList(str.split(" ")).iterator() );
  
   JavaPairDStream<String, Integer> wordCounts = words.mapToPair(str-> new Tuple2<>(str, 1)).reduceByKey((count1,count2) ->count1+count2 );
  


  // Update the cumulative count function
  Function3<String, Optional<Integer>, State<Integer>, Tuple2<String, Integer>> mappingFunc =
      new Function3<String, Optional<Integer>, State<Integer>, Tuple2<String, Integer>>() {
        @Override
        public Tuple2<String, Integer> call(String word, Optional<Integer> one,
            State<Integer> state) {
          int sum = one.orElse(0) + (state.exists() ? state.get() : 0);
          Tuple2<String, Integer> output = new Tuple2<>(word, sum);
          state.update(sum);
          return output;
        }
      };

  // DStream of cumulative counts that get updated in every batch
  JavaMapWithStateDStream<String, Integer, Integer, Tuple2<String, Integer>> stateDstream = wordCounts.mapWithState(StateSpec.function(mappingFunc).initialState(initialRDD));

  stateDstream.print();
  streamingContext.start();
  streamingContext.awaitTermination();
}
 
Example #15
Source File: FileStreamingEx.java    From Apache-Spark-2x-for-Java-Developers with MIT License
public static void main(String[] args) {
   	// Windows-specific property if Hadoop is not installed or HADOOP_HOME is not set
	 System.setProperty("hadoop.home.dir", "E:\\hadoop");
   	//Logger rootLogger = LogManager.getRootLogger();
  		//rootLogger.setLevel(Level.WARN); 
       SparkConf conf = new SparkConf().setAppName("KafkaExample").setMaster("local[*]");
       String inputDirectory="E:\\hadoop\\streamFolder\\";
    
       JavaSparkContext sc = new JavaSparkContext(conf);
       JavaStreamingContext streamingContext = new JavaStreamingContext(sc, Durations.seconds(1));
      // streamingContext.checkpoint("E:\\hadoop\\checkpoint");
       Logger rootLogger = LogManager.getRootLogger();
  		rootLogger.setLevel(Level.WARN); 
  		
  		JavaDStream<String> streamfile = streamingContext.textFileStream(inputDirectory);
  		streamfile.print();
  		streamfile.foreachRDD(rdd-> rdd.foreach(x -> System.out.println(x)));
  		
  			   		
  		JavaPairDStream<LongWritable, Text> streamedFile = streamingContext.fileStream(inputDirectory, LongWritable.class, Text.class, TextInputFormat.class);
  	 streamedFile.print();
  		
  	 streamingContext.start();
  	 

       try {
		streamingContext.awaitTermination();
	} catch (InterruptedException e) {
		// TODO Auto-generated catch block
		e.printStackTrace();
	}
}
 
Example #16
Source File: StreamingTransformTranslator.java    From beam with Apache License 2.0
private static <K, V, W extends BoundedWindow> TransformEvaluator<Reshuffle<K, V>> reshuffle() {
  return new TransformEvaluator<Reshuffle<K, V>>() {
    @Override
    public void evaluate(Reshuffle<K, V> transform, EvaluationContext context) {
      @SuppressWarnings("unchecked")
      UnboundedDataset<KV<K, V>> inputDataset =
          (UnboundedDataset<KV<K, V>>) context.borrowDataset(transform);
      List<Integer> streamSources = inputDataset.getStreamSources();
      JavaDStream<WindowedValue<KV<K, V>>> dStream = inputDataset.getDStream();
      final KvCoder<K, V> coder = (KvCoder<K, V>) context.getInput(transform).getCoder();
      @SuppressWarnings("unchecked")
      final WindowingStrategy<?, W> windowingStrategy =
          (WindowingStrategy<?, W>) context.getInput(transform).getWindowingStrategy();
      @SuppressWarnings("unchecked")
      final WindowFn<Object, W> windowFn = (WindowFn<Object, W>) windowingStrategy.getWindowFn();

      final WindowedValue.WindowedValueCoder<KV<K, V>> wvCoder =
          WindowedValue.FullWindowedValueCoder.of(coder, windowFn.windowCoder());

      JavaDStream<WindowedValue<KV<K, V>>> reshuffledStream =
          dStream.transform(rdd -> GroupCombineFunctions.reshuffle(rdd, wvCoder));

      context.putDataset(transform, new UnboundedDataset<>(reshuffledStream, streamSources));
    }

    @Override
    public String toNativeString() {
      return "repartition(...)";
    }
  };
}
 
Example #17
Source File: AbstractJavaEsSparkStreamingTest.java    From elasticsearch-hadoop with Apache License 2.0
@Test
public void testMultiIndexRDDWrite() throws Exception {
    Map<String, Object> trip1 = new HashMap<>();
    trip1.put("reason", "business");
    trip1.put("airport", "sfo");

    Map<String, Object> trip2 = new HashMap<>();
    trip2.put("participants", 5);
    trip2.put("airport", "otp");

    List<Map<String, Object>> docs = new ArrayList<>();
    docs.add(trip1);
    docs.add(trip2);

    String target = wrapIndex(resource("spark-test-trip-{airport}", "data", version));

    JavaRDD<Map<String, Object>> batch = sc.parallelize(docs);
    Queue<JavaRDD<Map<String, Object>>> rddQueue = new LinkedList<>();
    rddQueue.add(batch);
    JavaDStream<Map<String, Object>> dstream = ssc.queueStream(rddQueue);
    JavaEsSparkStreaming.saveToEs(dstream, target, cfg);
    ssc.start();
    TimeUnit.SECONDS.sleep(2);
    ssc.stop(false, true);

    assertTrue(RestUtils.exists(wrapIndex(resource("spark-test-trip-otp", "data", version))));
    assertTrue(RestUtils.exists(wrapIndex(resource("spark-test-trip-sfo", "data", version))));

    assertThat(RestUtils.get(wrapIndex(resource("spark-test-trip-sfo", "data", version) + "/_search?")), containsString("business"));
    assertThat(RestUtils.get(wrapIndex(resource("spark-test-trip-otp", "data", version) + "/_search?")), containsString("participants"));
}
 
Example #18
Source File: WordCountSocketJava8Ex.java    From Apache-Spark-2x-for-Java-Developers with MIT License
public static void main(String[] args) throws Exception {
 
     System.setProperty("hadoop.home.dir", "E:\\hadoop");
	
  SparkConf sparkConf = new SparkConf().setAppName("WordCountSocketEx").setMaster("local[*]");
  JavaStreamingContext streamingContext = new JavaStreamingContext(sparkConf, Durations.seconds(1));
  
  List<Tuple2<String, Integer>> tuples = Arrays.asList(new Tuple2<>("hello", 10), new Tuple2<>("world", 10));
  JavaPairRDD<String, Integer> initialRDD = streamingContext.sparkContext().parallelizePairs(tuples);
    

  JavaReceiverInputDStream<String> StreamingLines = streamingContext.socketTextStream( "10.0.75.1", Integer.parseInt("9000"), StorageLevels.MEMORY_AND_DISK_SER);
  
  JavaDStream<String> words = StreamingLines.flatMap( str -> Arrays.asList(str.split(" ")).iterator() );
 
  JavaPairDStream<String, Integer> wordCounts = words.mapToPair(str-> new Tuple2<>(str, 1)).reduceByKey((count1,count2) ->count1+count2 );
 
  wordCounts.print();
  
JavaPairDStream<String, Integer> joinedDstream = wordCounts.transformToPair(
   new Function<JavaPairRDD<String, Integer>, JavaPairRDD<String, Integer>>() {
	    @Override public JavaPairRDD<String, Integer> call(JavaPairRDD<String, Integer> rdd) throws Exception {
	    	// join each batch's counts with the initial RDD and sum the two counts
	    	return rdd.join(initialRDD).mapToPair(new PairFunction<Tuple2<String, Tuple2<Integer, Integer>>, String, Integer>() {
				@Override
				public Tuple2<String, Integer> call(Tuple2<String, Tuple2<Integer, Integer>> joinedTuple)
						throws Exception {
					return new Tuple2<>(joinedTuple._1(), joinedTuple._2()._1() + joinedTuple._2()._2());
				}
			});
	    }
	  });
 
joinedDstream.print();
  streamingContext.start();
  streamingContext.awaitTermination();
}
 
Example #19
Source File: WordCountRecoverableEx.java    From Apache-Spark-2x-for-Java-Developers with MIT License
protected static JavaStreamingContext createContext(String ip, int port, String checkpointDirectory) {
	SparkConf sparkConf = new SparkConf().setAppName("WordCountRecoverableEx").setMaster("local[*]");
	JavaStreamingContext streamingContext = new JavaStreamingContext(sparkConf, Durations.seconds(1));
	streamingContext.checkpoint(checkpointDirectory);
	// Initial state RDD input to mapWithState
	@SuppressWarnings("unchecked")
	List<Tuple2<String, Integer>> tuples = Arrays.asList(new Tuple2<>("hello", 1), new Tuple2<>("world", 1));
	JavaPairRDD<String, Integer> initialRDD = streamingContext.sparkContext().parallelizePairs(tuples);

	JavaReceiverInputDStream<String> StreamingLines = streamingContext.socketTextStream(ip,port, StorageLevels.MEMORY_AND_DISK_SER);

	JavaDStream<String> words = StreamingLines.flatMap(str -> Arrays.asList(str.split(" ")).iterator());

	JavaPairDStream<String, Integer> wordCounts = words.mapToPair(str -> new Tuple2<>(str, 1))
			.reduceByKey((count1, count2) -> count1 + count2);

	// Update the cumulative count function
	Function3<String, Optional<Integer>, State<Integer>, Tuple2<String, Integer>> mappingFunc = new Function3<String, Optional<Integer>, State<Integer>, Tuple2<String, Integer>>() {
		@Override
		public Tuple2<String, Integer> call(String word, Optional<Integer> one, State<Integer> state) {
			int sum = one.orElse(0) + (state.exists() ? state.get() : 0);
			Tuple2<String, Integer> output = new Tuple2<>(word, sum);
			state.update(sum);
			return output;
		}
	};

	// DStream of cumulative counts that get updated in every batch
	JavaMapWithStateDStream<String, Integer, Integer, Tuple2<String, Integer>> stateDstream = wordCounts
			.mapWithState(StateSpec.function(mappingFunc).initialState(initialRDD));

	stateDstream.print();
	return streamingContext;
}
 
Example #20
Source File: SparkGroupAlsoByWindowViaWindowSet.java    From beam with Apache License 2.0
public static <K, InputT, W extends BoundedWindow>
    JavaDStream<WindowedValue<KV<K, Iterable<InputT>>>> groupByKeyAndWindow(
        final JavaDStream<WindowedValue<KV<K, InputT>>> inputDStream,
        final Coder<K> keyCoder,
        final Coder<WindowedValue<InputT>> wvCoder,
        final WindowingStrategy<?, W> windowingStrategy,
        final SerializablePipelineOptions options,
        final List<Integer> sourceIds,
        final String transformFullName) {

  final PairDStreamFunctions<ByteArray, byte[]> pairDStream =
      buildPairDStream(inputDStream, keyCoder, wvCoder);

  // use updateStateByKey to scan through the state and update elements and timers.
  final UpdateStateByKeyFunction<K, InputT, W> updateFunc =
      new UpdateStateByKeyFunction<>(
          sourceIds,
          windowingStrategy,
          (FullWindowedValueCoder<InputT>) wvCoder,
          keyCoder,
          options,
          transformFullName);

  final DStream<
          Tuple2</*K*/ ByteArray, Tuple2<StateAndTimers, /*WV<KV<K, Itr<I>>>*/ List<byte[]>>>>
      firedStream =
          pairDStream.updateStateByKey(
              updateFunc,
              pairDStream.defaultPartitioner(pairDStream.defaultPartitioner$default$1()),
              true,
              JavaSparkContext$.MODULE$.fakeClassTag());

  checkpointIfNeeded(firedStream, options);

  // filter state-only output (nothing to fire) and remove the state from the output.
  return stripStateValues(firedStream, keyCoder, (FullWindowedValueCoder<InputT>) wvCoder);
}
 
Example #21
Source File: TestStreamingStep.java    From envelope with Apache License 2.0
@Override
public JavaDStream<?> getDStream() throws Exception {
  Queue<JavaRDD<String>> queue = new LinkedList<>();
  queue.add(generateRDD());
  JavaDStream<String> dstream = Contexts.getJavaStreamingContext().queueStream(queue);
  return dstream;
}
 
Example #22
Source File: AbstractJavaEsSparkStreamingTest.java    From elasticsearch-hadoop with Apache License 2.0
@Test
public void testEsRDDWriteWithMappingExclude() throws Exception {
    Map<String, Object> trip1 = new HashMap<>();
    trip1.put("reason", "business");
    trip1.put("airport", "SFO");

    Map<String, Object> trip2 = new HashMap<>();
    trip2.put("participants", 5);
    trip2.put("airport", "OTP");

    List<Map<String, Object>> docs = new ArrayList<>();
    docs.add(trip1);
    docs.add(trip2);

    String target = wrapIndex(resource("spark-streaming-test-scala-write-exclude", "data", version));

    Map<String, String> localConf = new HashMap<>(cfg);
    localConf.put(ES_MAPPING_EXCLUDE, "airport");

    JavaRDD<Map<String, Object>> batch = sc.parallelize(docs);
    Queue<JavaRDD<Map<String, Object>>> rddQueue = new LinkedList<>();
    rddQueue.add(batch);
    JavaDStream<Map<String, Object>> dstream = ssc.queueStream(rddQueue);
    JavaEsSparkStreaming.saveToEs(dstream, target, localConf);
    ssc.start();
    TimeUnit.SECONDS.sleep(2);
    ssc.stop(false, true);

    assertTrue(RestUtils.exists(target));
    assertThat(RestUtils.get(target + "/_search?"), containsString("business"));
    assertThat(RestUtils.get(target +  "/_search?"), containsString("participants"));
    assertThat(RestUtils.get(target +  "/_search?"), not(containsString("airport")));
}
 
Example #23
Source File: SparkGroupAlsoByWindowViaWindowSet.java    From beam with Apache License 2.0
private static <K, InputT> PairDStreamFunctions<ByteArray, byte[]> buildPairDStream(
    final JavaDStream<WindowedValue<KV<K, InputT>>> inputDStream,
    final Coder<K> keyCoder,
    final Coder<WindowedValue<InputT>> wvCoder) {

  // we have to switch to Scala API to avoid Optional in the Java API, see: SPARK-4819.
  // we also have a broader API for Scala (access to the actual key and entire iterator).
  // we use coders to convert objects in the PCollection to byte arrays, so they
  // can be transferred over the network for the shuffle and be in serialized form
  // for checkpointing.
  // for readability, we add comments with actual type next to byte[].
  // to shorten line length, we use:
  // ---- WV: WindowedValue
  // ---- Iterable: Itr
  // ---- AccumT: A
  // ---- InputT: I
  final DStream<Tuple2<ByteArray, byte[]>> tupleDStream =
      inputDStream
          .map(new ReifyTimestampsAndWindowsFunction<>())
          .mapToPair(TranslationUtils.toPairFunction())
          .mapToPair(CoderHelpers.toByteFunction(keyCoder, wvCoder))
          .dstream();

  return DStream.toPairDStreamFunctions(
      tupleDStream,
      JavaSparkContext$.MODULE$.fakeClassTag(),
      JavaSparkContext$.MODULE$.fakeClassTag(),
      null);
}
 
Example #24
Source File: KafkaStreamFactory.java    From zipkin-sparkstreaming with Apache License 2.0
@Override public JavaDStream<byte[]> create(JavaStreamingContext jsc) {
  return KafkaUtils.createDirectStream(
      jsc,
      byte[].class,
      byte[].class,
      DefaultDecoder.class,
      DefaultDecoder.class,
      kafkaParams(),
      Collections.singleton(topic()))
      .map(m -> m._2); // get value
}
 
Example #25
Source File: ComputeStreamingResponse.java    From incubator-retired-pirk with Apache License 2.0
/**
 * Method to perform the query given an input JavaDStream of JSON
 * 
 */
public void performQuery(JavaDStream<MapWritable> input)
{
  logger.info("Performing query: ");

  // Process non-overlapping windows of data of duration windowLength seconds
  // If we are using queue streams, there is no need to window
  if (!useQueueStream)
  {
    input = input.window(Durations.seconds(windowLength), Durations.seconds(windowLength)); // reassign so the windowing is actually applied
  }

  // Extract the selectors for each dataElement based upon the query type
  // and perform a keyed hash of the selectors
  JavaPairDStream<Integer,List<BigInteger>> selectorHashToDocRDD = input.mapToPair(new HashSelectorsAndPartitionData(bVars));

  // Group by hashed selector (row) -- can combine with the line above, separating for testing and benchmarking...
  JavaPairDStream<Integer,Iterable<List<BigInteger>>> selectorGroupRDD = selectorHashToDocRDD.groupByKey();

  // Calculate the encrypted row values for each row, emit <colNum, colVal> for each row
  JavaPairDStream<Long,BigInteger> encRowRDD = selectorGroupRDD.flatMapToPair(new EncRowCalc(accum, bVars));

  // Multiply the column values by colNum: emit <colNum, finalColVal> and write the final result object
  encryptedColumnCalc(encRowRDD);

  // Start the streaming computation
  start();
}
 
Example #26
Source File: KafkaInput.java    From envelope with Apache License 2.0
@Override
public JavaDStream<?> getDStream() throws Exception {
  if (dStream == null) {
    JavaStreamingContext jssc = Contexts.getJavaStreamingContext();
    Map<TopicPartition, Long> lastOffsets = null;
    if (doesRecordProgress(config) && !usingKafkaManagedOffsets(config)) {
      lastOffsets = getLastOffsets();
    }

    if (lastOffsets != null) {
      dStream = KafkaUtils.createDirectStream(jssc, LocationStrategies.PreferConsistent(),
          ConsumerStrategies.Subscribe(topics, kafkaParams, lastOffsets));
    } else {
      dStream = KafkaUtils.createDirectStream(jssc, LocationStrategies.PreferConsistent(),
          ConsumerStrategies.Subscribe(topics, kafkaParams));
    }

    if (ConfigUtils.getOrElse(config, WINDOW_ENABLED_CONFIG, false)) {
      int windowDuration = config.getInt(WINDOW_MILLISECONDS_CONFIG);
      if (config.hasPath(WINDOW_SLIDE_MILLISECONDS_CONFIG)) {
        int slideDuration = config.getInt(WINDOW_SLIDE_MILLISECONDS_CONFIG);
        dStream = dStream.window(new Duration(windowDuration), new Duration(slideDuration));
      } else {
        dStream = dStream.window(new Duration(windowDuration));
      }
    }
  }

  return dStream;
}
 
Example #27
Source File: ProcessedOffsetManager.java    From kafka-spark-consumer with Apache License 2.0
@SuppressWarnings("deprecation")
public static void persists(DStream<Tuple2<String, Iterable<Long>>> partitonOffset, Properties props) {
  ClassTag<Tuple2<String, Iterable<Long>>> tuple2ClassTag =
      ScalaUtil.<String, Iterable<Long>>getTuple2ClassTag();
  JavaDStream<Tuple2<String, Iterable<Long>>> jpartitonOffset =
      new JavaDStream<Tuple2<String, Iterable<Long>>>(partitonOffset, tuple2ClassTag);
  jpartitonOffset.foreachRDD(new VoidFunction<JavaRDD<Tuple2<String, Iterable<Long>>>>() {
    @Override
    public void call(JavaRDD<Tuple2<String, Iterable<Long>>> po) throws Exception {
      List<Tuple2<String, Iterable<Long>>> poList = po.collect();
      doPersists(poList, props);
    }
  });
}
 
Example #28
Source File: DummyStreamInput.java    From envelope with Apache License 2.0
@Override
public JavaDStream<Long> getDStream() throws Exception {
  List<Long> list = Lists.newArrayList();
  for (int i = 0; i < rowsPerBatch; i++) {
    list.add(counter++);
  }
  JavaRDD<Long> longs = Contexts.getJavaStreamingContext().sparkContext().parallelize(list);
  Queue<JavaRDD<Long>> queue = Queues.newLinkedBlockingQueue();
  queue.add(longs);
  LOG.info("Created stream queue with {} rows", list.size());
  return Contexts.getJavaStreamingContext().queueStream(queue, true);
}
 
Example #29
Source File: SparkUtils.java    From cxf with Apache License 2.0
public static JavaPairDStream<String, Integer> createOutputDStream(
    JavaDStream<String> receiverStream, boolean withId) {
    final JavaDStream<String> words =
        receiverStream.flatMap(x -> withId ? splitInputStringWithId(x) : splitInputString(x));

    final JavaPairDStream<String, Integer> pairs = words.mapToPair(s -> {
        return new Tuple2<String, Integer>(s, 1);
    });
    return pairs.reduceByKey((i1, i2) -> {
        return i1 + i2;
    });
}
 
Example #30
Source File: IoTTrafficDataProcessor.java    From iot-traffic-monitor with Apache License 2.0
/**
 * Method to get the vehicles which are in radius of POI and their distance from POI.
 * 
 * @param nonFilteredIotDataStream original IoT data stream
 * @param broadcastPOIValues variable containing POI coordinates, route and vehicle types to monitor.
 */
public void processPOIData(JavaDStream<IoTData> nonFilteredIotDataStream, Broadcast<Tuple3<POIData, String, String>> broadcastPOIValues) {

	// Filter by routeId, vehicleType and POI radius
	JavaDStream<IoTData> iotDataStreamFiltered = nonFilteredIotDataStream
			.filter(iot -> (iot.getRouteId().equals(broadcastPOIValues.value()._2())
					&& iot.getVehicleType().contains(broadcastPOIValues.value()._3())
					&& GeoDistanceCalculator.isInPOIRadius(Double.valueOf(iot.getLatitude()),
							Double.valueOf(iot.getLongitude()), broadcastPOIValues.value()._1().getLatitude(),
							broadcastPOIValues.value()._1().getLongitude(),
							broadcastPOIValues.value()._1().getRadius())));

	// pair with poi
	JavaPairDStream<IoTData, POIData> poiDStreamPair = iotDataStreamFiltered
			.mapToPair(iot -> new Tuple2<>(iot, broadcastPOIValues.value()._1()));

	// Transform to dstream of POITrafficData
	JavaDStream<POITrafficData> trafficDStream = poiDStreamPair.map(poiTrafficDataFunc);

	// Map Cassandra table column
	Map<String, String> columnNameMappings = new HashMap<String, String>();
	columnNameMappings.put("vehicleId", "vehicleid");
	columnNameMappings.put("distance", "distance");
	columnNameMappings.put("vehicleType", "vehicletype");
	columnNameMappings.put("timeStamp", "timestamp");

	// call CassandraStreamingJavaUtil function to save in DB
	javaFunctions(trafficDStream)
			.writerBuilder("traffickeyspace", "poi_traffic",CassandraJavaUtil.mapToRow(POITrafficData.class, columnNameMappings))
			.withConstantTTL(120)//keeping data for 2 minutes
			.saveToCassandra();
}