org.apache.spark.streaming.Durations Java Examples

The following examples show how to use org.apache.spark.streaming.Durations. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: KafkaStreaming.java    From sparkResearch with Apache License 2.0 8 votes vote down vote up
public static void main(String[] args) {
    SparkConf sparkConf = new SparkConf().setAppName("KafkaWordCount").setMaster("local[2]");
    JavaStreamingContext streamingContext = new JavaStreamingContext(sparkConf, Durations.seconds(10000));
    //设置检查点
    streamingContext.checkpoint("HDFS URL");
    Map<String, Integer> topicThread = new HashMap<>(1);
    topicThread.put(TOPIC, THREAD);
    JavaPairInputDStream<String, String> dStream = KafkaUtils.createStream(streamingContext, HOST, GROP, topicThread);

    JavaDStream<String> words = dStream.flatMap((FlatMapFunction<Tuple2<String, String>, String>) stringStringTuple2 -> Arrays.asList(SPACE.split(stringStringTuple2._2)).iterator());

    //统计
    JavaPairDStream<String, Integer> result = words.mapToPair((PairFunction<String, String, Integer>) s -> new Tuple2<>(s, 1)).reduceByKey((Function2<Integer, Integer, Integer>) (v1, v2) -> v1 + v2);

    try {
        result.print();
        streamingContext.start();
        streamingContext.awaitTermination();
    } catch (InterruptedException e) {
        e.printStackTrace();
    }
}
 
Example #2
Source File: StreamingEngine.java    From spark-streaming-direct-kafka with Apache License 2.0 6 votes vote down vote up
public void start() {
    SparkConf sparkConf = getSparkConf();
    streamingContext = new JavaStreamingContext(sparkConf,
            Durations.seconds(Long.parseLong(config.getStreamingBatchIntervalInSec())));
    JavaInputDStream<MessageAndMetadata<String, byte[]>> dStream = buildInputDStream(streamingContext);
    JavaPairDStream<String, byte[]> pairDStream = dStream.mapToPair(km -> new Tuple2<>(km.key(), km.message()));

    pairDStream.foreachRDD(new ProcessStreamingData<>(config)); // process data
    dStream.foreachRDD(new UpdateOffsetsFn<>(config.getKafkaGroupId(), config.getZkOffsetManager()));
    streamingContext.start();
}
 
Example #3
Source File: SparkStreamDemo.java    From sparkResearch with Apache License 2.0 6 votes vote down vote up
public static void main(String[] args) {
    //创建两个核心的本地线程,批处理的间隔为1秒
    SparkConf conf = new SparkConf().setMaster("local[2]").setAppName("sparkStreamIng");
    JavaStreamingContext javaStreamingContext = new JavaStreamingContext(conf, Durations.seconds(1));
    //创建一个连接到IP:localhost,PORT:8080的DStream
    JavaReceiverInputDStream<String> dStream = javaStreamingContext.socketTextStream("localhost", 8080);
    JavaDStream<String> errorLine = dStream.filter(new Function<String, Boolean>() {
        @Override
        public Boolean call(String v1) throws Exception {
            return v1.contains("error");
        }
    });
    //打印包含error的行
    errorLine.print();
    try {
        //开始计算
        javaStreamingContext.start();
        //等待计算完成
        javaStreamingContext.awaitTermination();
    } catch (InterruptedException e) {
        e.printStackTrace();
    }
}
 
Example #4
Source File: Window.java    From sparkResearch with Apache License 2.0 6 votes vote down vote up
public static void main(String[] args) {
    SparkConf sparkConf = new SparkConf().setAppName("window").setMaster("local[2]");
    JavaStreamingContext streamingContext = new JavaStreamingContext(sparkConf, Durations.seconds(10));
    //检查点设置
    streamingContext.checkpoint("hdfs://localhost:9300");

    JavaDStream<String> dStream = streamingContext.socketTextStream("localhost", 8080);

    JavaDStream<String> winDstream = dStream.window(Durations.seconds(30), Durations.seconds(20));

    JavaDStream<Long> result = winDstream.count();

    try {
        streamingContext.start();
        streamingContext.awaitTermination();
    } catch (InterruptedException e) {
        e.printStackTrace();
    }
}
 
Example #5
Source File: StreamingService.java    From cxf with Apache License 2.0 6 votes vote down vote up
private void processStreamOneWay(List<String> inputStrings) {
    try {
        SparkConf sparkConf = new SparkConf().setMaster("local[*]")
            .setAppName("JAX-RS Spark Connect OneWay " + SparkUtils.getRandomId());
        JavaStreamingContext jssc = new JavaStreamingContext(sparkConf, Durations.seconds(1));

        JavaDStream<String> receiverStream = null;
        if ("queue".equals(receiverType)) {
            Queue<JavaRDD<String>> rddQueue = new LinkedList<>();
            for (int i = 0; i < 30; i++) {
                rddQueue.add(jssc.sparkContext().parallelize(inputStrings));
            }
            receiverStream = jssc.queueStream(rddQueue);
        } else {
            receiverStream = jssc.receiverStream(new StringListReceiver(inputStrings));
        }

        JavaPairDStream<String, Integer> wordCounts = SparkUtils.createOutputDStream(receiverStream, false);
        wordCounts.foreachRDD(new PrintOutputFunction(jssc));
        jssc.start();
    } catch (Exception ex) {
        // ignore
    }
}
 
Example #6
Source File: ReduceByKeyAndWindow.java    From sparkResearch with Apache License 2.0 6 votes vote down vote up
public static void main(String[] args) {
    SparkConf sparkConf = new SparkConf().setAppName("reduceByKeyAndWindow").setMaster("local[2]");
    JavaStreamingContext streamingContext = new JavaStreamingContext(sparkConf, Durations.seconds(10));
    //检查点设置
    streamingContext.checkpoint("hdfs://localhost:9300");
    //数据源
    JavaDStream<String> dStream = streamingContext.socketTextStream("localhost", 8080);

    JavaPairDStream<String, Long> ipPairDstream = dStream.mapToPair(new GetIp());

    JavaPairDStream<String, Long> result = ipPairDstream.reduceByKeyAndWindow(new AddLongs(),
            new SubtractLongs(), Durations.seconds(30), Durations.seconds(10));

    try {
        streamingContext.start();
        streamingContext.awaitTermination();
    } catch (InterruptedException e) {
        e.printStackTrace();
    }
}
 
Example #7
Source File: StreamingProcessor.java    From lambda-arch with Apache License 2.0 6 votes vote down vote up
private JavaDStream<IoTData> getVehicleNotProcessed(JavaDStream<IoTData> nonFilteredIotDataStream) {
    //We need filtered stream for total and traffic data calculation
    JavaPairDStream<String, IoTData> iotDataPairStream = nonFilteredIotDataStream
            .mapToPair(iot -> new Tuple2<>(iot.getVehicleId(), iot))
            .reduceByKey((a, b) -> a);

    // Check vehicle Id is already processed
    JavaMapWithStateDStream<String, IoTData, Boolean, Tuple2<IoTData, Boolean>> iotDStreamWithStatePairs =
            iotDataPairStream
                    .mapWithState(
                            StateSpec.function(processedVehicleFunc).timeout(Durations.seconds(3600))
                    );//maintain state for one hour

    // Filter processed vehicle ids and keep un-processed
    JavaDStream<Tuple2<IoTData, Boolean>> filteredIotDStreams = iotDStreamWithStatePairs
            .filter(tuple -> tuple._2.equals(Boolean.FALSE));

    // Get stream of IoTdata
    return filteredIotDStreams.map(tuple -> tuple._1);
}
 
Example #8
Source File: StreamingIngestionFileSystemTextFileToDataframeMultipleClassesApp.java    From net.jgp.labs.spark with Apache License 2.0 6 votes vote down vote up
private void start() {
  // Create a local StreamingContext with two working thread and batch
  // interval of
  // 1 second
  SparkConf conf = new SparkConf().setMaster("local[2]").setAppName(
      "Streaming Ingestion File System Text File to Dataframe");
  JavaStreamingContext jssc = new JavaStreamingContext(conf, Durations
      .seconds(5));

  JavaDStream<String> msgDataStream = jssc.textFileStream(StreamingUtils
      .getInputDirectory());

  msgDataStream.print();
  // Create JavaRDD<Row>
  msgDataStream.foreachRDD(new RowProcessor());

  jssc.start();
  try {
    jssc.awaitTermination();
  } catch (InterruptedException e) {
    // TODO Auto-generated catch block
    e.printStackTrace();
  }
}
 
Example #9
Source File: IoTTrafficDataProcessor.java    From iot-traffic-monitor with Apache License 2.0 6 votes vote down vote up
/**
 * Method to get window traffic counts of different type of vehicles for each route.
 * Window duration = 30 seconds and Slide interval = 10 seconds
 * 
 * @param filteredIotDataStream IoT data stream
 */
public void processWindowTrafficData(JavaDStream<IoTData> filteredIotDataStream) {

	// reduce by key and window (30 sec window and 10 sec slide).
	JavaPairDStream<AggregateKey, Long> countDStreamPair = filteredIotDataStream
			.mapToPair(iot -> new Tuple2<>(new AggregateKey(iot.getRouteId(), iot.getVehicleType()), 1L))
			.reduceByKeyAndWindow((a, b) -> a + b, Durations.seconds(30), Durations.seconds(10));

	// Transform to dstream of TrafficData
	JavaDStream<WindowTrafficData> trafficDStream = countDStreamPair.map(windowTrafficDataFunc);

	// Map Cassandra table column
	Map<String, String> columnNameMappings = new HashMap<String, String>();
	columnNameMappings.put("routeId", "routeid");
	columnNameMappings.put("vehicleType", "vehicletype");
	columnNameMappings.put("totalCount", "totalcount");
	columnNameMappings.put("timeStamp", "timestamp");
	columnNameMappings.put("recordDate", "recorddate");

	// call CassandraStreamingJavaUtil function to save in DB
	javaFunctions(trafficDStream).writerBuilder("traffickeyspace", "window_traffic",
			CassandraJavaUtil.mapToRow(WindowTrafficData.class, columnNameMappings)).saveToCassandra();
}
 
Example #10
Source File: StreamingIngestionFileSystemTextFileApp.java    From net.jgp.labs.spark with Apache License 2.0 6 votes vote down vote up
private void start() {
  // Create a local StreamingContext with two working thread and batch
  // interval of
  // 1 second
  SparkConf conf = new SparkConf().setMaster("local[2]").setAppName(
      "NetworkWordCount");
  JavaStreamingContext jssc = new JavaStreamingContext(conf, Durations
      .seconds(5));

  JavaDStream<String> msgDataStream = jssc.textFileStream(StreamingUtils
      .getInputDirectory());
  msgDataStream.print();

  jssc.start();
  try {
    jssc.awaitTermination();
  } catch (InterruptedException e) {
    // TODO Auto-generated catch block
    e.printStackTrace();
  }
}
 
Example #11
Source File: StreamingContextConfigurationTests.java    From Decision with Apache License 2.0 5 votes vote down vote up
@Before
public void setUp() throws Exception {
    System.clearProperty("spark.driver,port");
    System.clearProperty("spark.hostPort");

    conf = new SparkConf().setMaster("local[4]").setAppName("magic");
    ssc = new JavaStreamingContext(conf, Durations.seconds(1));
    //        TODO simulate a configurationContext
    sc = streamingContextConfiguration.streamingContext();
    //sc.start();
    //        sc.ssc().conf();
    //        sc.start();
    //        ssc.start();

}
 
Example #12
Source File: WordCountSocketStateful.java    From Apache-Spark-2x-for-Java-Developers with MIT License 5 votes vote down vote up
public static void main(String[] args) throws Exception {
 System.setProperty("hadoop.home.dir", "E:\\hadoop");

   SparkConf sparkConf = new SparkConf().setAppName("WordCountSocketEx").setMaster("local[*]");
   JavaStreamingContext streamingContext = new JavaStreamingContext(sparkConf, Durations.seconds(1));
   streamingContext.checkpoint("E:\\hadoop\\checkpoint");
// Initial state RDD input to mapWithState
   @SuppressWarnings("unchecked")
   List<Tuple2<String, Integer>> tuples =Arrays.asList(new Tuple2<>("hello", 1), new Tuple2<>("world", 1));
   JavaPairRDD<String, Integer> initialRDD = streamingContext.sparkContext().parallelizePairs(tuples);
   
   JavaReceiverInputDStream<String> StreamingLines = streamingContext.socketTextStream( "10.0.75.1", Integer.parseInt("9000"), StorageLevels.MEMORY_AND_DISK_SER);
   
   JavaDStream<String> words = StreamingLines.flatMap( str -> Arrays.asList(str.split(" ")).iterator() );
  
   JavaPairDStream<String, Integer> wordCounts = words.mapToPair(str-> new Tuple2<>(str, 1)).reduceByKey((count1,count2) ->count1+count2 );
  


  // Update the cumulative count function
  Function3<String, Optional<Integer>, State<Integer>, Tuple2<String, Integer>> mappingFunc =
      new Function3<String, Optional<Integer>, State<Integer>, Tuple2<String, Integer>>() {
        @Override
        public Tuple2<String, Integer> call(String word, Optional<Integer> one,
            State<Integer> state) {
          int sum = one.orElse(0) + (state.exists() ? state.get() : 0);
          Tuple2<String, Integer> output = new Tuple2<>(word, sum);
          state.update(sum);
          return output;
        }
      };

  // DStream made of get cumulative counts that get updated in every batch
  JavaMapWithStateDStream<String, Integer, Integer, Tuple2<String, Integer>> stateDstream = wordCounts.mapWithState(StateSpec.function(mappingFunc).initialState(initialRDD));

  stateDstream.print();
  streamingContext.start();
  streamingContext.awaitTermination();
}
 
Example #13
Source File: WordCountSocketJava8Ex.java    From Apache-Spark-2x-for-Java-Developers with MIT License 5 votes vote down vote up
public static void main(String[] args) throws Exception {
 
     System.setProperty("hadoop.home.dir", "E:\\hadoop");
	
  SparkConf sparkConf = new SparkConf().setAppName("WordCountSocketEx").setMaster("local[*]");
  JavaStreamingContext streamingContext = new JavaStreamingContext(sparkConf, Durations.seconds(1));
  
  List<Tuple2<String, Integer>> tuples = Arrays.asList(new Tuple2<>("hello", 10), new Tuple2<>("world", 10));
  JavaPairRDD<String, Integer> initialRDD = streamingContext.sparkContext().parallelizePairs(tuples);
    

  JavaReceiverInputDStream<String> StreamingLines = streamingContext.socketTextStream( "10.0.75.1", Integer.parseInt("9000"), StorageLevels.MEMORY_AND_DISK_SER);
  
  JavaDStream<String> words = StreamingLines.flatMap( str -> Arrays.asList(str.split(" ")).iterator() );
 
  JavaPairDStream<String, Integer> wordCounts = words.mapToPair(str-> new Tuple2<>(str, 1)).reduceByKey((count1,count2) ->count1+count2 );
 
  wordCounts.print();
  
JavaPairDStream<String, Integer> joinedDstream = wordCounts.transformToPair(
   new Function<JavaPairRDD<String, Integer>, JavaPairRDD<String, Integer>>() {
	    @Override public JavaPairRDD<String, Integer> call(JavaPairRDD<String, Integer> rdd) throws Exception {
	    	rdd.join(initialRDD).mapToPair(new PairFunction<Tuple2<String,Tuple2<Integer,Integer>>, String, Integer>() {
				@Override
				public Tuple2<String, Integer> call(Tuple2<String, Tuple2<Integer, Integer>> joinedTuple)
						throws Exception {
					// TODO Auto-generated method stub
					return new Tuple2<>( joinedTuple._1(), (joinedTuple._2()._1()+joinedTuple._2()._2()) );
				}
			});
		
		return rdd; 				     
	    }
	  });
 
joinedDstream.print();
  streamingContext.start();
  streamingContext.awaitTermination();
}
 
Example #14
Source File: WordCountRecoverableEx.java    From Apache-Spark-2x-for-Java-Developers with MIT License 5 votes vote down vote up
protected static JavaStreamingContext createContext(String ip, int port, String checkpointDirectory) {
	SparkConf sparkConf = new SparkConf().setAppName("WordCountRecoverableEx").setMaster("local[*]");
	JavaStreamingContext streamingContext = new JavaStreamingContext(sparkConf, Durations.seconds(1));
	streamingContext.checkpoint(checkpointDirectory);
	// Initial state RDD input to mapWithState
	@SuppressWarnings("unchecked")
	List<Tuple2<String, Integer>> tuples = Arrays.asList(new Tuple2<>("hello", 1), new Tuple2<>("world", 1));
	JavaPairRDD<String, Integer> initialRDD = streamingContext.sparkContext().parallelizePairs(tuples);

	JavaReceiverInputDStream<String> StreamingLines = streamingContext.socketTextStream(ip,port, StorageLevels.MEMORY_AND_DISK_SER);

	JavaDStream<String> words = StreamingLines.flatMap(str -> Arrays.asList(str.split(" ")).iterator());

	JavaPairDStream<String, Integer> wordCounts = words.mapToPair(str -> new Tuple2<>(str, 1))
			.reduceByKey((count1, count2) -> count1 + count2);

	// Update the cumulative count function
	Function3<String, Optional<Integer>, State<Integer>, Tuple2<String, Integer>> mappingFunc = new Function3<String, Optional<Integer>, State<Integer>, Tuple2<String, Integer>>() {
		@Override
		public Tuple2<String, Integer> call(String word, Optional<Integer> one, State<Integer> state) {
			int sum = one.orElse(0) + (state.exists() ? state.get() : 0);
			Tuple2<String, Integer> output = new Tuple2<>(word, sum);
			state.update(sum);
			return output;
		}
	};

	// DStream made of get cumulative counts that get updated in every batch
	JavaMapWithStateDStream<String, Integer, Integer, Tuple2<String, Integer>> stateDstream = wordCounts
			.mapWithState(StateSpec.function(mappingFunc).initialState(initialRDD));

	stateDstream.print();
	return streamingContext;
}
 
Example #15
Source File: IoTTrafficDataProcessor.java    From iot-traffic-monitor with Apache License 2.0 5 votes vote down vote up
/**
 * Method to get total traffic counts of different type of vehicles for each route.
 * 
 * @param filteredIotDataStream IoT data stream
 */
public void processTotalTrafficData(JavaDStream<IoTData> filteredIotDataStream) {

	// We need to get count of vehicle group by routeId and vehicleType
	JavaPairDStream<AggregateKey, Long> countDStreamPair = filteredIotDataStream
			.mapToPair(iot -> new Tuple2<>(new AggregateKey(iot.getRouteId(), iot.getVehicleType()), 1L))
			.reduceByKey((a, b) -> a + b);
	
	// Need to keep state for total count
	JavaMapWithStateDStream<AggregateKey, Long, Long, Tuple2<AggregateKey, Long>> countDStreamWithStatePair = countDStreamPair
			.mapWithState(StateSpec.function(totalSumFunc).timeout(Durations.seconds(3600)));//maintain state for one hour

	// Transform to dstream of TrafficData
	JavaDStream<Tuple2<AggregateKey, Long>> countDStream = countDStreamWithStatePair.map(tuple2 -> tuple2);
	JavaDStream<TotalTrafficData> trafficDStream = countDStream.map(totalTrafficDataFunc);

	// Map Cassandra table column
	Map<String, String> columnNameMappings = new HashMap<String, String>();
	columnNameMappings.put("routeId", "routeid");
	columnNameMappings.put("vehicleType", "vehicletype");
	columnNameMappings.put("totalCount", "totalcount");
	columnNameMappings.put("timeStamp", "timestamp");
	columnNameMappings.put("recordDate", "recorddate");

	// call CassandraStreamingJavaUtil function to save in DB
	javaFunctions(trafficDStream).writerBuilder("traffickeyspace", "total_traffic",
			CassandraJavaUtil.mapToRow(TotalTrafficData.class, columnNameMappings)).saveToCassandra();
}
 
Example #16
Source File: SparkBatchProcessingTest.java    From OSTMap with Apache License 2.0 5 votes vote down vote up
private static StreamingContext createSparkStreamingContext(){
    SparkConf conf = new SparkConf()
            .setAppName("Spark Batch Processing Test")
            .set("spark.serializer", KryoSerializer.class.getCanonicalName())
            .set("spark.eventLog.enabled", "true");
    return new StreamingContext(conf, Durations.seconds(15));
}
 
Example #17
Source File: ComputeStreamingResponse.java    From incubator-retired-pirk with Apache License 2.0 5 votes vote down vote up
/**
 * Method to perform the query given an input JavaDStream of JSON
 * 
 */
public void performQuery(JavaDStream<MapWritable> input)
{
  logger.info("Performing query: ");

  // Process non-overlapping windows of data of duration windowLength seconds
  // If we are using queue streams, there is no need to window
  if (!useQueueStream)
  {
    input.window(Durations.seconds(windowLength), Durations.seconds(windowLength));
  }

  // Extract the selectors for each dataElement based upon the query type
  // and perform a keyed hash of the selectors
  JavaPairDStream<Integer,List<BigInteger>> selectorHashToDocRDD = input.mapToPair(new HashSelectorsAndPartitionData(bVars));

  // Group by hashed selector (row) -- can combine with the line above, separating for testing and benchmarking...
  JavaPairDStream<Integer,Iterable<List<BigInteger>>> selectorGroupRDD = selectorHashToDocRDD.groupByKey();

  // Calculate the encrypted row values for each row, emit <colNum, colVal> for each row
  JavaPairDStream<Long,BigInteger> encRowRDD = selectorGroupRDD.flatMapToPair(new EncRowCalc(accum, bVars));

  // Multiply the column values by colNum: emit <colNum, finalColVal> and write the final result object
  encryptedColumnCalc(encRowRDD);

  // Start the streaming computation
  start();
}
 
Example #18
Source File: SparkStreaming.java    From kafka-spark-avro-example with Apache License 2.0 5 votes vote down vote up
public static void main(String... args) {
  SparkConf conf = new SparkConf();
  conf.setMaster("local[2]");
  conf.setAppName("Spark Streaming Test Java");

  JavaSparkContext sc = new JavaSparkContext(conf);
  JavaStreamingContext ssc = new JavaStreamingContext(sc, Durations.seconds(10));

  processStream(ssc, sc);

  ssc.start();
  ssc.awaitTermination();
}
 
Example #19
Source File: WordCountTransformOpEx.java    From Apache-Spark-2x-for-Java-Developers with MIT License 5 votes vote down vote up
public static void main(String[] args) throws Exception {
  
      System.setProperty("hadoop.home.dir", "E:\\hadoop");
	
   SparkConf sparkConf = new SparkConf().setAppName("WordCountSocketEx").setMaster("local[*]");
   JavaStreamingContext streamingContext = new JavaStreamingContext(sparkConf, Durations.seconds(1));
   Logger rootLogger = LogManager.getRootLogger();
 		rootLogger.setLevel(Level.WARN); 
   List<Tuple2<String, Integer>> tuples = Arrays.asList(new Tuple2<>("hello", 10), new Tuple2<>("world", 10));
   JavaPairRDD<String, Integer> initialRDD = streamingContext.sparkContext().parallelizePairs(tuples);
	    

   JavaReceiverInputDStream<String> StreamingLines = streamingContext.socketTextStream( "10.0.75.1", Integer.parseInt("9000"), StorageLevels.MEMORY_AND_DISK_SER);
   
   JavaDStream<String> words = StreamingLines.flatMap( str -> Arrays.asList(str.split(" ")).iterator() );
  
   JavaPairDStream<String, Integer> wordCounts = words.mapToPair(str-> new Tuple2<>(str, 1)).reduceByKey((count1,count2) ->count1+count2 );
  
   wordCounts.print();
   
JavaPairDStream<String, Integer> joinedDstream = wordCounts
		.transformToPair(new Function<JavaPairRDD<String, Integer>, JavaPairRDD<String, Integer>>() {
			@Override
			public JavaPairRDD<String, Integer> call(JavaPairRDD<String, Integer> rdd) throws Exception {
				JavaPairRDD<String, Integer> modRDD = rdd.join(initialRDD).mapToPair(
						new PairFunction<Tuple2<String, Tuple2<Integer, Integer>>, String, Integer>() {
							@Override
							public Tuple2<String, Integer> call(
									Tuple2<String, Tuple2<Integer, Integer>> joinedTuple) throws Exception {
								return new Tuple2<>(joinedTuple._1(),(joinedTuple._2()._1() + joinedTuple._2()._2()));
							}
						});
				return modRDD;
			}
		});

   joinedDstream.print();
   streamingContext.start();
   streamingContext.awaitTermination();
 }
 
Example #20
Source File: StreamingService.java    From cxf with Apache License 2.0 5 votes vote down vote up
private void processStream(AsyncResponse async, List<String> inputStrings) {
    try {
        SparkConf sparkConf = new SparkConf().setMaster("local[*]")
            .setAppName("JAX-RS Spark Connect " + SparkUtils.getRandomId());
        JavaStreamingContext jssc = new JavaStreamingContext(sparkConf, Durations.seconds(1));

        SparkStreamingOutput streamOut = new SparkStreamingOutput(jssc);
        SparkStreamingListener sparkListener = new SparkStreamingListener(streamOut);
        jssc.addStreamingListener(sparkListener);

        JavaDStream<String> receiverStream = null;
        if ("queue".equals(receiverType)) {
            Queue<JavaRDD<String>> rddQueue = new LinkedList<>();
            for (int i = 0; i < 30; i++) {
                rddQueue.add(jssc.sparkContext().parallelize(inputStrings));
            }
            receiverStream = jssc.queueStream(rddQueue);
        } else {
            receiverStream = jssc.receiverStream(new StringListReceiver(inputStrings));
        }

        JavaPairDStream<String, Integer> wordCounts = SparkUtils.createOutputDStream(receiverStream, false);
        wordCounts.foreachRDD(new OutputFunction(streamOut));
        jssc.start();

        executor.execute(new SparkJob(async, sparkListener));
    } catch (Exception ex) {
        // the compiler does not allow to catch SparkException directly
        if (ex instanceof SparkException) {
            async.cancel(60);
        } else {
            async.resume(new WebApplicationException(ex));
        }
    }
}
 
Example #21
Source File: SparkConsume.java    From kafka-streams-api-websockets with Apache License 2.0 5 votes vote down vote up
public static void main(String[] args) throws InterruptedException {
    Map<String, Object> kafkaParams = new HashMap<>();
    kafkaParams.put("bootstrap.servers", "localhost:9092");
    kafkaParams.put("key.deserializer", StringDeserializer.class);
    kafkaParams.put("value.deserializer", StringDeserializer.class);
    kafkaParams.put("group.id", "use_a_separate_group_id_for_each_stream");
    kafkaParams.put("auto.offset.reset", "latest");
    kafkaParams.put("enable.auto.commit", false);

    Collection<String> topics = Arrays.asList("data-in");

    SparkConf sparkConf = new SparkConf().setAppName("JavaKafkaSpark");
    JavaStreamingContext streamingContext = new JavaStreamingContext(sparkConf, Durations.seconds(5));

    final JavaInputDStream<ConsumerRecord<String, String>> stream =
            KafkaUtils.createDirectStream(
                    streamingContext,
                    LocationStrategies.PreferConsistent(),
                    ConsumerStrategies.<String, String>Subscribe(topics, kafkaParams)
            );

    JavaPairDStream<String, Integer>  countOfMessageKeys = stream
            .map((ConsumerRecord<String, String> record) -> record.key())
            .mapToPair((String s) -> new Tuple2<>(s, 1))
            .reduceByKey((Integer i1, Integer i2)-> i1 + i2);

    countOfMessageKeys.print();

    // Start the computation
    streamingContext.start();
    streamingContext.awaitTermination();
}
 
Example #22
Source File: RealtimeTrafficDataProcessor.java    From lambda-arch with Apache License 2.0 5 votes vote down vote up
/**
 * Method to get window traffic counts of different type of vehicles for each route.
 * Window duration = 30 seconds and Slide interval = 10 seconds
 *
 * @param filteredIotDataStream IoT data stream
 */
public void processWindowTrafficData(JavaDStream<IoTData> filteredIotDataStream) {

    // reduce by key and window (30 sec window and 10 sec slide).
    JavaPairDStream<AggregateKey, Long> countDStreamPair = filteredIotDataStream
            .mapToPair(iot -> new Tuple2<>(
                    new AggregateKey(iot.getRouteId(), iot.getVehicleType()),
                    1L
            ))
            .reduceByKeyAndWindow((a, b) -> a + b,
                    Durations.seconds(30),
                    Durations.seconds(10)
            );

    // Transform to dstream of TrafficData
    JavaDStream<WindowTrafficData> trafficDStream = countDStreamPair.map(windowTrafficDataFunc);

    // Map Cassandra table column
    Map<String, String> columnNameMappings = new HashMap<String, String>();
    columnNameMappings.put("routeId", "routeid");
    columnNameMappings.put("vehicleType", "vehicletype");
    columnNameMappings.put("totalCount", "totalcount");
    columnNameMappings.put("timeStamp", "timestamp");
    columnNameMappings.put("recordDate", "recorddate");

    // call CassandraStreamingJavaUtil function to save in DB
    javaFunctions(trafficDStream).writerBuilder(
            "traffickeyspace",
            "window_traffic",
            CassandraJavaUtil.mapToRow(WindowTrafficData.class, columnNameMappings)
    ).saveToCassandra();
}
 
Example #23
Source File: FileStreamingEx.java    From Apache-Spark-2x-for-Java-Developers with MIT License 5 votes vote down vote up
public static void main(String[] args) {
   	//Window Specific property if Hadoop is not instaalled or HADOOP_HOME is not set
	 System.setProperty("hadoop.home.dir", "E:\\hadoop");
   	//Logger rootLogger = LogManager.getRootLogger();
  		//rootLogger.setLevel(Level.WARN); 
       SparkConf conf = new SparkConf().setAppName("KafkaExample").setMaster("local[*]");
       String inputDirectory="E:\\hadoop\\streamFolder\\";
    
       JavaSparkContext sc = new JavaSparkContext(conf);
       JavaStreamingContext streamingContext = new JavaStreamingContext(sc, Durations.seconds(1));
      // streamingContext.checkpoint("E:\\hadoop\\checkpoint");
       Logger rootLogger = LogManager.getRootLogger();
  		rootLogger.setLevel(Level.WARN); 
  		
  		JavaDStream<String> streamfile = streamingContext.textFileStream(inputDirectory);
  		streamfile.print();
  		streamfile.foreachRDD(rdd-> rdd.foreach(x -> System.out.println(x)));
  		
  			   		
  		JavaPairDStream<LongWritable, Text> streamedFile = streamingContext.fileStream(inputDirectory, LongWritable.class, Text.class, TextInputFormat.class);
  	 streamedFile.print();
  		
  	 streamingContext.start();
  	 

       try {
		streamingContext.awaitTermination();
	} catch (InterruptedException e) {
		// TODO Auto-generated catch block
		e.printStackTrace();
	}
}
 
Example #24
Source File: RealtimeTrafficDataProcessor.java    From lambda-arch with Apache License 2.0 5 votes vote down vote up
/**
 * Method to get total traffic counts of different type of vehicles for each route.
 *
 * @param filteredIotDataStream IoT data stream
 */
public void processTotalTrafficData(JavaDStream<IoTData> filteredIotDataStream) {

    // We need to get count of vehicle group by routeId and vehicleType
    JavaPairDStream<AggregateKey, Long> countDStreamPair = filteredIotDataStream
            .mapToPair(iot -> new Tuple2<>(new AggregateKey(iot.getRouteId(), iot.getVehicleType()), 1L))
            .reduceByKey((a, b) -> a + b);

    // Need to keep state for total count
    StateSpec<AggregateKey, Long, Long, Tuple2<AggregateKey, Long>> stateSpec =
            StateSpec.function(totalSumFunc).timeout(Durations.seconds(3600));

    JavaMapWithStateDStream<AggregateKey, Long, Long, Tuple2<AggregateKey, Long>> countDStreamWithStatePair =
            countDStreamPair.mapWithState(stateSpec);//maintain state for one hour

    // Transform to dstream of TrafficData
    JavaDStream<Tuple2<AggregateKey, Long>> countDStream = countDStreamWithStatePair.map(tuple2 -> tuple2);
    JavaDStream<TotalTrafficData> trafficDStream = countDStream.map(totalTrafficDataFunc);

    // Map Cassandra table column
    Map<String, String> columnNameMappings = new HashMap<String, String>();
    columnNameMappings.put("routeId", "routeid");
    columnNameMappings.put("vehicleType", "vehicletype");
    columnNameMappings.put("totalCount", "totalcount");
    columnNameMappings.put("timeStamp", "timestamp");
    columnNameMappings.put("recordDate", "recorddate");

    // call CassandraStreamingJavaUtil function to save in DB
    javaFunctions(trafficDStream).writerBuilder(
            "traffickeyspace",
            "total_traffic",
            CassandraJavaUtil.mapToRow(TotalTrafficData.class, columnNameMappings)
    ).saveToCassandra();
}
 
Example #25
Source File: RealTimeHeatMapProcessor.java    From lambda-arch with Apache License 2.0 5 votes vote down vote up
/**
 * Reduces the dataset by counting the number of measurements for a specific grid box (rounded coordinate)
 *
 * @param measurements | The dataset of measurements
 * @return A set of tuples linking rounded coordinates to their number of occurrences
 */
private JavaPairDStream<Coordinate, Integer> countPerGridBox(JavaDStream<Measurement> measurements) {
    // reduce by key and window (30 sec window and 10 sec slide).
    return measurements.mapToPair(
            measurement -> new Tuple2<>(
                    measurement.getRoundedCoordinate(),
                    1
            )
    ).reduceByKeyAndWindow((a, b) -> a + b,
            Durations.seconds(30),
            Durations.seconds(10)
    );
}
 
Example #26
Source File: JavaDirectTalosStreamSuite.java    From galaxy-sdk-java with Apache License 2.0 5 votes vote down vote up
@Before
public void setUp() {
    SparkConf sparkConf = new SparkConf()
            .setMaster("local[4]").setAppName(this.getClass().getSimpleName());
    ssc = new JavaStreamingContext(sparkConf, Durations.milliseconds(200));
    talosTestUtils = new TalosTestUtils(uri(), new HashMap<String, String>() {{
        put("auto.offset.reset", "smallest");
    }});
    talosTestUtils.deleteTopic(topic);
    talosTestUtils.createTopic(topic, 8);
}
 
Example #27
Source File: StreamingProcessor.java    From lambda-arch with Apache License 2.0 5 votes vote down vote up
private void start() throws Exception {
        Properties prop = PropertyFileReader.readPropertyFile(file);
        Map<String, Object> kafkaProperties = getKafkaParams(prop);
        String[] jars = {
        };
        SparkConf conf = getSparkConf(prop, jars);

        //batch interval of 5 seconds for incoming stream
        JavaStreamingContext jssc = new JavaStreamingContext(conf, Durations.seconds(5));
        final SparkSession sparkSession = SparkSession.builder().config(conf).getOrCreate();
        jssc.checkpoint(prop.getProperty("com.iot.app.spark.checkpoint.dir"));
        Map<TopicPartition, Long> lastOffSet = getLatestOffSet(sparkSession, prop);
//        Map<TopicPartition, Long> lastOffSet = Collections.EMPTY_MAP;
        JavaInputDStream<ConsumerRecord<String, IoTData>> directKafkaStream = getStream(prop, jssc, kafkaProperties, lastOffSet);

        logger.info("Starting Stream Processing");

        JavaDStream<IoTData> transformedStream = directKafkaStream.transform(item -> {
            return getEnhancedObjWithKafkaInfo(item);
        });

        processStream(prop, jssc, sparkSession, transformedStream);
        commitOffset(directKafkaStream);

        jssc.start();
        jssc.awaitTermination();
    }
 
Example #28
Source File: StreamingIngestionFileSystemTextFileToDataframeApp.java    From net.jgp.labs.spark with Apache License 2.0 4 votes vote down vote up
private void start() {
  // Create a local StreamingContext with two working thread and batch
  // interval of
  // 1 second
  SparkConf conf = new SparkConf().setMaster("local[2]").setAppName(
      "Streaming Ingestion File System Text File to Dataframe");
  JavaStreamingContext jssc = new JavaStreamingContext(conf, Durations
      .seconds(5));

  JavaDStream<String> msgDataStream = jssc.textFileStream(StreamingUtils
      .getInputDirectory());

  msgDataStream.print();
  // Create JavaRDD<Row>
  msgDataStream.foreachRDD(new VoidFunction<JavaRDD<String>>() {
    private static final long serialVersionUID = -590010339928376829L;

    @Override
    public void call(JavaRDD<String> rdd) {
      JavaRDD<Row> rowRDD = rdd.map(new Function<String, Row>() {
        private static final long serialVersionUID = 5167089361335095997L;

        @Override
        public Row call(String msg) {
          Row row = RowFactory.create(msg);
          return row;
        }
      });
      // Create Schema
      StructType schema = DataTypes.createStructType(
          new StructField[] { DataTypes.createStructField("Message",
              DataTypes.StringType, true) });

      // Get Spark 2.0 session
      SparkSession spark = JavaSparkSessionSingleton.getInstance(rdd.context()
          .getConf());
      Dataset<Row> msgDataFrame = spark.createDataFrame(rowRDD, schema);
      msgDataFrame.show();
    }
  });

  jssc.start();
  try {
    jssc.awaitTermination();
  } catch (InterruptedException e) {
    // TODO Auto-generated catch block
    e.printStackTrace();
  }
}
 
Example #29
Source File: WordCountingAppWithCheckpoint.java    From tutorials with MIT License 4 votes vote down vote up
public static void main(String[] args) throws InterruptedException {

        Logger.getLogger("org")
            .setLevel(Level.OFF);
        Logger.getLogger("akka")
            .setLevel(Level.OFF);

        Map<String, Object> kafkaParams = new HashMap<>();
        kafkaParams.put("bootstrap.servers", "localhost:9092");
        kafkaParams.put("key.deserializer", StringDeserializer.class);
        kafkaParams.put("value.deserializer", StringDeserializer.class);
        kafkaParams.put("group.id", "use_a_separate_group_id_for_each_stream");
        kafkaParams.put("auto.offset.reset", "latest");
        kafkaParams.put("enable.auto.commit", false);

        Collection<String> topics = Arrays.asList("messages");

        SparkConf sparkConf = new SparkConf();
        sparkConf.setMaster("local[2]");
        sparkConf.setAppName("WordCountingAppWithCheckpoint");
        sparkConf.set("spark.cassandra.connection.host", "127.0.0.1");

        JavaStreamingContext streamingContext = new JavaStreamingContext(sparkConf, Durations.seconds(1));

        sparkContext = streamingContext.sparkContext();

        streamingContext.checkpoint("./.checkpoint");

        JavaInputDStream<ConsumerRecord<String, String>> messages = KafkaUtils.createDirectStream(streamingContext, LocationStrategies.PreferConsistent(), ConsumerStrategies.<String, String> Subscribe(topics, kafkaParams));

        JavaPairDStream<String, String> results = messages.mapToPair(record -> new Tuple2<>(record.key(), record.value()));

        JavaDStream<String> lines = results.map(tuple2 -> tuple2._2());

        JavaDStream<String> words = lines.flatMap(x -> Arrays.asList(x.split("\\s+"))
            .iterator());

        JavaPairDStream<String, Integer> wordCounts = words.mapToPair(s -> new Tuple2<>(s, 1))
            .reduceByKey((Function2<Integer, Integer, Integer>) (i1, i2) -> i1 + i2);

        JavaMapWithStateDStream<String, Integer, Integer, Tuple2<String, Integer>> cumulativeWordCounts = wordCounts.mapWithState(StateSpec.function((word, one, state) -> {
            int sum = one.orElse(0) + (state.exists() ? state.get() : 0);
            Tuple2<String, Integer> output = new Tuple2<>(word, sum);
            state.update(sum);
            return output;
        }));

        cumulativeWordCounts.foreachRDD(javaRdd -> {
            List<Tuple2<String, Integer>> wordCountList = javaRdd.collect();
            for (Tuple2<String, Integer> tuple : wordCountList) {
                List<Word> wordList = Arrays.asList(new Word(tuple._1, tuple._2));
                JavaRDD<Word> rdd = sparkContext.parallelize(wordList);
                javaFunctions(rdd).writerBuilder("vocabulary", "words", mapToRow(Word.class))
                    .saveToCassandra();
            }
        });

        streamingContext.start();
        streamingContext.awaitTermination();
    }
 
Example #30
Source File: SampleConsumer.java    From kafka-spark-consumer with Apache License 2.0 4 votes vote down vote up
@SuppressWarnings("deprecation")
private void run() {

  Properties props = new Properties();
  props.put("zookeeper.hosts", "zkhost");
  props.put("zookeeper.port", "2181");
  props.put("kafka.topic", "topicA,topicB,topicC");
  props.put("kafka.consumer.id", "kafka-consumer");
  // Optional Properties
  props.put("zookeeper.broker.path", "/brokers");
  props.put("zookeeper.consumer.path", "/consumers");
  props.put("consumer.forcefromstart", "false");
  props.put("max.poll.records", "10");
  props.put("consumer.fillfreqms", "500");
  props.put("consumer.backpressure.enabled", "true");
  //Kafka properties
  props.put("bootstrap.servers", "kafkahost-1:6667,"
          + "kafkahost-2:6667,"
          + "kafkahost-3:6667,"
          + "kafkahost-4:6667");
  props.put("security.protocol", "SSL");
  props.put("ssl.truststore.location","~/kafka-securitykafka.server.truststore.jks");
  props.put("ssl.truststore.password", "test1234");

  SparkConf _sparkConf = new SparkConf();
  JavaStreamingContext jsc = new JavaStreamingContext(_sparkConf, Durations.seconds(30));
  // Specify number of Receivers you need.
  int numberOfReceivers = 6;

  JavaDStream<MessageAndMetadata<byte[]>> unionStreams = ReceiverLauncher.launch(
      jsc, props, numberOfReceivers, StorageLevel.MEMORY_ONLY());

  unionStreams.foreachRDD(new VoidFunction<JavaRDD<MessageAndMetadata<byte[]>>>() {
    @Override
    public void call(JavaRDD<MessageAndMetadata<byte[]>> rdd) throws Exception {
      //Start Application Logic
      rdd.foreachPartition(new VoidFunction<Iterator<MessageAndMetadata<byte[]>>>() {
          @Override
          public void call(Iterator<MessageAndMetadata<byte[]>> mmItr) throws Exception {
              int countTopicA = 0;
              int countTopicB = 0;
              int countTopicC = 0;
              while(mmItr.hasNext()) {
                  MessageAndMetadata<byte[]> mm = mmItr.next();
                  if(mm.getTopic().equals("topicA")) {
                      countTopicA++;
                  }
                  else if (mm.getTopic().equals("topicB")) {
                      countTopicB++;
                  }
                  else if (mm.getTopic().equals("topicC")) {
                      countTopicC++;
                  }
              }
              System.out.println("topicA count " + countTopicA);
              System.out.println("topicB count " + countTopicB);
              System.out.println("topicC count " + countTopicC);
          }
      });
      System.out.println("RDD count " + rdd.count());
      //End Application Logic
      //commit offset
      System.out.println("Commiting Offset");
      ProcessedOffsetManager.persistsPartition(rdd, props);
    }
  });

  try {
    jsc.start();
    jsc.awaitTermination();
  }catch (Exception ex ) {
    jsc.ssc().sc().cancelAllJobs();
    jsc.stop(true, false);
    System.exit(-1);
  }
}