Java Code Examples for org.apache.spark.streaming.api.java.JavaStreamingContext#checkpoint()

The following examples show how to use org.apache.spark.streaming.api.java.JavaStreamingContext#checkpoint() . You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: KafkaStreaming.java    From sparkResearch with Apache License 2.0 8 votes vote down vote up
public static void main(String[] args) {
    SparkConf sparkConf = new SparkConf().setAppName("KafkaWordCount").setMaster("local[2]");
    JavaStreamingContext streamingContext = new JavaStreamingContext(sparkConf, Durations.seconds(10000));
    //设置检查点
    streamingContext.checkpoint("HDFS URL");
    Map<String, Integer> topicThread = new HashMap<>(1);
    topicThread.put(TOPIC, THREAD);
    JavaPairInputDStream<String, String> dStream = KafkaUtils.createStream(streamingContext, HOST, GROP, topicThread);

    JavaDStream<String> words = dStream.flatMap((FlatMapFunction<Tuple2<String, String>, String>) stringStringTuple2 -> Arrays.asList(SPACE.split(stringStringTuple2._2)).iterator());

    //统计
    JavaPairDStream<String, Integer> result = words.mapToPair((PairFunction<String, String, Integer>) s -> new Tuple2<>(s, 1)).reduceByKey((Function2<Integer, Integer, Integer>) (v1, v2) -> v1 + v2);

    try {
        result.print();
        streamingContext.start();
        streamingContext.awaitTermination();
    } catch (InterruptedException e) {
        e.printStackTrace();
    }
}
 
Example 2
Source File: Window.java    From sparkResearch with Apache License 2.0 6 votes vote down vote up
public static void main(String[] args) {
    SparkConf sparkConf = new SparkConf().setAppName("window").setMaster("local[2]");
    JavaStreamingContext streamingContext = new JavaStreamingContext(sparkConf, Durations.seconds(10));
    //检查点设置
    streamingContext.checkpoint("hdfs://localhost:9300");

    JavaDStream<String> dStream = streamingContext.socketTextStream("localhost", 8080);

    JavaDStream<String> winDstream = dStream.window(Durations.seconds(30), Durations.seconds(20));

    JavaDStream<Long> result = winDstream.count();

    try {
        streamingContext.start();
        streamingContext.awaitTermination();
    } catch (InterruptedException e) {
        e.printStackTrace();
    }
}
 
Example 3
Source File: ReduceByKeyAndWindow.java    From sparkResearch with Apache License 2.0 6 votes vote down vote up
public static void main(String[] args) {
    SparkConf sparkConf = new SparkConf().setAppName("reduceByKeyAndWindow").setMaster("local[2]");
    JavaStreamingContext streamingContext = new JavaStreamingContext(sparkConf, Durations.seconds(10));
    //检查点设置
    streamingContext.checkpoint("hdfs://localhost:9300");
    //数据源
    JavaDStream<String> dStream = streamingContext.socketTextStream("localhost", 8080);

    JavaPairDStream<String, Long> ipPairDstream = dStream.mapToPair(new GetIp());

    JavaPairDStream<String, Long> result = ipPairDstream.reduceByKeyAndWindow(new AddLongs(),
            new SubtractLongs(), Durations.seconds(30), Durations.seconds(10));

    try {
        streamingContext.start();
        streamingContext.awaitTermination();
    } catch (InterruptedException e) {
        e.printStackTrace();
    }
}
 
Example 4
Source File: StreamingRsvpsDStreamCountWindow.java    From -Data-Stream-Development-with-Apache-Spark-Kafka-and-Spring-Boot with MIT License 6 votes vote down vote up
public static void main(String[] args) throws InterruptedException {

        System.setProperty("hadoop.home.dir", HADOOP_HOME_DIR_VALUE);

        final SparkConf conf = new SparkConf()
                .setMaster(RUN_LOCAL_WITH_AVAILABLE_CORES)
                .setAppName(APPLICATION_NAME)
                .set("spark.mongodb.output.uri", MONGODB_OUTPUT_URI)
                .set("spark.streaming.kafka.consumer.cache.enabled", "false");

        final JavaStreamingContext streamingContext
                = new JavaStreamingContext(conf, new Duration(BATCH_DURATION_INTERVAL_MS));

        streamingContext.checkpoint(CHECKPOINT_FOLDER);

        final JavaInputDStream<ConsumerRecord<String, String>> meetupStream =
                KafkaUtils.createDirectStream(
                        streamingContext,
                        LocationStrategies.PreferConsistent(),
                        ConsumerStrategies.<String, String>Subscribe(TOPICS, KAFKA_CONSUMER_PROPERTIES)
                );
                
        // transformations, streaming algorithms, etc
        JavaDStream<Long> countStream  
            = meetupStream.countByWindow(
                 new Duration(WINDOW_LENGTH_MS), 
                 new Duration(SLIDING_INTERVAL_MS));        

        countStream.foreachRDD((JavaRDD<Long> countRDD) -> {                
            MongoSpark.save(        
                    countRDD.map(
                        r -> Document.parse("{\"rsvps_count\":\"" + String.valueOf(r) + "\"}")
                    )
            );            
        });
        
        // some time later, after outputs have completed
        meetupStream.foreachRDD((JavaRDD<ConsumerRecord<String, String>> meetupRDD) -> {        
            OffsetRange[] offsetRanges = ((HasOffsetRanges) meetupRDD.rdd()).offsetRanges();            

            ((CanCommitOffsets) meetupStream.inputDStream())
                .commitAsync(offsetRanges, new MeetupOffsetCommitCallback());
        });
        
        streamingContext.start();
        streamingContext.awaitTermination();    
    }
 
Example 5
Source File: SparkRunnerStreamingContextFactory.java    From beam with Apache License 2.0 6 votes vote down vote up
private void checkpoint(JavaStreamingContext jssc, CheckpointDir checkpointDir) {
  Path rootCheckpointPath = checkpointDir.getRootCheckpointDir();
  Path sparkCheckpointPath = checkpointDir.getSparkCheckpointDir();
  Path beamCheckpointPath = checkpointDir.getBeamCheckpointDir();

  try {
    FileSystem fileSystem =
        rootCheckpointPath.getFileSystem(jssc.sparkContext().hadoopConfiguration());
    if (!fileSystem.exists(rootCheckpointPath)) {
      fileSystem.mkdirs(rootCheckpointPath);
    }
    if (!fileSystem.exists(sparkCheckpointPath)) {
      fileSystem.mkdirs(sparkCheckpointPath);
    }
    if (!fileSystem.exists(beamCheckpointPath)) {
      fileSystem.mkdirs(beamCheckpointPath);
    }
  } catch (IOException e) {
    throw new RuntimeException("Failed to create checkpoint dir", e);
  }

  jssc.checkpoint(sparkCheckpointPath.toString());
}
 
Example 6
Source File: SparkStreamServiceImpl.java    From searchanalytics-bigdata with MIT License 6 votes vote down vote up
@Override
public void setup() {
	// Create a StreamingContext with a SparkConf configuration
	SparkConf sparkConf = new SparkConf(false)
			.setAppName("JaiSpark")
			.setSparkHome("target/sparkhome")
			.setMaster("local")
			.set("spark.executor.memory", "128m")
			.set("spark.local.dir",
					new File("target/sparkhome/tmp").getAbsolutePath())
			.set("spark.cores.max", "2").set("spark.akka.threads", "2")
			.set("spark.akka.timeout", "60").set("spark.logConf", "true")
			.set("spark.cleaner.delay", "3700")
			.set("spark.cleaner.ttl", "86400")
			.set("spark.shuffle.spill", "false")
			.set("spark.driver.host", "localhost")
			.set("spark.driver.port", "43214");
	jssc = new JavaStreamingContext(sparkConf, new Duration(5000));

	String checkpointDir = hadoopClusterService.getHDFSUri()
			+ "/sparkcheckpoint";
	jssc.checkpoint(checkpointDir);
	startFlumeStream();
}
 
Example 7
Source File: WordCountSocketStateful.java    From Apache-Spark-2x-for-Java-Developers with MIT License 5 votes vote down vote up
public static void main(String[] args) throws Exception {
 System.setProperty("hadoop.home.dir", "E:\\hadoop");

   SparkConf sparkConf = new SparkConf().setAppName("WordCountSocketEx").setMaster("local[*]");
   JavaStreamingContext streamingContext = new JavaStreamingContext(sparkConf, Durations.seconds(1));
   streamingContext.checkpoint("E:\\hadoop\\checkpoint");
// Initial state RDD input to mapWithState
   @SuppressWarnings("unchecked")
   List<Tuple2<String, Integer>> tuples =Arrays.asList(new Tuple2<>("hello", 1), new Tuple2<>("world", 1));
   JavaPairRDD<String, Integer> initialRDD = streamingContext.sparkContext().parallelizePairs(tuples);
   
   JavaReceiverInputDStream<String> StreamingLines = streamingContext.socketTextStream( "10.0.75.1", Integer.parseInt("9000"), StorageLevels.MEMORY_AND_DISK_SER);
   
   JavaDStream<String> words = StreamingLines.flatMap( str -> Arrays.asList(str.split(" ")).iterator() );
  
   JavaPairDStream<String, Integer> wordCounts = words.mapToPair(str-> new Tuple2<>(str, 1)).reduceByKey((count1,count2) ->count1+count2 );
  


  // Update the cumulative count function
  Function3<String, Optional<Integer>, State<Integer>, Tuple2<String, Integer>> mappingFunc =
      new Function3<String, Optional<Integer>, State<Integer>, Tuple2<String, Integer>>() {
        @Override
        public Tuple2<String, Integer> call(String word, Optional<Integer> one,
            State<Integer> state) {
          int sum = one.orElse(0) + (state.exists() ? state.get() : 0);
          Tuple2<String, Integer> output = new Tuple2<>(word, sum);
          state.update(sum);
          return output;
        }
      };

  // DStream made of get cumulative counts that get updated in every batch
  JavaMapWithStateDStream<String, Integer, Integer, Tuple2<String, Integer>> stateDstream = wordCounts.mapWithState(StateSpec.function(mappingFunc).initialState(initialRDD));

  stateDstream.print();
  streamingContext.start();
  streamingContext.awaitTermination();
}
 
Example 8
Source File: WordCountRecoverableEx.java    From Apache-Spark-2x-for-Java-Developers with MIT License 5 votes vote down vote up
protected static JavaStreamingContext createContext(String ip, int port, String checkpointDirectory) {
	SparkConf sparkConf = new SparkConf().setAppName("WordCountRecoverableEx").setMaster("local[*]");
	JavaStreamingContext streamingContext = new JavaStreamingContext(sparkConf, Durations.seconds(1));
	streamingContext.checkpoint(checkpointDirectory);
	// Initial state RDD input to mapWithState
	@SuppressWarnings("unchecked")
	List<Tuple2<String, Integer>> tuples = Arrays.asList(new Tuple2<>("hello", 1), new Tuple2<>("world", 1));
	JavaPairRDD<String, Integer> initialRDD = streamingContext.sparkContext().parallelizePairs(tuples);

	JavaReceiverInputDStream<String> StreamingLines = streamingContext.socketTextStream(ip,port, StorageLevels.MEMORY_AND_DISK_SER);

	JavaDStream<String> words = StreamingLines.flatMap(str -> Arrays.asList(str.split(" ")).iterator());

	JavaPairDStream<String, Integer> wordCounts = words.mapToPair(str -> new Tuple2<>(str, 1))
			.reduceByKey((count1, count2) -> count1 + count2);

	// Update the cumulative count function
	Function3<String, Optional<Integer>, State<Integer>, Tuple2<String, Integer>> mappingFunc = new Function3<String, Optional<Integer>, State<Integer>, Tuple2<String, Integer>>() {
		@Override
		public Tuple2<String, Integer> call(String word, Optional<Integer> one, State<Integer> state) {
			int sum = one.orElse(0) + (state.exists() ? state.get() : 0);
			Tuple2<String, Integer> output = new Tuple2<>(word, sum);
			state.update(sum);
			return output;
		}
	};

	// DStream made of get cumulative counts that get updated in every batch
	JavaMapWithStateDStream<String, Integer, Integer, Tuple2<String, Integer>> stateDstream = wordCounts
			.mapWithState(StateSpec.function(mappingFunc).initialState(initialRDD));

	stateDstream.print();
	return streamingContext;
}
 
Example 9
Source File: WindowBatchInterval.java    From Apache-Spark-2x-for-Java-Developers with MIT License 4 votes vote down vote up
public static void main(String[] args) {
   	//Window Specific property if Hadoop is not instaalled or HADOOP_HOME is not set
	 System.setProperty("hadoop.home.dir", "E:\\hadoop");
   	//Logger rootLogger = LogManager.getRootLogger();
  		//rootLogger.setLevel(Level.WARN); 
       SparkConf conf = new SparkConf().setAppName("KafkaExample").setMaster("local[*]");
       
    
       JavaSparkContext sc = new JavaSparkContext(conf);
       JavaStreamingContext streamingContext = new JavaStreamingContext(sc, Durations.minutes(2));
       streamingContext.checkpoint("E:\\hadoop\\checkpoint");
       Logger rootLogger = LogManager.getRootLogger();
  		rootLogger.setLevel(Level.WARN); 
  		
  	 List<Tuple2<String, Integer>> tuples = Arrays.asList(new Tuple2<>("hello", 10), new Tuple2<>("world", 10));
    JavaPairRDD<String, Integer> initialRDD = streamingContext.sparkContext().parallelizePairs(tuples);
		    

    JavaReceiverInputDStream<String> StreamingLines = streamingContext.socketTextStream( "10.0.75.1", Integer.parseInt("9000"), StorageLevels.MEMORY_AND_DISK_SER);
    
    JavaDStream<String> words = StreamingLines.flatMap( str -> Arrays.asList(str.split(" ")).iterator() );
   
    JavaPairDStream<String, Integer> wordCounts = words.mapToPair(str-> new Tuple2<>(str, 1)).reduceByKey((count1,count2) ->count1+count2 );
   
    wordCounts.print();
    wordCounts.window(Durations.minutes(8)).countByValue()
      .foreachRDD(tRDD -> tRDD.foreach(x->System.out.println(new Date()+" ::The window count tag is ::"+x._1() +" and the val is ::"+x._2())));
    wordCounts.window(Durations.minutes(8),Durations.minutes(2)).countByValue()
      .foreachRDD(tRDD -> tRDD.foreach(x->System.out.println(new Date()+" ::The window count tag is ::"+x._1() +" and the val is ::"+x._2())));
    wordCounts.window(Durations.minutes(12),Durations.minutes(8)).countByValue()
      .foreachRDD(tRDD -> tRDD.foreach(x->System.out.println(new Date()+" ::The window count tag is ::"+x._1() +" and the val is ::"+x._2())));
    wordCounts.window(Durations.minutes(2),Durations.minutes(2)).countByValue()
      .foreachRDD(tRDD -> tRDD.foreach(x->System.out.println(new Date()+" ::The window count tag is ::"+x._1() +" and the val is ::"+x._2())));
    wordCounts.window(Durations.minutes(12),Durations.minutes(12)).countByValue()
      .foreachRDD(tRDD -> tRDD.foreach(x->System.out.println(new Date()+" ::The window count tag is ::"+x._1() +" and the val is ::"+x._2())));
      
    //comment these two operation to make it run
    wordCounts.window(Durations.minutes(5),Durations.minutes(2)).countByValue()
      .foreachRDD(tRDD -> tRDD.foreach(x->System.out.println(new Date()+" ::The window count tag is ::"+x._1() +" and the val is ::"+x._2())));
    wordCounts.window(Durations.minutes(10),Durations.minutes(1)).countByValue()
      .foreachRDD(tRDD -> tRDD.foreach(x->System.out.println(new Date()+" ::The window count tag is ::"+x._1() +" and the val is ::"+x._2())));
      
       streamingContext.start();
       try {
		streamingContext.awaitTermination();
	} catch (InterruptedException e) {
		// TODO Auto-generated catch block
		e.printStackTrace();
	}
}
 
Example 10
Source File: KafkaExample.java    From Apache-Spark-2x-for-Java-Developers with MIT License 4 votes vote down vote up
public static void main(String[] args) {
  	//Window Specific property if Hadoop is not instaalled or HADOOP_HOME is not set
 System.setProperty("hadoop.home.dir", "E:\\hadoop");
  	//Logger rootLogger = LogManager.getRootLogger();
 		//rootLogger.setLevel(Level.WARN); 
      SparkConf conf = new SparkConf().setAppName("KafkaExample").setMaster("local[*]");    
      JavaSparkContext sc = new JavaSparkContext(conf);
      JavaStreamingContext streamingContext = new JavaStreamingContext(sc, Durations.minutes(2));
      streamingContext.checkpoint("E:\\hadoop\\checkpoint");
      Logger rootLogger = LogManager.getRootLogger();
 		rootLogger.setLevel(Level.WARN); 
      Map<String, Object> kafkaParams = new HashMap<>();
      kafkaParams.put("bootstrap.servers", "10.0.75.1:9092");
      kafkaParams.put("key.deserializer", StringDeserializer.class);
      kafkaParams.put("value.deserializer", StringDeserializer.class);
      kafkaParams.put("group.id", "use_a_separate_group_id_for_each_strea");
      kafkaParams.put("auto.offset.reset", "latest");
     // kafkaParams.put("enable.auto.commit", false);

      Collection<String> topics = Arrays.asList("mytopic", "anothertopic");

      final JavaInputDStream<ConsumerRecord<String, String>> stream = KafkaUtils.createDirectStream(streamingContext,LocationStrategies.PreferConsistent(),
      				ConsumerStrategies.<String, String>Subscribe(topics, kafkaParams));

      JavaPairDStream<String, String> pairRDD = stream.mapToPair(record-> new Tuple2<>(record.key(), record.value()));
     
      pairRDD.foreachRDD(pRDD-> { pRDD.foreach(tuple-> System.out.println(new Date()+" :: Kafka msg key ::"+tuple._1() +" the val is ::"+tuple._2()));});
     
      JavaDStream<String> tweetRDD = pairRDD.map(x-> x._2()).map(new TweetText());
      
      tweetRDD.foreachRDD(tRDD -> tRDD.foreach(x->System.out.println(new Date()+" :: "+x)));
      
     JavaDStream<String> hashtagRDD = tweetRDD.flatMap(twt-> Arrays.stream(twt.split(" ")).filter(str-> str.contains("#")).collect(Collectors.toList()).iterator() );
 
      hashtagRDD.foreachRDD(tRDD -> tRDD.foreach(x->System.out.println(x)));
      
      JavaPairDStream<String, Long> cntByVal = hashtagRDD.countByValue();
      
      cntByVal.foreachRDD(tRDD -> tRDD.foreach(x->System.out.println(new Date()+" ::The count tag is ::"+x._1() +" and the val is ::"+x._2())));
      
     /* hashtagRDD.window(Durations.seconds(60), Durations.seconds(30))
                .countByValue()
               .foreachRDD(tRDD -> tRDD.foreach(x->System.out.println(new Date()+" ::The window count tag is ::"+x._1() +" and the val is ::"+x._2())));
      
     hashtagRDD.countByValueAndWindow(Durations.seconds(60), Durations.seconds(30))
               .foreachRDD(tRDD -> tRDD.foreach(x->System.out.println("The window&count tag is ::"+x._1() +" and the val is ::"+x._2())));
      */
     hashtagRDD.window(Durations.minutes(8)).countByValue()
     .foreachRDD(tRDD -> tRDD.foreach(x->System.out.println(new Date()+" ::The window count tag is ::"+x._1() +" and the val is ::"+x._2())));
     hashtagRDD.window(Durations.minutes(8),Durations.minutes(2)).countByValue()
     .foreachRDD(tRDD -> tRDD.foreach(x->System.out.println(new Date()+" ::The window count tag is ::"+x._1() +" and the val is ::"+x._2())));
     hashtagRDD.window(Durations.minutes(12),Durations.minutes(8)).countByValue()
     .foreachRDD(tRDD -> tRDD.foreach(x->System.out.println(new Date()+" ::The window count tag is ::"+x._1() +" and the val is ::"+x._2())));
     hashtagRDD.window(Durations.minutes(2),Durations.minutes(2)).countByValue()
     .foreachRDD(tRDD -> tRDD.foreach(x->System.out.println(new Date()+" ::The window count tag is ::"+x._1() +" and the val is ::"+x._2())));
     hashtagRDD.window(Durations.minutes(12),Durations.minutes(12)).countByValue()
     .foreachRDD(tRDD -> tRDD.foreach(x->System.out.println(new Date()+" ::The window count tag is ::"+x._1() +" and the val is ::"+x._2())));
     
     /*hashtagRDD.window(Durations.minutes(5),Durations.minutes(2)).countByValue()
     .foreachRDD(tRDD -> tRDD.foreach(x->System.out.println(new Date()+" ::The window count tag is ::"+x._1() +" and the val is ::"+x._2())));*/
     /* hashtagRDD.window(Durations.minutes(10),Durations.minutes(1)).countByValue()
     .foreachRDD(tRDD -> tRDD.foreach(x->System.out.println(new Date()+" ::The window count tag is ::"+x._1() +" and the val is ::"+x._2())));*/
     
      streamingContext.start();
      try {
	streamingContext.awaitTermination();
} catch (InterruptedException e) {
	// TODO Auto-generated catch block
	e.printStackTrace();
}
  }
 
Example 11
Source File: StateFulProcessingExample.java    From Apache-Spark-2x-for-Java-Developers with MIT License 4 votes vote down vote up
public static void main(String[] args) throws InterruptedException {

		System.setProperty("hadoop.home.dir", "C:\\softwares\\Winutils");

		SparkSession sparkSession = SparkSession.builder().master("local[*]").appName("Stateful Streaming Example")
				.config("spark.sql.warehouse.dir", "file:////C:/Users/sgulati/spark-warehouse").getOrCreate();

		JavaStreamingContext jssc= new JavaStreamingContext(new JavaSparkContext(sparkSession.sparkContext()),
				Durations.milliseconds(1000));
		JavaReceiverInputDStream<String> inStream = jssc.socketTextStream("10.204.136.223", 9999);
		jssc.checkpoint("C:\\Users\\sgulati\\spark-checkpoint");

		JavaDStream<FlightDetails> flightDetailsStream = inStream.map(x -> {
			ObjectMapper mapper = new ObjectMapper();
			return mapper.readValue(x, FlightDetails.class);
		});
		
		

		JavaPairDStream<String, FlightDetails> flightDetailsPairStream = flightDetailsStream
				.mapToPair(f -> new Tuple2<String, FlightDetails>(f.getFlightId(), f));

		Function3<String, Optional<FlightDetails>, State<List<FlightDetails>>, Tuple2<String, Double>> mappingFunc = (
				flightId, curFlightDetail, state) -> {
			List<FlightDetails> details = state.exists() ? state.get() : new ArrayList<>();

			boolean isLanded = false;

			if (curFlightDetail.isPresent()) {
				details.add(curFlightDetail.get());
				if (curFlightDetail.get().isLanded()) {
					isLanded = true;
				}
			}
			Double avgSpeed = details.stream().mapToDouble(f -> f.getTemperature()).average().orElse(0.0);

			if (isLanded) {
				state.remove();
			} else {
				state.update(details);
			}
			return new Tuple2<String, Double>(flightId, avgSpeed);
		};

		JavaMapWithStateDStream<String, FlightDetails, List<FlightDetails>, Tuple2<String, Double>> streamWithState = flightDetailsPairStream
				.mapWithState(StateSpec.function(mappingFunc).timeout(Durations.minutes(5)));
		
		streamWithState.print();
		jssc.start();
		jssc.awaitTermination();
	}
 
Example 12
Source File: JavaStreamingTestExample.java    From SparkDemo with MIT License 4 votes vote down vote up
public static void main(String[] args) throws Exception {
  if (args.length != 3) {
    System.err.println("Usage: JavaStreamingTestExample " +
      "<dataDir> <batchDuration> <numBatchesTimeout>");
      System.exit(1);
  }

  String dataDir = args[0];
  Duration batchDuration = Seconds.apply(Long.parseLong(args[1]));
  int numBatchesTimeout = Integer.parseInt(args[2]);

  SparkConf conf = new SparkConf().setMaster("local").setAppName("StreamingTestExample");
  JavaStreamingContext ssc = new JavaStreamingContext(conf, batchDuration);

  ssc.checkpoint(Utils.createTempDir(System.getProperty("java.io.tmpdir"), "spark").toString());

  // $example on$
  JavaDStream<BinarySample> data = ssc.textFileStream(dataDir).map(
    new Function<String, BinarySample>() {
      @Override
      public BinarySample call(String line) {
        String[] ts = line.split(",");
        boolean label = Boolean.parseBoolean(ts[0]);
        double value = Double.parseDouble(ts[1]);
        return new BinarySample(label, value);
      }
    });

  StreamingTest streamingTest = new StreamingTest()
    .setPeacePeriod(0)
    .setWindowSize(0)
    .setTestMethod("welch");

  JavaDStream<StreamingTestResult> out = streamingTest.registerStream(data);
  out.print();
  // $example off$

  // Stop processing if test becomes significant or we time out
  timeoutCounter = numBatchesTimeout;

  out.foreachRDD(new VoidFunction<JavaRDD<StreamingTestResult>>() {
    @Override
    public void call(JavaRDD<StreamingTestResult> rdd) {
      timeoutCounter -= 1;

      boolean anySignificant = !rdd.filter(new Function<StreamingTestResult, Boolean>() {
        @Override
        public Boolean call(StreamingTestResult v) {
          return v.pValue() < 0.05;
        }
      }).isEmpty();

      if (timeoutCounter <= 0 || anySignificant) {
        rdd.context().stop();
      }
    }
  });

  ssc.start();
  ssc.awaitTermination();
}
 
Example 13
Source File: JavaRecoverableNetworkWordCount.java    From SparkDemo with MIT License 4 votes vote down vote up
private static JavaStreamingContext createContext(String ip,
                                                  int port,
                                                  String checkpointDirectory,
                                                  String outputPath) {

  // If you do not see this printed, that means the StreamingContext has been loaded
  // from the new checkpoint
  System.out.println("Creating new context");
  final File outputFile = new File(outputPath);
  if (outputFile.exists()) {
    outputFile.delete();
  }
  SparkConf sparkConf = new SparkConf().setAppName("JavaRecoverableNetworkWordCount");
  // Create the context with a 1 second batch size
  JavaStreamingContext ssc = new JavaStreamingContext(sparkConf, Durations.seconds(1));
  ssc.checkpoint(checkpointDirectory);

  // Create a socket stream on target ip:port and count the
  // words in input stream of \n delimited text (eg. generated by 'nc')
  JavaReceiverInputDStream<String> lines = ssc.socketTextStream(ip, port);
  JavaDStream<String> words = lines.flatMap(new FlatMapFunction<String, String>() {
    @Override
    public Iterator<String> call(String x) {
      return Arrays.asList(SPACE.split(x)).iterator();
    }
  });
  JavaPairDStream<String, Integer> wordCounts = words.mapToPair(
    new PairFunction<String, String, Integer>() {
      @Override
      public Tuple2<String, Integer> call(String s) {
        return new Tuple2<>(s, 1);
      }
    }).reduceByKey(new Function2<Integer, Integer, Integer>() {
      @Override
      public Integer call(Integer i1, Integer i2) {
        return i1 + i2;
      }
    });

  wordCounts.foreachRDD(new VoidFunction2<JavaPairRDD<String, Integer>, Time>() {
    @Override
    public void call(JavaPairRDD<String, Integer> rdd, Time time) throws IOException {
      // Get or register the blacklist Broadcast
      final Broadcast<List<String>> blacklist =
          JavaWordBlacklist.getInstance(new JavaSparkContext(rdd.context()));
      // Get or register the droppedWordsCounter Accumulator
      final LongAccumulator droppedWordsCounter =
          JavaDroppedWordsCounter.getInstance(new JavaSparkContext(rdd.context()));
      // Use blacklist to drop words and use droppedWordsCounter to count them
      String counts = rdd.filter(new Function<Tuple2<String, Integer>, Boolean>() {
        @Override
        public Boolean call(Tuple2<String, Integer> wordCount) {
          if (blacklist.value().contains(wordCount._1())) {
            droppedWordsCounter.add(wordCount._2());
            return false;
          } else {
            return true;
          }
        }
      }).collect().toString();
      String output = "Counts at time " + time + " " + counts;
      System.out.println(output);
      System.out.println("Dropped " + droppedWordsCounter.value() + " word(s) totally");
      System.out.println("Appending to " + outputFile.getAbsolutePath());
      Files.append(output + "\n", outputFile, Charset.defaultCharset());
    }
  });

  return ssc;
}
 
Example 14
Source File: IoTDataProcessor.java    From iot-traffic-monitor with Apache License 2.0 4 votes vote down vote up
public static void main(String[] args) throws Exception {
 //read Spark and Cassandra properties and create SparkConf
 Properties prop = PropertyFileReader.readPropertyFile();		
 SparkConf conf = new SparkConf()
		 .setAppName(prop.getProperty("com.iot.app.spark.app.name"))
		 .setMaster(prop.getProperty("com.iot.app.spark.master"))
		 .set("spark.cassandra.connection.host", prop.getProperty("com.iot.app.cassandra.host"))
		 .set("spark.cassandra.connection.port", prop.getProperty("com.iot.app.cassandra.port"))
		 .set("spark.cassandra.connection.keep_alive_ms", prop.getProperty("com.iot.app.cassandra.keep_alive"));		 
 //batch interval of 5 seconds for incoming stream		 
 JavaStreamingContext jssc = new JavaStreamingContext(conf, Durations.seconds(5));	
 //add check point directory
 jssc.checkpoint(prop.getProperty("com.iot.app.spark.checkpoint.dir"));
 
 //read and set Kafka properties
 Map<String, String> kafkaParams = new HashMap<String, String>();
 kafkaParams.put("zookeeper.connect", prop.getProperty("com.iot.app.kafka.zookeeper"));
 kafkaParams.put("metadata.broker.list", prop.getProperty("com.iot.app.kafka.brokerlist"));
 String topic = prop.getProperty("com.iot.app.kafka.topic");
 Set<String> topicsSet = new HashSet<String>();
 topicsSet.add(topic);
 //create direct kafka stream
 JavaPairInputDStream<String, IoTData> directKafkaStream = KafkaUtils.createDirectStream(
	        jssc,
	        String.class,
	        IoTData.class,
	        StringDecoder.class,
	        IoTDataDecoder.class,
	        kafkaParams,
	        topicsSet
	    );
 logger.info("Starting Stream Processing");
 
 //We need non filtered stream for poi traffic data calculation
 JavaDStream<IoTData> nonFilteredIotDataStream = directKafkaStream.map(tuple -> tuple._2());
 
 //We need filtered stream for total and traffic data calculation
 JavaPairDStream<String,IoTData> iotDataPairStream = nonFilteredIotDataStream.mapToPair(iot -> new Tuple2<String,IoTData>(iot.getVehicleId(),iot)).reduceByKey((a, b) -> a );

 // Check vehicle Id is already processed
 JavaMapWithStateDStream<String, IoTData, Boolean, Tuple2<IoTData,Boolean>> iotDStreamWithStatePairs = iotDataPairStream
					.mapWithState(StateSpec.function(processedVehicleFunc).timeout(Durations.seconds(3600)));//maintain state for one hour

 // Filter processed vehicle ids and keep un-processed
 JavaDStream<Tuple2<IoTData,Boolean>> filteredIotDStreams = iotDStreamWithStatePairs.map(tuple2 -> tuple2)
					.filter(tuple -> tuple._2.equals(Boolean.FALSE));

 // Get stream of IoTdata
 JavaDStream<IoTData> filteredIotDataStream = filteredIotDStreams.map(tuple -> tuple._1);
 
 //cache stream as it is used in total and window based computation
 filteredIotDataStream.cache();
 	 
 //process data
 IoTTrafficDataProcessor iotTrafficProcessor = new IoTTrafficDataProcessor();
 iotTrafficProcessor.processTotalTrafficData(filteredIotDataStream);
 iotTrafficProcessor.processWindowTrafficData(filteredIotDataStream);

 //poi data
 POIData poiData = new POIData();
 poiData.setLatitude(33.877495);
 poiData.setLongitude(-95.50238);
 poiData.setRadius(30);//30 km
 
 //broadcast variables. We will monitor vehicles on Route 37 which are of type Truck
 Broadcast<Tuple3<POIData, String, String>> broadcastPOIValues = jssc.sparkContext().broadcast(new Tuple3<>(poiData,"Route-37","Truck"));
 //call method  to process stream
 iotTrafficProcessor.processPOIData(nonFilteredIotDataStream,broadcastPOIValues);
 
 //start context
 jssc.start();            
 jssc.awaitTermination();  
}
 
Example 15
Source File: WordCountingAppWithCheckpoint.java    From tutorials with MIT License 4 votes vote down vote up
public static void main(String[] args) throws InterruptedException {

        Logger.getLogger("org")
            .setLevel(Level.OFF);
        Logger.getLogger("akka")
            .setLevel(Level.OFF);

        Map<String, Object> kafkaParams = new HashMap<>();
        kafkaParams.put("bootstrap.servers", "localhost:9092");
        kafkaParams.put("key.deserializer", StringDeserializer.class);
        kafkaParams.put("value.deserializer", StringDeserializer.class);
        kafkaParams.put("group.id", "use_a_separate_group_id_for_each_stream");
        kafkaParams.put("auto.offset.reset", "latest");
        kafkaParams.put("enable.auto.commit", false);

        Collection<String> topics = Arrays.asList("messages");

        SparkConf sparkConf = new SparkConf();
        sparkConf.setMaster("local[2]");
        sparkConf.setAppName("WordCountingAppWithCheckpoint");
        sparkConf.set("spark.cassandra.connection.host", "127.0.0.1");

        JavaStreamingContext streamingContext = new JavaStreamingContext(sparkConf, Durations.seconds(1));

        sparkContext = streamingContext.sparkContext();

        streamingContext.checkpoint("./.checkpoint");

        JavaInputDStream<ConsumerRecord<String, String>> messages = KafkaUtils.createDirectStream(streamingContext, LocationStrategies.PreferConsistent(), ConsumerStrategies.<String, String> Subscribe(topics, kafkaParams));

        JavaPairDStream<String, String> results = messages.mapToPair(record -> new Tuple2<>(record.key(), record.value()));

        JavaDStream<String> lines = results.map(tuple2 -> tuple2._2());

        JavaDStream<String> words = lines.flatMap(x -> Arrays.asList(x.split("\\s+"))
            .iterator());

        JavaPairDStream<String, Integer> wordCounts = words.mapToPair(s -> new Tuple2<>(s, 1))
            .reduceByKey((Function2<Integer, Integer, Integer>) (i1, i2) -> i1 + i2);

        JavaMapWithStateDStream<String, Integer, Integer, Tuple2<String, Integer>> cumulativeWordCounts = wordCounts.mapWithState(StateSpec.function((word, one, state) -> {
            int sum = one.orElse(0) + (state.exists() ? state.get() : 0);
            Tuple2<String, Integer> output = new Tuple2<>(word, sum);
            state.update(sum);
            return output;
        }));

        cumulativeWordCounts.foreachRDD(javaRdd -> {
            List<Tuple2<String, Integer>> wordCountList = javaRdd.collect();
            for (Tuple2<String, Integer> tuple : wordCountList) {
                List<Word> wordList = Arrays.asList(new Word(tuple._1, tuple._2));
                JavaRDD<Word> rdd = sparkContext.parallelize(wordList);
                javaFunctions(rdd).writerBuilder("vocabulary", "words", mapToRow(Word.class))
                    .saveToCassandra();
            }
        });

        streamingContext.start();
        streamingContext.awaitTermination();
    }