Java Code Examples for org.apache.spark.streaming.api.java.JavaStreamingContext#start()

The following examples show how to use org.apache.spark.streaming.api.java.JavaStreamingContext#start() . You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: KafkaStreaming.java    From sparkResearch with Apache License 2.0 8 votes vote down vote up
public static void main(String[] args) {
    SparkConf sparkConf = new SparkConf().setAppName("KafkaWordCount").setMaster("local[2]");
    JavaStreamingContext streamingContext = new JavaStreamingContext(sparkConf, Durations.seconds(10000));
    //设置检查点
    streamingContext.checkpoint("HDFS URL");
    Map<String, Integer> topicThread = new HashMap<>(1);
    topicThread.put(TOPIC, THREAD);
    JavaPairInputDStream<String, String> dStream = KafkaUtils.createStream(streamingContext, HOST, GROP, topicThread);

    JavaDStream<String> words = dStream.flatMap((FlatMapFunction<Tuple2<String, String>, String>) stringStringTuple2 -> Arrays.asList(SPACE.split(stringStringTuple2._2)).iterator());

    //统计
    JavaPairDStream<String, Integer> result = words.mapToPair((PairFunction<String, String, Integer>) s -> new Tuple2<>(s, 1)).reduceByKey((Function2<Integer, Integer, Integer>) (v1, v2) -> v1 + v2);

    try {
        result.print();
        streamingContext.start();
        streamingContext.awaitTermination();
    } catch (InterruptedException e) {
        e.printStackTrace();
    }
}
 
Example 2
Source File: ReduceByKeyAndWindow.java    From sparkResearch with Apache License 2.0 6 votes vote down vote up
public static void main(String[] args) {
    SparkConf sparkConf = new SparkConf().setAppName("reduceByKeyAndWindow").setMaster("local[2]");
    JavaStreamingContext streamingContext = new JavaStreamingContext(sparkConf, Durations.seconds(10));
    //检查点设置
    streamingContext.checkpoint("hdfs://localhost:9300");
    //数据源
    JavaDStream<String> dStream = streamingContext.socketTextStream("localhost", 8080);

    JavaPairDStream<String, Long> ipPairDstream = dStream.mapToPair(new GetIp());

    JavaPairDStream<String, Long> result = ipPairDstream.reduceByKeyAndWindow(new AddLongs(),
            new SubtractLongs(), Durations.seconds(30), Durations.seconds(10));

    try {
        streamingContext.start();
        streamingContext.awaitTermination();
    } catch (InterruptedException e) {
        e.printStackTrace();
    }
}
 
Example 3
Source File: SparkStreamingFromNetworkExample.java    From SparkOnALog with Apache License 2.0 5 votes vote down vote up
public static void main(String[] args) {
  if (args.length < 3) {
    System.err.println("Usage: NetworkWordCount <master> <hostname> <port>\n" +
        "In local mode, <master> should be 'local[n]' with n > 1");
    System.exit(1);
  }

  // Create the context with a 1 second batch size
  JavaStreamingContext ssc = new JavaStreamingContext(args[0], "NetworkWordCount",
          new Duration(5000), System.getenv("SPARK_HOME"), System.getenv("SPARK_EXAMPLES_JAR"));

  // Create a NetworkInputDStream on target ip:port and count the
  // words in input stream of \n delimited test (eg. generated by 'nc')
  JavaDStream<String> lines = ssc.socketTextStream(args[1], Integer.parseInt(args[2]));
  
  lines.map(new Function<String, String> () {

@Override
public String call(String arg0) throws Exception {
	System.out.println("arg0" + arg0);
	return arg0;
}}).print();
  
  lines.print();
  ssc.start();


}
 
Example 4
Source File: ReaderWriterExample.java    From spliceengine with GNU Affero General Public License v3.0 5 votes vote down vote up
public static void main(String[] args) throws Exception {

        final String dbUrl = args[0];
        final String hostname = args[1];
        final String port = args[2];
        final String inTargetSchema = args[3];
        final String inTargetTable = args[4];

        SparkConf conf = new SparkConf();

        JavaStreamingContext ssc = new JavaStreamingContext(conf, new Duration(500));

        JavaReceiverInputDStream<String> stream = ssc.socketTextStream(hostname, Integer.parseInt(port));

        SparkSession spark = SparkSession.builder().getOrCreate();

        // Create a SplicemachineContext based on the provided DB connection
        SplicemachineContext splicemachineContext = new SplicemachineContext(dbUrl);

        // Set target tablename and schemaname
        final String table = inTargetSchema + "." + inTargetTable;

        stream.foreachRDD((VoidFunction<JavaRDD<String>>) rdd -> {
            JavaRDD<Row> rowRDD = rdd.map((Function<String, Row>) s -> RowFactory.create(s));
            Dataset<Row> df = spark.createDataFrame(rowRDD, splicemachineContext.getSchema(table));

            splicemachineContext.insert(df, table);
        });

        ssc.start();
        ssc.awaitTermination();
    }
 
Example 5
Source File: JavaRecoverableNetworkWordCount.java    From SparkDemo with MIT License 5 votes vote down vote up
public static void main(String[] args) throws Exception {
  if (args.length != 4) {
    System.err.println("You arguments were " + Arrays.asList(args));
    System.err.println(
        "Usage: JavaRecoverableNetworkWordCount <hostname> <port> <checkpoint-directory>\n" +
        "     <output-file>. <hostname> and <port> describe the TCP server that Spark\n" +
        "     Streaming would connect to receive data. <checkpoint-directory> directory to\n" +
        "     HDFS-compatible file system which checkpoint data <output-file> file to which\n" +
        "     the word counts will be appended\n" +
        "\n" +
        "In local mode, <master> should be 'local[n]' with n > 1\n" +
        "Both <checkpoint-directory> and <output-file> must be absolute paths");
    System.exit(1);
  }

  final String ip = args[0];
  final int port = Integer.parseInt(args[1]);
  final String checkpointDirectory = args[2];
  final String outputPath = args[3];

  // Function to create JavaStreamingContext without any output operations
  // (used to detect the new context)
  Function0<JavaStreamingContext> createContextFunc = new Function0<JavaStreamingContext>() {
    @Override
    public JavaStreamingContext call() {
      return createContext(ip, port, checkpointDirectory, outputPath);
    }
  };

  JavaStreamingContext ssc =
    JavaStreamingContext.getOrCreate(checkpointDirectory, createContextFunc);
  ssc.start();
  ssc.awaitTermination();
}
 
Example 6
Source File: Runner.java    From envelope with Apache License 2.0 4 votes vote down vote up
/**
 * Run the Envelope pipeline as a Spark Streaming job.
 * @param steps The full configuration of the Envelope pipeline
 */
@SuppressWarnings("unchecked")
private void runStreaming(final Set<Step> steps) throws Exception {
  final Set<Step> independentNonStreamingSteps = StepUtils.getIndependentNonStreamingSteps(steps);
  runBatch(independentNonStreamingSteps);

  Set<StreamingStep> streamingSteps = StepUtils.getStreamingSteps(steps);
  for (final StreamingStep streamingStep : streamingSteps) {
    LOG.debug("Setting up streaming step: " + streamingStep.getName());

    JavaDStream stream = streamingStep.getStream();

    stream.foreachRDD(new VoidFunction<JavaRDD<?>>() {
      @Override
      public void call(JavaRDD<?> raw) throws Exception {
        // Some independent steps might be repeating steps that have been flagged for reload
        StepUtils.resetRepeatingSteps(steps);
        // This will run any batch steps (and dependents) that are not submitted
        runBatch(independentNonStreamingSteps);

        streamingStep.setData(streamingStep.translate(raw));
        streamingStep.writeData();
        streamingStep.setState(StepState.FINISHED);

        Set<Step> batchSteps = StepUtils.mergeLoadedSteps(steps, streamingStep, baseConfig);
        Set<Step> dependentSteps = StepUtils.getAllDependentSteps(streamingStep, batchSteps);
        batchSteps.add(streamingStep);
        batchSteps.addAll(streamingStep.loadNewBatchSteps());
        batchSteps.addAll(independentNonStreamingSteps);
        runBatch(batchSteps);

        StepUtils.resetSteps(dependentSteps);

        streamingStep.recordProgress(raw);
      }
    });

    LOG.debug("Finished setting up streaming step: " + streamingStep.getName());
  }

  JavaStreamingContext jsc = Contexts.getJavaStreamingContext();
  jsc.start();
  LOG.debug("Streaming context started");
  jsc.awaitTermination();
  LOG.debug("Streaming context terminated");
}
 
Example 7
Source File: FraudDetectionApp.java    From Building-Data-Streaming-Applications-with-Apache-Kafka with MIT License 4 votes vote down vote up
public static void main(String[] args) throws Exception {

        String brokers = "localhost:9092";
        String topics = "iplog";
        CacheIPLookup cacheIPLookup = new CacheIPLookup();
        SparkConf sparkConf = new SparkConf().setAppName("IP_FRAUD");
        JavaStreamingContext javaStreamingContext = new JavaStreamingContext(sparkConf, Durations.seconds(2));

        Set<String> topicsSet = new HashSet<>(Arrays.asList(topics.split(",")));
        Map<String, String> kafkaConfiguration = new HashMap<>();
        kafkaConfiguration.put("metadata.broker.list", brokers);
        kafkaConfiguration.put("group.id", "ipfraud");
        kafkaConfiguration.put("auto.offset.reset", "smallest");

        JavaPairInputDStream<String, String> messages = KafkaUtils.createDirectStream(
                javaStreamingContext,
                String.class,
                String.class,
                StringDecoder.class,
                StringDecoder.class,
                kafkaConfiguration,
                topicsSet
        );

        JavaDStream<String> lines = messages.map(Tuple2::_2);

        JavaDStream<String> fraudIPs = lines.filter(new Function<String, Boolean>() {
            @Override
            public Boolean call(String s) throws Exception {
                String IP = s.split(" ")[0];
                String[] ranges = IP.split("\\.");
                String range = null;
                try {
                    range = ranges[0] + "." + ranges[1];
                } catch (ArrayIndexOutOfBoundsException ex) {

                }
                return cacheIPLookup.isFraudIP(range);

            }
        });

        DStream<String> fraudDstream = fraudIPs.dstream();
        fraudDstream.saveAsTextFiles("FraudRecord", "");

        javaStreamingContext.start();
        javaStreamingContext.awaitTermination();
    }
 
Example 8
Source File: Throughput.java    From flink-perf with Apache License 2.0 4 votes vote down vote up
public static void main(String[] args) {
	SparkConf conf = new SparkConf().setAppName("throughput").setMaster("local[8]");
	JavaStreamingContext ssc = new JavaStreamingContext(conf, new Duration(2000));

	JavaReceiverInputDStream<Tuple4<Long, Integer, Long, byte[]>> source = ssc.receiverStream(new Source(StorageLevel.MEMORY_ONLY()));
	JavaPairDStream<Long, Tuple3<Integer, Long, byte[]>> kvsource = source.mapToPair(new PairFunction<Tuple4<Long, Integer, Long, byte[]>, Long, Tuple3<Integer, Long, byte[]>>() {
		@Override
		public Tuple2<Long, Tuple3<Integer, Long, byte[]>> call(Tuple4<Long, Integer, Long, byte[]> longIntegerLongTuple4) throws Exception {
			return new Tuple2<Long, Tuple3<Integer, Long, byte[]>>(longIntegerLongTuple4._1(),
					new Tuple3<Integer, Long, byte[]>(longIntegerLongTuple4._2(), longIntegerLongTuple4._3(), longIntegerLongTuple4._4()));
		}
	});
	JavaDStream<Long> res = kvsource.repartition(3).mapPartitions(new FlatMapFunction<Iterator<Tuple2<Long,Tuple3<Integer,Long,byte[]>>>, Long>() {
		@Override
		public Iterable<Long> call(Iterator<Tuple2<Long, Tuple3<Integer, Long, byte[]>>> tuple2Iterator) throws Exception {
			long start = System.currentTimeMillis();
			long received = 0;
			while(tuple2Iterator.hasNext()) {
				received++;
				Tuple2<Long, Tuple3<Integer, Long, byte[]>> el = tuple2Iterator.next();

				if (el._2()._2() != 0) {
					long lat = System.currentTimeMillis() - el._2()._2();
					System.out.println("Latency " + lat + " ms");
				}
			}
			long sinceMs = (System.currentTimeMillis() - start);

			System.out.println("Finished Batch. Processed "+received+" elements in "+sinceMs+" ms.");

			return new Iterable<Long>() {
				@Override
				public Iterator<Long> iterator() {
					return new Iterator<Long>() {
						@Override
						public boolean hasNext() {
							return false;
						}

						@Override
						public Long next() {
							return 1L;
						}

						@Override
						public void remove() {

						}
					};
				}
			};
		}


	/*	@Override
		public Long call(Tuple2<Long, Tuple3<Integer, Long, byte[]>> v1) throws Exception {
			//	System.out.println("Recevied " + v1);
			if (start == 0) {

			}
			received++;
			if (received % logfreq == 0) {

				if (sinceSec == 0) {
					System.out.println("received " + received + " elements since 0");
					return 0L;
				}
				System.out.println("Received " + received + " elements since " + sinceSec + ". " +
						"Elements per second " + received / sinceSec + ", GB received " + ((received * (8 + 4 + 12)) / 1024 / 1024 / 1024));
			}
			if (v1._2()._2() != 0) {
				long lat = System.currentTimeMillis() - v1._2()._2();
				System.out.println("Latency " + lat + " ms");
			}

			return received;
		} */
	});


	//res.print();
	/*res.foreachRDD(new Function2<JavaRDD<Long>, Time, Void>() {
		@Override
		public Void call(JavaRDD<Long> integerJavaRDD, Time t) throws Exception {
			integerJavaRDD.saveAsTextFile("/home/robert/flink-workdir/flink-perf/out/"+t.toString());
			return null;
		}
	}); */
	res.print();
//	res.print();

	ssc.start();
}
 
Example 9
Source File: StateFulProcessingExample.java    From Apache-Spark-2x-for-Java-Developers with MIT License 4 votes vote down vote up
public static void main(String[] args) throws InterruptedException {

		System.setProperty("hadoop.home.dir", "C:\\softwares\\Winutils");

		SparkSession sparkSession = SparkSession.builder().master("local[*]").appName("Stateful Streaming Example")
				.config("spark.sql.warehouse.dir", "file:////C:/Users/sgulati/spark-warehouse").getOrCreate();

		JavaStreamingContext jssc= new JavaStreamingContext(new JavaSparkContext(sparkSession.sparkContext()),
				Durations.milliseconds(1000));
		JavaReceiverInputDStream<String> inStream = jssc.socketTextStream("10.204.136.223", 9999);
		jssc.checkpoint("C:\\Users\\sgulati\\spark-checkpoint");

		JavaDStream<FlightDetails> flightDetailsStream = inStream.map(x -> {
			ObjectMapper mapper = new ObjectMapper();
			return mapper.readValue(x, FlightDetails.class);
		});
		
		

		JavaPairDStream<String, FlightDetails> flightDetailsPairStream = flightDetailsStream
				.mapToPair(f -> new Tuple2<String, FlightDetails>(f.getFlightId(), f));

		Function3<String, Optional<FlightDetails>, State<List<FlightDetails>>, Tuple2<String, Double>> mappingFunc = (
				flightId, curFlightDetail, state) -> {
			List<FlightDetails> details = state.exists() ? state.get() : new ArrayList<>();

			boolean isLanded = false;

			if (curFlightDetail.isPresent()) {
				details.add(curFlightDetail.get());
				if (curFlightDetail.get().isLanded()) {
					isLanded = true;
				}
			}
			Double avgSpeed = details.stream().mapToDouble(f -> f.getTemperature()).average().orElse(0.0);

			if (isLanded) {
				state.remove();
			} else {
				state.update(details);
			}
			return new Tuple2<String, Double>(flightId, avgSpeed);
		};

		JavaMapWithStateDStream<String, FlightDetails, List<FlightDetails>, Tuple2<String, Double>> streamWithState = flightDetailsPairStream
				.mapWithState(StateSpec.function(mappingFunc).timeout(Durations.minutes(5)));
		
		streamWithState.print();
		jssc.start();
		jssc.awaitTermination();
	}
 
Example 10
Source File: KafkaExample.java    From Apache-Spark-2x-for-Java-Developers with MIT License 4 votes vote down vote up
public static void main(String[] args) {
  	//Window Specific property if Hadoop is not instaalled or HADOOP_HOME is not set
 System.setProperty("hadoop.home.dir", "E:\\hadoop");
  	//Logger rootLogger = LogManager.getRootLogger();
 		//rootLogger.setLevel(Level.WARN); 
      SparkConf conf = new SparkConf().setAppName("KafkaExample").setMaster("local[*]");    
      JavaSparkContext sc = new JavaSparkContext(conf);
      JavaStreamingContext streamingContext = new JavaStreamingContext(sc, Durations.minutes(2));
      streamingContext.checkpoint("E:\\hadoop\\checkpoint");
      Logger rootLogger = LogManager.getRootLogger();
 		rootLogger.setLevel(Level.WARN); 
      Map<String, Object> kafkaParams = new HashMap<>();
      kafkaParams.put("bootstrap.servers", "10.0.75.1:9092");
      kafkaParams.put("key.deserializer", StringDeserializer.class);
      kafkaParams.put("value.deserializer", StringDeserializer.class);
      kafkaParams.put("group.id", "use_a_separate_group_id_for_each_strea");
      kafkaParams.put("auto.offset.reset", "latest");
     // kafkaParams.put("enable.auto.commit", false);

      Collection<String> topics = Arrays.asList("mytopic", "anothertopic");

      final JavaInputDStream<ConsumerRecord<String, String>> stream = KafkaUtils.createDirectStream(streamingContext,LocationStrategies.PreferConsistent(),
      				ConsumerStrategies.<String, String>Subscribe(topics, kafkaParams));

      JavaPairDStream<String, String> pairRDD = stream.mapToPair(record-> new Tuple2<>(record.key(), record.value()));
     
      pairRDD.foreachRDD(pRDD-> { pRDD.foreach(tuple-> System.out.println(new Date()+" :: Kafka msg key ::"+tuple._1() +" the val is ::"+tuple._2()));});
     
      JavaDStream<String> tweetRDD = pairRDD.map(x-> x._2()).map(new TweetText());
      
      tweetRDD.foreachRDD(tRDD -> tRDD.foreach(x->System.out.println(new Date()+" :: "+x)));
      
     JavaDStream<String> hashtagRDD = tweetRDD.flatMap(twt-> Arrays.stream(twt.split(" ")).filter(str-> str.contains("#")).collect(Collectors.toList()).iterator() );
 
      hashtagRDD.foreachRDD(tRDD -> tRDD.foreach(x->System.out.println(x)));
      
      JavaPairDStream<String, Long> cntByVal = hashtagRDD.countByValue();
      
      cntByVal.foreachRDD(tRDD -> tRDD.foreach(x->System.out.println(new Date()+" ::The count tag is ::"+x._1() +" and the val is ::"+x._2())));
      
     /* hashtagRDD.window(Durations.seconds(60), Durations.seconds(30))
                .countByValue()
               .foreachRDD(tRDD -> tRDD.foreach(x->System.out.println(new Date()+" ::The window count tag is ::"+x._1() +" and the val is ::"+x._2())));
      
     hashtagRDD.countByValueAndWindow(Durations.seconds(60), Durations.seconds(30))
               .foreachRDD(tRDD -> tRDD.foreach(x->System.out.println("The window&count tag is ::"+x._1() +" and the val is ::"+x._2())));
      */
     hashtagRDD.window(Durations.minutes(8)).countByValue()
     .foreachRDD(tRDD -> tRDD.foreach(x->System.out.println(new Date()+" ::The window count tag is ::"+x._1() +" and the val is ::"+x._2())));
     hashtagRDD.window(Durations.minutes(8),Durations.minutes(2)).countByValue()
     .foreachRDD(tRDD -> tRDD.foreach(x->System.out.println(new Date()+" ::The window count tag is ::"+x._1() +" and the val is ::"+x._2())));
     hashtagRDD.window(Durations.minutes(12),Durations.minutes(8)).countByValue()
     .foreachRDD(tRDD -> tRDD.foreach(x->System.out.println(new Date()+" ::The window count tag is ::"+x._1() +" and the val is ::"+x._2())));
     hashtagRDD.window(Durations.minutes(2),Durations.minutes(2)).countByValue()
     .foreachRDD(tRDD -> tRDD.foreach(x->System.out.println(new Date()+" ::The window count tag is ::"+x._1() +" and the val is ::"+x._2())));
     hashtagRDD.window(Durations.minutes(12),Durations.minutes(12)).countByValue()
     .foreachRDD(tRDD -> tRDD.foreach(x->System.out.println(new Date()+" ::The window count tag is ::"+x._1() +" and the val is ::"+x._2())));
     
     /*hashtagRDD.window(Durations.minutes(5),Durations.minutes(2)).countByValue()
     .foreachRDD(tRDD -> tRDD.foreach(x->System.out.println(new Date()+" ::The window count tag is ::"+x._1() +" and the val is ::"+x._2())));*/
     /* hashtagRDD.window(Durations.minutes(10),Durations.minutes(1)).countByValue()
     .foreachRDD(tRDD -> tRDD.foreach(x->System.out.println(new Date()+" ::The window count tag is ::"+x._1() +" and the val is ::"+x._2())));*/
     
      streamingContext.start();
      try {
	streamingContext.awaitTermination();
} catch (InterruptedException e) {
	// TODO Auto-generated catch block
	e.printStackTrace();
}
  }
 
Example 11
Source File: JavaHDFSWordCount.java    From SparkDemo with MIT License 4 votes vote down vote up
/**
 * To run this on your local machine, you need to first run a Netcat server
 * `$ nc -lk 9999` and then run the example `$ bin/run-example
 * org.apache.spark.examples.streaming.JavaNetworkWordCount localhost 9999`
 */
public static void main(String[] args) {
	SparkConf sparkConf = new SparkConf().setAppName("JavaNetworkWordCount").setMaster("local[5]");
	/*
	 * 创建该对象类似于spark core中的JavaSparkContext
	 * 该对象除了接受SparkConf对象,还接收了一个BatchInterval参数,就算说,
	 * 没收集多长时间去划分一个人Batch即RDD去执行
	 */
	JavaStreamingContext ssc = new JavaStreamingContext(sparkConf, Durations.seconds(5));

	/*
	 * 首先创建输入DStream,代表一个数据比如这里从socket或KafKa来持续不断的进入实时数据流
	 * 创建一个监听Socket数据量,RDD里面的每一个元素就是一行行的文本
	 */
	JavaDStream<String> lines = ssc.textFileStream("hdfs://master:8020/wordcount_dir");

	JavaDStream<String> words = lines.flatMap(new FlatMapFunction<String, String>() {
		@Override
		public Iterator<String> call(String x) {
			return Lists.newArrayList(SPACE.split(x)).iterator();
		}
	});
	JavaPairDStream<String, Integer> wordCounts = words.mapToPair(new PairFunction<String, String, Integer>() {
		@Override
		public Tuple2<String, Integer> call(String s) {
			return new Tuple2<String, Integer>(s, 1);
		}
	}).reduceByKey(new Function2<Integer, Integer, Integer>() {
		@Override
		public Integer call(Integer i1, Integer i2) {
			return i1 + i2;
		}
	});

	wordCounts.print();
	ssc.start();
	try {
		ssc.awaitTermination();
	} catch (Exception e) {
		e.printStackTrace();
	}
}
 
Example 12
Source File: JavaKafkaDirectWordCount.java    From SparkDemo with MIT License 4 votes vote down vote up
/**
 * 1.一对一
 * 2.高效
 * 3.准确的只计算一次
 *
 * @param args
 */
public static void main(String[] args) {
    StreamingExamples.setStreamingLogLevels();
    SparkConf sparkConf = new SparkConf().setAppName("JavaKafkaDirectWordCount").setMaster("local[1]");
    JavaStreamingContext jssc = new JavaStreamingContext(sparkConf, Durations.seconds(6));

    Map<String, String> kafkaParams = new HashMap<String, String>(); // key是topic名称,value是线程数量
    kafkaParams.put("metadata.broker.list", "master:9092,slave1:9092,slave2:9092"); // 指定broker在哪
    HashSet<String> topicsSet = new HashSet<String>();
    topicsSet.add("2017-7-26"); // 指定操作的topic

    // Create direct kafka stream with brokers and topics createDirectStream()
    JavaPairInputDStream<String, String> messages = KafkaUtils.createDirectStream(
            jssc,
            String.class,
            String.class,
            StringDecoder.class,
            StringDecoder.class,
            kafkaParams,
            topicsSet
    );

    JavaDStream<String> lines = messages.map(new Function<Tuple2<String, String>, String>() {
        @Override
        public String call(Tuple2<String, String> tuple2) {
            return tuple2._2();
        }
    });

    JavaDStream<String> words = lines.flatMap(new FlatMapFunction<String, String>() {
        @Override
        public Iterator<String> call(String x) {
            return Lists.newArrayList(SPACE.split(x)).iterator();
        }
    });

    JavaPairDStream<String, Integer> wordCounts = words.mapToPair(new PairFunction<String, String, Integer>() {
        @Override
        public Tuple2<String, Integer> call(String s) {
            return new Tuple2<String, Integer>(s, 1);
        }
    }).reduceByKey(new Function2<Integer, Integer, Integer>() {
        @Override
        public Integer call(Integer i1, Integer i2) {
            return i1 + i2;
        }
    });

    wordCounts.print();
    jssc.start();
    try {
        jssc.awaitTermination();
    } catch (Exception e) {
        e.printStackTrace();
    }
}
 
Example 13
Source File: JavaKafkaWordCount.java    From SparkDemo with MIT License 4 votes vote down vote up
public static void main(String[] args) throws Exception {
  if (args.length < 4) {
    System.err.println("Usage: JavaKafkaWordCount <zkQuorum> <group> <topics> <numThreads>");
    System.exit(1);
  }

  StreamingExamples.setStreamingLogLevels();
  SparkConf sparkConf = new SparkConf().setAppName("JavaKafkaWordCount");
  // Create the context with 2 seconds batch size
  JavaStreamingContext jssc = new JavaStreamingContext(sparkConf, new Duration(2000));

  int numThreads = Integer.parseInt(args[3]);
  Map<String, Integer> topicMap = new HashMap<>();
  String[] topics = args[2].split(",");
  for (String topic: topics) {
    topicMap.put(topic, numThreads);
  }

  JavaPairReceiverInputDStream<String, String> messages =
          KafkaUtils.createStream(jssc, args[0], args[1], topicMap);

  JavaDStream<String> lines = messages.map(new Function<Tuple2<String, String>, String>() {
    @Override
    public String call(Tuple2<String, String> tuple2) {
      return tuple2._2();
    }
  });

  JavaDStream<String> words = lines.flatMap(new FlatMapFunction<String, String>() {
    @Override
    public Iterator<String> call(String x) {
      return Arrays.asList(SPACE.split(x)).iterator();
    }
  });

  JavaPairDStream<String, Integer> wordCounts = words.mapToPair(
    new PairFunction<String, String, Integer>() {
      @Override
      public Tuple2<String, Integer> call(String s) {
        return new Tuple2<>(s, 1);
      }
    }).reduceByKey(new Function2<Integer, Integer, Integer>() {
      @Override
      public Integer call(Integer i1, Integer i2) {
        return i1 + i2;
      }
    });

  wordCounts.print();
  jssc.start();
  jssc.awaitTermination();
}
 
Example 14
Source File: WindowBatchInterval.java    From Apache-Spark-2x-for-Java-Developers with MIT License 4 votes vote down vote up
public static void main(String[] args) {
   	//Window Specific property if Hadoop is not instaalled or HADOOP_HOME is not set
	 System.setProperty("hadoop.home.dir", "E:\\hadoop");
   	//Logger rootLogger = LogManager.getRootLogger();
  		//rootLogger.setLevel(Level.WARN); 
       SparkConf conf = new SparkConf().setAppName("KafkaExample").setMaster("local[*]");
       
    
       JavaSparkContext sc = new JavaSparkContext(conf);
       JavaStreamingContext streamingContext = new JavaStreamingContext(sc, Durations.minutes(2));
       streamingContext.checkpoint("E:\\hadoop\\checkpoint");
       Logger rootLogger = LogManager.getRootLogger();
  		rootLogger.setLevel(Level.WARN); 
  		
  	 List<Tuple2<String, Integer>> tuples = Arrays.asList(new Tuple2<>("hello", 10), new Tuple2<>("world", 10));
    JavaPairRDD<String, Integer> initialRDD = streamingContext.sparkContext().parallelizePairs(tuples);
		    

    JavaReceiverInputDStream<String> StreamingLines = streamingContext.socketTextStream( "10.0.75.1", Integer.parseInt("9000"), StorageLevels.MEMORY_AND_DISK_SER);
    
    JavaDStream<String> words = StreamingLines.flatMap( str -> Arrays.asList(str.split(" ")).iterator() );
   
    JavaPairDStream<String, Integer> wordCounts = words.mapToPair(str-> new Tuple2<>(str, 1)).reduceByKey((count1,count2) ->count1+count2 );
   
    wordCounts.print();
    wordCounts.window(Durations.minutes(8)).countByValue()
      .foreachRDD(tRDD -> tRDD.foreach(x->System.out.println(new Date()+" ::The window count tag is ::"+x._1() +" and the val is ::"+x._2())));
    wordCounts.window(Durations.minutes(8),Durations.minutes(2)).countByValue()
      .foreachRDD(tRDD -> tRDD.foreach(x->System.out.println(new Date()+" ::The window count tag is ::"+x._1() +" and the val is ::"+x._2())));
    wordCounts.window(Durations.minutes(12),Durations.minutes(8)).countByValue()
      .foreachRDD(tRDD -> tRDD.foreach(x->System.out.println(new Date()+" ::The window count tag is ::"+x._1() +" and the val is ::"+x._2())));
    wordCounts.window(Durations.minutes(2),Durations.minutes(2)).countByValue()
      .foreachRDD(tRDD -> tRDD.foreach(x->System.out.println(new Date()+" ::The window count tag is ::"+x._1() +" and the val is ::"+x._2())));
    wordCounts.window(Durations.minutes(12),Durations.minutes(12)).countByValue()
      .foreachRDD(tRDD -> tRDD.foreach(x->System.out.println(new Date()+" ::The window count tag is ::"+x._1() +" and the val is ::"+x._2())));
      
    //comment these two operation to make it run
    wordCounts.window(Durations.minutes(5),Durations.minutes(2)).countByValue()
      .foreachRDD(tRDD -> tRDD.foreach(x->System.out.println(new Date()+" ::The window count tag is ::"+x._1() +" and the val is ::"+x._2())));
    wordCounts.window(Durations.minutes(10),Durations.minutes(1)).countByValue()
      .foreachRDD(tRDD -> tRDD.foreach(x->System.out.println(new Date()+" ::The window count tag is ::"+x._1() +" and the val is ::"+x._2())));
      
       streamingContext.start();
       try {
		streamingContext.awaitTermination();
	} catch (InterruptedException e) {
		// TODO Auto-generated catch block
		e.printStackTrace();
	}
}
 
Example 15
Source File: JavaNetworkWordCount.java    From SparkDemo with MIT License 4 votes vote down vote up
public static void main(String[] args) throws Exception {
  if (args.length < 2) {
    System.err.println("Usage: JavaNetworkWordCount <hostname> <port>");
    System.exit(1);
  }

  StreamingExamples.setStreamingLogLevels();

  // Create the context with a 1 second batch size
  SparkConf sparkConf = new SparkConf().setAppName("JavaNetworkWordCount");
  JavaStreamingContext ssc = new JavaStreamingContext(sparkConf, Durations.seconds(1));

  // Create a JavaReceiverInputDStream on target ip:port and count the
  // words in input stream of \n delimited text (eg. generated by 'nc')
  // Note that no duplication in storage level only for running locally.
  // Replication necessary in distributed scenario for fault tolerance.
  JavaReceiverInputDStream<String> lines = ssc.socketTextStream(
          args[0], Integer.parseInt(args[1]), StorageLevels.MEMORY_AND_DISK_SER);
  JavaDStream<String> words = lines.flatMap(new FlatMapFunction<String, String>() {
    @Override
    public Iterator<String> call(String x) {
      return Arrays.asList(SPACE.split(x)).iterator();
    }
  });
  JavaPairDStream<String, Integer> wordCounts = words.mapToPair(
    new PairFunction<String, String, Integer>() {
      @Override
      public Tuple2<String, Integer> call(String s) {
        return new Tuple2<>(s, 1);
      }
    }).reduceByKey(new Function2<Integer, Integer, Integer>() {
      @Override
      public Integer call(Integer i1, Integer i2) {
        return i1 + i2;
      }
    });

  wordCounts.print();
  ssc.start();
  ssc.awaitTermination();
}
 
Example 16
Source File: StreamingIngestionFileSystemTextFileToDataframeApp.java    From net.jgp.labs.spark with Apache License 2.0 4 votes vote down vote up
private void start() {
  // Create a local StreamingContext with two working thread and batch
  // interval of
  // 1 second
  SparkConf conf = new SparkConf().setMaster("local[2]").setAppName(
      "Streaming Ingestion File System Text File to Dataframe");
  JavaStreamingContext jssc = new JavaStreamingContext(conf, Durations
      .seconds(5));

  JavaDStream<String> msgDataStream = jssc.textFileStream(StreamingUtils
      .getInputDirectory());

  msgDataStream.print();
  // Create JavaRDD<Row>
  msgDataStream.foreachRDD(new VoidFunction<JavaRDD<String>>() {
    private static final long serialVersionUID = -590010339928376829L;

    @Override
    public void call(JavaRDD<String> rdd) {
      JavaRDD<Row> rowRDD = rdd.map(new Function<String, Row>() {
        private static final long serialVersionUID = 5167089361335095997L;

        @Override
        public Row call(String msg) {
          Row row = RowFactory.create(msg);
          return row;
        }
      });
      // Create Schema
      StructType schema = DataTypes.createStructType(
          new StructField[] { DataTypes.createStructField("Message",
              DataTypes.StringType, true) });

      // Get Spark 2.0 session
      SparkSession spark = JavaSparkSessionSingleton.getInstance(rdd.context()
          .getConf());
      Dataset<Row> msgDataFrame = spark.createDataFrame(rowRDD, schema);
      msgDataFrame.show();
    }
  });

  jssc.start();
  try {
    jssc.awaitTermination();
  } catch (InterruptedException e) {
    // TODO Auto-generated catch block
    e.printStackTrace();
  }
}
 
Example 17
Source File: JavaSqlNetworkWordCount.java    From SparkDemo with MIT License 4 votes vote down vote up
public static void main(String[] args) throws Exception {
  if (args.length < 2) {
    System.err.println("Usage: JavaNetworkWordCount <hostname> <port>");
    System.exit(1);
  }

  StreamingExamples.setStreamingLogLevels();

  // Create the context with a 1 second batch size
  SparkConf sparkConf = new SparkConf().setAppName("JavaSqlNetworkWordCount");
  JavaStreamingContext ssc = new JavaStreamingContext(sparkConf, Durations.seconds(1));

  // Create a JavaReceiverInputDStream on target ip:port and count the
  // words in input stream of \n delimited text (eg. generated by 'nc')
  // Note that no duplication in storage level only for running locally.
  // Replication necessary in distributed scenario for fault tolerance.
  JavaReceiverInputDStream<String> lines = ssc.socketTextStream(
      args[0], Integer.parseInt(args[1]), StorageLevels.MEMORY_AND_DISK_SER);
  JavaDStream<String> words = lines.flatMap(new FlatMapFunction<String, String>() {
    @Override
    public Iterator<String> call(String x) {
      return Arrays.asList(SPACE.split(x)).iterator();
    }
  });

  // Convert RDDs of the words DStream to DataFrame and run SQL query
  words.foreachRDD(new VoidFunction2<JavaRDD<String>, Time>() {
    @Override
    public void call(JavaRDD<String> rdd, Time time) {
      SparkSession spark = JavaSparkSessionSingleton.getInstance(rdd.context().getConf());

      // Convert JavaRDD[String] to JavaRDD[bean class] to DataFrame
      JavaRDD<JavaRecord> rowRDD = rdd.map(new Function<String, JavaRecord>() {
        @Override
        public JavaRecord call(String word) {
          JavaRecord record = new JavaRecord();
          record.setWord(word);
          return record;
        }
      });
      Dataset<Row> wordsDataFrame = spark.createDataFrame(rowRDD, JavaRecord.class);

      // Creates a temporary view using the DataFrame
      wordsDataFrame.createOrReplaceTempView("words");

      // Do word count on table using SQL and print it
      Dataset<Row> wordCountsDataFrame =
          spark.sql("select word, count(*) as total from words group by word");
      System.out.println("========= " + time + "=========");
      wordCountsDataFrame.show();
    }
  });

  ssc.start();
  ssc.awaitTermination();
}
 
Example 18
Source File: SparkStreamingPulsarReceiverExample.java    From pulsar with Apache License 2.0 4 votes vote down vote up
public static void main(String[] args) throws Exception {
  if (args.length < 3) {
    System.err.println("Missing parameters!");
    System.err.println("Usage: <pulsar-service-url> <topic> <sub>");
    return;
  }

  String serviceUrl =  args[0];
  String inputTopic =  args[1];
  String subscription =  args[2];
  System.out.println("Parameters:");
  System.out.println("\tServiceUrl:\t" + serviceUrl);
  System.out.println("\tTopic:\t" + inputTopic);
  System.out.println("\tSubscription:\t" + subscription);

  SparkConf sparkConf = new SparkConf().setAppName("Pulsar Spark Example");

  JavaStreamingContext jsc = new JavaStreamingContext(sparkConf, Durations.seconds(60));

  ConsumerConfigurationData<byte[]> pulsarConf = new ConsumerConfigurationData();

  Set<String> set = new HashSet<>();
  set.add(inputTopic);
  pulsarConf.setTopicNames(set);
  pulsarConf.setSubscriptionName(subscription);

  SparkStreamingPulsarReceiver pulsarReceiver = new SparkStreamingPulsarReceiver(
      serviceUrl,
      pulsarConf,
      new AuthenticationDisabled());

  JavaReceiverInputDStream<byte[]> lineDStream = jsc.receiverStream(pulsarReceiver);
  JavaPairDStream<String, Integer> result = lineDStream.flatMap(x -> {
      String line = new String(x, StandardCharsets.UTF_8);
      List<String> list = Arrays.asList(line.split(" "));
      return list.iterator();
    })
      .mapToPair(x -> new Tuple2<String, Integer>(x, 1))
      .reduceByKey((x, y) -> x + y);

  result.print();

  jsc.start();
  jsc.awaitTermination();
}
 
Example 19
Source File: StateLess.java    From sparkResearch with Apache License 2.0 4 votes vote down vote up
public static void main(String[] args) {
    SparkConf sparkConf = new SparkConf().setMaster("local[2]").setAppName("StateLess");

    JavaStreamingContext streamingContext = new JavaStreamingContext(sparkConf, Durations.seconds(1));

    JavaReceiverInputDStream<String> inputDStream = streamingContext.socketTextStream("localhost", 8080);

    JavaDStream<String> dStream = inputDStream.flatMap((FlatMapFunction<String, String>) s -> Arrays.asList(SPACE.split(s)).iterator());

    JavaPairDStream<String, Integer> pairDStream = dStream.mapToPair(new LogTuple());

    JavaPairDStream<String, Integer> result = pairDStream.reduceByKey(new ReduceIsKey());

    //JOIN
    JavaPairDStream<String, Integer> pairDStream1 = dStream.mapToPair(new LogTuple());

    JavaPairDStream<String, Integer> result1 = pairDStream.reduceByKey(new ReduceIsKey());

    JavaPairDStream<String, Tuple2<Integer, Integer>> c = result.join(result);


    result.foreachRDD(rdd -> {
        rdd.foreachPartition(partitionOfRecords -> {
            Connection connection = ConnectionPool.getConnection();
            Tuple2<String, Integer> wordCount;
            while (partitionOfRecords.hasNext()) {
                wordCount = partitionOfRecords.next();
                String sql = "insert into wordcount(word,count) " + "values('" + wordCount._1 + "',"
                        + wordCount._2 + ")";
                Statement stmt = connection.createStatement();
                stmt.executeUpdate(sql);
            }
            ConnectionPool.returnConnection(connection);
        });
    });

    try {
        streamingContext.start();
        streamingContext.awaitTermination();
        streamingContext.close();
    } catch (InterruptedException e) {
        e.printStackTrace();
    }

}
 
Example 20
Source File: SparkStreamingFromFlumeExampleOld.java    From SparkOnALog with Apache License 2.0 2 votes vote down vote up
public static void main(String[] args) {
	if (args.length == 0) {
		System.err
				.println("Usage: JavaFlumeEventCount <master> <host> <port> <nameOfJar>");
		System.exit(1);
	}

	String master = args[0];
	String host = args[1];
	int port = Integer.parseInt(args[2]);
	String nameOfJar = args[3];

	Duration batchInterval = new Duration(2000);

	System.out.println("-Starting Spark Context");
	
	
	JavaStreamingContext sc = new JavaStreamingContext(master,
			"FlumeEventCount", batchInterval, master, nameOfJar);

	//sc.ssc()
	
	//JavaDStream<SparkFlumeEvent> flumeStream = sc.flumeStream("localhost",
	//		port);

	System.out.println("-Setting up Flume Stream");
	
	JavaDStream<SparkFlumeEvent> flumeStream = FlumeUtils.createStream(sc, host, port);
	
	//flumeStream.count();

	System.out.println("-count.map");
	
	flumeStream.count().map(new Function<Long, String>() {
		@Override
		public String call(Long in) {
			return "Received " + in + " flume events.";
		}
	}).print();

	System.out.println("-Starting Spark Context");
	
	sc.start();
	
	System.out.println("-Finished");
}