Java Code Examples for org.apache.spark.streaming.api.java.JavaDStream.foreachRDD()

The following are Java code examples showing how to use the foreachRDD() method of the org.apache.spark.streaming.api.java.JavaDStream class. The examples are ordered by vote count, most popular first.
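Before the project examples, here is a minimal sketch of the basic pattern, assuming Spark Streaming 2.x, a local master, and a placeholder input directory chosen only for illustration; the function passed to foreachRDD() runs on the driver once per batch and receives that batch's data as a JavaRDD.

import org.apache.spark.SparkConf;
import org.apache.spark.streaming.Durations;
import org.apache.spark.streaming.api.java.JavaDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;

public class ForeachRDDSketch {
  public static void main(String[] args) throws InterruptedException {
    SparkConf conf = new SparkConf().setMaster("local[2]").setAppName("foreachRDD-sketch");
    JavaStreamingContext jssc = new JavaStreamingContext(conf, Durations.seconds(10));

    // Any DStream works here; a text file stream keeps the sketch self-contained.
    JavaDStream<String> lines = jssc.textFileStream("/tmp/stream-input");

    // Registered once, then executed by Spark for every batch.
    lines.foreachRDD(rdd -> System.out.println("Records in this batch: " + rdd.count()));

    jssc.start();
    jssc.awaitTermination();
  }
}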
Example 1
Project: gcp   File: Spark4KafkaNew.java   Source Code and License Vote up 11 votes
public static void main(String[] args) throws InterruptedException {
  SparkConf sc = new SparkConf().setAppName("POC-Kafka-New");
  
  try(JavaStreamingContext jsc = new JavaStreamingContext(sc, new Duration(2000))) {
    
    JavaPairInputDStream<String, String> stream = KafkaUtils.createDirectStream(
        jsc, String.class, String.class, StringDecoder.class, StringDecoder.class,
        Collections.singletonMap("metadata.broker.list", KAFKA_HOST_PORT),
        Collections.singleton(EXAMPLE_TOPIC));

    JavaDStream<ExampleXML> records = stream.map(t -> t._2()).map(new ParseXML());
    records.foreachRDD(rdd -> System.out.printf("Amount of XMLs: %d\n", rdd.count()));

    jsc.start();
    jsc.awaitTermination();
  }
}
 
Example 2
Project: gcp   File: Spark2Streaming.java   Source Code and License Vote up 10 votes
public static void main(String[] args) throws InterruptedException {
  SparkConf sc = new SparkConf().setAppName("POC-Streaming");
  try(JavaStreamingContext jsc = new JavaStreamingContext(sc, new Duration(2000))) {
    //JavaDStream<SampleXML> records = jsc.textFileStream("input/").map(new ParseXML());
    //textFileStream processes files line by line, so the XML would have to fit on a single line; the alternative below is used instead

    JavaRDD<String> files = jsc.sparkContext().wholeTextFiles("input/").map(tuple -> tuple._2());
    Queue<JavaRDD<String>> rddQueue = new LinkedList<>();
    rddQueue.add(files);
    JavaDStream<String> records = jsc.queueStream(rddQueue);

    records.foreachRDD(rdd -> System.out.printf("Amount of XMLs: %d\n", rdd.count()));

    jsc.start();
    jsc.awaitTermination();
  }
}
 
Example 3
Project: gcp   File: Spark6BigQuery.java   Source Code and License Vote up 8 votes
public static void main(String[] args) throws InterruptedException, IOException {
  SparkConf sc = new SparkConf().setAppName("POC-BigQuery");
  
  try(JavaStreamingContext jsc = new JavaStreamingContext(sc, new Duration(60000))) {
    JavaPairInputDStream<String, String> stream = KafkaUtils.createDirectStream(
        jsc, String.class, String.class, StringDecoder.class, StringDecoder.class,
        Collections.singletonMap("metadata.broker.list", KAFKA_HOST_PORT), Collections.singleton(EXAMPLE_TOPIC));

    Configuration conf = new Configuration();
    BigQueryConfiguration.configureBigQueryOutput(conf, BQ_EXAMPLE_TABLE, BQ_EXAMPLE_SCHEMA);
    conf.set("mapreduce.job.outputformat.class", BigQueryOutputFormat.class.getName());

    JavaDStream<ExampleXML> records = stream.map(t -> t._2()).map(new ParseXML());
    records.foreachRDD(rdd -> {
      System.out.printf("Amount of XMLs: %d\n", rdd.count());
      long time = System.currentTimeMillis();
      rdd.mapToPair(new PrepToBQ()).saveAsNewAPIHadoopDataset(conf);
      System.out.printf("Sent to BQ in %fs\n", (System.currentTimeMillis()-time)/1000f);
    });
    
    jsc.start();
    jsc.awaitTermination();
  }
}
 
Example 4
Project: SparkToParquet   File: AppMain.java   Source Code and License Vote up 8 votes
public static void main(String[] args) throws IOException {
	Flags.setFromCommandLineArgs(THE_OPTIONS, args);

	// Initialize the Spark Conf.
	SparkConf conf = new SparkConf().setAppName("A SECTONG Application: Apache Log Analysis with Spark");
	JavaSparkContext sc = new JavaSparkContext(conf);
	JavaStreamingContext jssc = new JavaStreamingContext(sc, Flags.getInstance().getSlideInterval());
	SQLContext sqlContext = new SQLContext(sc);

	// Initialize parameters
	HashSet<String> topicsSet = new HashSet<String>(Arrays.asList(Flags.getInstance().getKafka_topic().split(",")));
	HashMap<String, String> kafkaParams = new HashMap<String, String>();
	kafkaParams.put("metadata.broker.list", Flags.getInstance().getKafka_broker());

	// Get data from the Kafka stream
	JavaPairInputDStream<String, String> messages = KafkaUtils.createDirectStream(jssc, String.class, String.class,
			StringDecoder.class, StringDecoder.class, kafkaParams, topicsSet);

	JavaDStream<String> lines = messages.map(new Function<Tuple2<String, String>, String>() {
		private static final long serialVersionUID = 5266880065425088203L;

		public String call(Tuple2<String, String> tuple2) {
			return tuple2._2();
		}
	});

	JavaDStream<ApacheAccessLog> accessLogsDStream = lines.flatMap(line -> {
		List<ApacheAccessLog> list = new ArrayList<>();
		try {
			// Map each line
			list.add(ApacheAccessLog.parseFromLogLine(line));
			return list;
		} catch (RuntimeException e) {
			return list;
		}
	}).cache();

	accessLogsDStream.foreachRDD(rdd -> {

		// rdd to DataFrame
		DataFrame df = sqlContext.createDataFrame(rdd, ApacheAccessLog.class);
		// Write out as Parquet files
		df.write().partitionBy("ipAddress", "method", "responseCode").mode(SaveMode.Append).parquet(Flags.getInstance().getParquetFile());

		return null;
	});

	// Start the streaming server
	jssc.start(); // Start the computation
	jssc.awaitTermination(); // Wait for the computation to terminate
}
 
Example 5
Project: jMetalSP   File: SimpleSparkStreamingCounterDataSource.java   Source Code and License Vote up 6 votes
@Override
public void run() {
	JMetalLogger.logger.info("Run method in the streaming data source invoked") ;
	JMetalLogger.logger.info("Directory: " + directoryName) ;

	JavaDStream<Integer> time = streamingContext
			.textFileStream(directoryName)
			.map(line -> Integer.parseInt(line)) ;

	time.foreachRDD(numbers -> {
		List<Integer> numberList = numbers.collect() ;
		for (Integer number : numberList) {
			System.out.println(number) ;
			observable.setChanged();
			observable.notifyObservers(new SingleObservedData<Integer>(number));
		}
	}) ;
}
 
Example 6
Project: net.jgp.labs.spark   File: StreamingIngestionFileSystemTextFileToDataframeMultipleClassesApp.java   Source Code and License Vote up 6 votes
private void start() {
	// Create a local StreamingContext with two working threads and a batch interval of
	// 5 seconds
	SparkConf conf = new SparkConf().setMaster("local[2]").setAppName("Streaming Ingestion File System Text File to Dataframe");
	JavaStreamingContext jssc = new JavaStreamingContext(conf, Durations.seconds(5));

	JavaDStream<String> msgDataStream = jssc.textFileStream(StreamingUtils.getInputDirectory());

	msgDataStream.print();
	// Create JavaRDD<Row>
	msgDataStream.foreachRDD(new RowProcessor());	

	jssc.start();
	try {
		jssc.awaitTermination();
	} catch (InterruptedException e) {
		// TODO Auto-generated catch block
		e.printStackTrace();
	}
}
 
Example 7
Project: nats-connector-spark   File: SparkToNatsConnectorPool.java   Source Code and License Vote up 6 votes
/**
 * @param stream, the Spark Stream to publish to NATS
 * @param dataEncoder, the function used to encode the Spark Stream Records into the NATS Message Payloads
 */
public <V extends Object> void publishToNats(final JavaDStream<V> stream, final Function<V, byte[]> dataEncoder) {
	logger.trace("publishToNats(JavaDStream<String> stream)");
	stream.foreachRDD((VoidFunction<JavaRDD<V>>) rdd -> {
		logger.trace("stream.foreachRDD");
		rdd.foreachPartitionAsync(objects -> {
			logger.trace("rdd.foreachPartition");
			final SparkToNatsConnector<?> connector = getConnector();
			while(objects.hasNext()) {
				final V obj = objects.next();
				logger.trace("Will publish {}", obj);
				connector.publishToNats(dataEncoder.apply(obj));
			}
			returnConnector(connector);  // return to the pool for future reuse
		});
	});
}
 
Example 8
Project: kite-apps   File: KafkaOutput.java   Source Code and License Vote up 6 votes
/**
 * Writes the content of the stream to the Kafka topic
 * behind this producer.
 */
@edu.umd.cs.findbugs.annotations.SuppressWarnings(
    value="SE_INNER_CLASS", justification="Uses state from outer class.")
public void write (JavaDStream<T> stream) {

  stream.foreachRDD(new Function<JavaRDD<T>, Void>() {
    @Override
    public Void call(JavaRDD<T> rdd) throws Exception {

      write(rdd);

      return null;
    }
  });
}
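For comparison only: the Function<JavaRDD<T>, Void> form used above comes from older Spark releases. Below is a hedged sketch of the same registration written against the VoidFunction overload available in newer Spark Streaming APIs, with a simple count standing in for the project's write(rdd) helper, which is not shown in this excerpt.

import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.function.VoidFunction;
import org.apache.spark.streaming.api.java.JavaDStream;

public class ForeachRDDVoidFunctionSketch {
  // Registers a per-batch action without the boilerplate "return null".
  public static <T> void logBatchSizes(JavaDStream<T> stream) {
    stream.foreachRDD(new VoidFunction<JavaRDD<T>>() {
      @Override
      public void call(JavaRDD<T> rdd) {
        // Placeholder action; a real job would write or forward the batch here.
        System.out.println("Batch size: " + rdd.count());
      }
    });
  }
}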
 
Example 9
Project: Apache-Spark-2x-for-Java-Developers   File: FileStreamingEx.java   Source Code and License Vote up 5 votes
public static void main(String[] args) {
   	//Windows-specific property if Hadoop is not installed or HADOOP_HOME is not set
	 System.setProperty("hadoop.home.dir", "E:\\hadoop");
   	//Logger rootLogger = LogManager.getRootLogger();
  		//rootLogger.setLevel(Level.WARN); 
       SparkConf conf = new SparkConf().setAppName("KafkaExample").setMaster("local[*]");
       String inputDirectory="E:\\hadoop\\streamFolder\\";
    
       JavaSparkContext sc = new JavaSparkContext(conf);
       JavaStreamingContext streamingContext = new JavaStreamingContext(sc, Durations.seconds(1));
      // streamingContext.checkpoint("E:\\hadoop\\checkpoint");
       Logger rootLogger = LogManager.getRootLogger();
  		rootLogger.setLevel(Level.WARN); 
  		
  		JavaDStream<String> streamfile = streamingContext.textFileStream(inputDirectory);
  		streamfile.print();
  		streamfile.foreachRDD(rdd-> rdd.foreach(x -> System.out.println(x)));
  		
  			   		
  		JavaPairDStream<LongWritable, Text> streamedFile = streamingContext.fileStream(inputDirectory, LongWritable.class, Text.class, TextInputFormat.class);
  	 streamedFile.print();
  		
  	 streamingContext.start();
  	 

       try {
		streamingContext.awaitTermination();
	} catch (InterruptedException e) {
		// TODO Auto-generated catch block
		e.printStackTrace();
	}
}
 
Example 10
Project: gcp   File: Spark3Kafka.java   Source Code and License Vote up 5 votes
public static void main(String[] args) throws InterruptedException {
  SparkConf sc = new SparkConf().setAppName("POC-Kafka");
  
  try(JavaStreamingContext jsc = new JavaStreamingContext(sc, new Duration(2000))) {
    
    JavaPairReceiverInputDStream<String, String> stream = KafkaUtils.createStream(
        jsc, ZK_HOST_PORT, "a_group_id", Collections.singletonMap(EXAMPLE_TOPIC, 1));

    JavaDStream<ExampleXML> records = stream.map(t -> t._2()).map(new ParseXML());
    records.foreachRDD(rdd -> System.out.printf("Amount of XMLs: %d\n", rdd.count()));

    jsc.start();
    jsc.awaitTermination();
  }
}
 
Example 11
Project: nats-connector-spark   File: AbstractNatsToSparkTest.java   Source Code and License Vote up 5 votes
protected void validateTheReceptionOfMessages(JavaStreamingContext ssc,
		JavaReceiverInputDStream<String> stream) throws InterruptedException {
	JavaDStream<String> messages = stream.repartition(3);

	ExecutorService executor = Executors.newFixedThreadPool(6);

	final int nbOfMessages = 5;
	NatsPublisher np = getNatsPublisher(nbOfMessages);
	
	if (logger.isDebugEnabled()) {
		messages.print();
	}
	
	messages.foreachRDD(new VoidFunction<JavaRDD<String>>() {
		private static final long serialVersionUID = 1L;

		@Override
		public void call(JavaRDD<String> rdd) throws Exception {
			logger.debug("RDD received: {}", rdd.collect());
			
			final long count = rdd.count();
			if ((count != 0) && (count != nbOfMessages)) {
				rightNumber = false;
				logger.error("The number of messages received should have been {} instead of {}.", nbOfMessages, count);
			}
			
			TOTAL_COUNT.getAndAdd((int) count);
			
			atLeastSomeData = atLeastSomeData || (count > 0);
			
			for (String str : rdd.collect()) {
				if (! str.startsWith(NatsPublisher.NATS_PAYLOAD)) {
					payload = str;
				}
			}
		}			
	});
	
	closeTheValidation(ssc, executor, nbOfMessages, np);		
}
 
Example 12
Project: nats-connector-spark   File: AbstractNatsToSparkTest.java   Source Code and License Vote up 5 votes
protected void validateTheReceptionOfIntegerMessages(JavaStreamingContext ssc, 
		JavaReceiverInputDStream<Integer> stream) throws InterruptedException {
	JavaDStream<Integer> messages = stream.repartition(3);

	ExecutorService executor = Executors.newFixedThreadPool(6);

	final int nbOfMessages = 5;
	NatsPublisher np = getNatsPublisher(nbOfMessages);
	
	if (logger.isDebugEnabled()) {
		messages.print();
	}
	
	messages.foreachRDD(new VoidFunction<JavaRDD<Integer>>() {
		private static final long serialVersionUID = 1L;

		@Override
		public void call(JavaRDD<Integer> rdd) throws Exception {
			logger.debug("RDD received: {}", rdd.collect());
			
			final long count = rdd.count();
			if ((count != 0) && (count != nbOfMessages)) {
				rightNumber = false;
				logger.error("The number of messages received should have been {} instead of {}.", nbOfMessages, count);
			}
			
			TOTAL_COUNT.getAndAdd((int) count);
			
			atLeastSomeData = atLeastSomeData || (count > 0);
			
			for (Integer value : rdd.collect()) {
				if (value < NatsPublisher.NATS_PAYLOAD_INT) {
					payload = value.toString();
				}
			}
		}			
	});
	
	closeTheValidation(ssc, executor, nbOfMessages, np);
}
 
Example 13
Project: StreamBench   File: StreamKMeans.java   Source Code and License Vote up 5 votes
public static void main(String[] args) {

//        String inputFile = StreamKMeans.class.getClassLoader().getResource("centroids.txt").getFile();
        SparkConf sparkConf = new SparkConf().setMaster("spark://master:7077").setAppName("JavaKMeans");

        JavaStreamingContext jssc = new JavaStreamingContext(sparkConf, Durations.milliseconds(1000));

        HashSet<String> topicsSet = new HashSet<>();
        topicsSet.add("KMeans");
        HashMap<String, String> kafkaParams = new HashMap<>();
//        kafkaParams.put("metadata.broker.list", "kafka1:9092,kafka2:9092,kafka3:9092");
        kafkaParams.put("metadata.broker.list", "localhost:9092");
        kafkaParams.put("auto.offset.reset", "largest");
        kafkaParams.put("zookeeper.connect", "zoo1:2181");
        kafkaParams.put("group.id", "spark");

        // Create direct kafka stream with brokers and topics
        JavaPairInputDStream<String, String> lines = KafkaUtils.createDirectStream(
                jssc,
                String.class,
                String.class,
                StringDecoder.class,
                StringDecoder.class,
                kafkaParams,
                topicsSet
        );

        JavaDStream<Vector> points = lines.map(new ParseKafkaString()).map(new ParsePoint());

        Vector[] initCentroids = loadInitCentroids();
        double[] weights = new double[96];
        for (int i = 0; i < 96; i++) {
            weights[i] = 1.0 / 96;
        }

        final StreamingKMeans model = new StreamingKMeans()
                .setK(96)
                .setDecayFactor(0)
                .setInitialCenters(initCentroids, weights);

        model.trainOn(points);

        points.foreachRDD(new Function2<JavaRDD<Vector>, Time, Void>() {
            @Override
            public Void call(JavaRDD<Vector> vectorJavaRDD, Time time) throws Exception {
                Vector[] vector = model.latestModel().clusterCenters();
                for (int i = 0; i < vector.length; i++) {
                    logger.warn(vector[i].toArray()[0] + "\t" + vector[i].toArray()[1]);
                }
                return null;
            }
        });

        jssc.addStreamingListener(new PerformanceStreamingListener());
        jssc.start();
        jssc.awaitTermination();
    }
 
Example 14
Project: kite-apps   File: SparkDatasets.java   Source Code and License Vote up 5 votes
/**
 * Save all RDDs in the given DStream to the given view.
 * @param dstream
 * @param view
 */
public static <T> void save(JavaDStream<T> dstream, final View<T> view) {

  final String uri = view.getUri().toString();

  dstream.foreachRDD(new Function2<JavaRDD<T>, Time, Void>() {
    @Override
    public Void call(JavaRDD<T> rdd, Time time) throws Exception {

      save(rdd, uri);

      return null;
    }
  });
}
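As with Example 8, the Function2<JavaRDD<T>, Time, Void> form above is the older signature. A hedged sketch of the equivalent registration with the VoidFunction2 overload that newer Spark versions provide when the batch time is needed; the save(rdd, uri) call from the example is replaced by a placeholder print because that helper is not part of this excerpt.

import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.function.VoidFunction2;
import org.apache.spark.streaming.Time;
import org.apache.spark.streaming.api.java.JavaDStream;

public class ForeachRDDWithTimeSketch {
  // Receives both the batch RDD and the batch time, without returning Void.
  public static <T> void logBatches(JavaDStream<T> stream) {
    stream.foreachRDD(new VoidFunction2<JavaRDD<T>, Time>() {
      @Override
      public void call(JavaRDD<T> rdd, Time time) {
        // Placeholder action; a real job would persist the batch keyed by its time.
        System.out.println("Batch at " + time + " has " + rdd.count() + " records");
      }
    });
  }
}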
 
Example 15
Project: Decision   File: StreamingContextConfiguration.java   Source Code and License Vote up 5 votes
private void configureDataContext(JavaStreamingContext context) {
    Map<String, Integer> baseTopicMap = new HashMap<>();


    configurationContext.getDataTopics().forEach( dataTopic -> baseTopicMap.put(dataTopic, 1));

    kafkaTopicService.createTopicsIfNotExist(configurationContext.getDataTopics(), configurationContext
            .getKafkaReplicationFactor(), configurationContext.getKafkaPartitions());

    HashMap<String, String> kafkaParams = new HashMap<>();
    kafkaParams.put("zookeeper.connect", configurationContext.getZookeeperHostsQuorumWithPath());
    kafkaParams.put("group.id", configurationContext.getGroupId());
     /*
     groupId must be the cluster groupId. Kafka assigns each partition of a topic to one, and only one, consumer of
     the group.
     Decision topics have only one partition (by default), so if we have two or more Decision instances (consumers) reading the
     same topic with the same groupId, only one instance will be able to read from the topic.
     */
    JavaPairDStream<String, byte[]> messages = KafkaUtils.createStream(context, String.class, byte[].class,
            kafka.serializer.StringDecoder.class, kafka.serializer.DefaultDecoder.class, kafkaParams, baseTopicMap,
            StorageLevel.MEMORY_AND_DISK_SER());

    AvroDeserializeMessageFunction avroDeserializeMessageFunction = new AvroDeserializeMessageFunction();
    JavaDStream<StratioStreamingMessage>  insertRequests = messages.filter(
            new FilterAvroMessagesByOperationFunction(STREAM_OPERATIONS.MANIPULATION.INSERT))
            .map(avroDeserializeMessageFunction);

    InsertIntoStreamFunction insertIntoStreamFunction = new InsertIntoStreamFunction(streamOperationService,
            configurationContext.getZookeeperHostsQuorum());
    insertRequests.foreachRDD(insertIntoStreamFunction);

}
 
Example 16
Project: Test_Projects   File: StreamingKafka101.java   Source Code and License Vote up 5 votes
public static void main(String[] args) {
Logger.getLogger("org").setLevel(Level.WARN);
Logger.getLogger("akka").setLevel(Level.WARN);

SparkConf sparkConf = new SparkConf().setMaster("spark://10.204.100.206:7077").setAppName("StreamingKafka101");
sparkConf.setJars(new String[] { "target\\TestProjects-1.0-SNAPSHOT.jar" });
	
//sparkConf.setExecutorEnv("executor-memory", "8G");
//sparkConf.setExecutorEnv("spark.executor.memory", "8G");
sparkConf.set("spark.executor.memory", "4G");
//sparkConf.set("executor-memory", "8G");
		
int duration = 2;
if(args.length > 0){
 try{
  duration = Integer.parseInt(args[0]);
  System.out.println("duration changed to " + duration);
 }catch(Exception e){
  System.out.println("Duration reset to defaults");
 }
}

JavaStreamingContext ssc = new JavaStreamingContext(sparkConf, Durations.seconds(duration));
	
  
Map<String, Integer> topicMap = new HashMap<String, Integer>();
topicMap.put("loadtest", 4);
JavaPairReceiverInputDStream<String, String> kafkaStream = KafkaUtils.createStream(ssc,"10.204.100.172:2182","kafka-group1",topicMap);
  
JavaDStream<String> lines = kafkaStream.map(new Function<Tuple2<String, String>, String>() {
    @Override
    public String call(Tuple2<String, String> tuple2) {
      return tuple2._2();
    }
});
  
lines.foreachRDD(new Function<JavaRDD<String>, Void>() {
 @Override
 public Void call(JavaRDD<String> rdd) throws Exception {
  System.out.println(new Date() + "  Total records read: " + rdd.count() );
  return null;
 }
});
  	
ssc.start();
ssc.awaitTermination();
}
 
Example 17
Project: kafka-spark-consumer   File: ProcessedOffsetManager.java   Source Code and License Vote up 5 votes
@SuppressWarnings("deprecation")
public static void persists(DStream<Tuple2<Integer, Iterable<Long>>> partitonOffset, Properties props) {
  ClassTag<Tuple2<Integer, Iterable<Long>>> tuple2ClassTag = 
      ScalaUtil.<Integer, Iterable<Long>>getTuple2ClassTag();
  JavaDStream<Tuple2<Integer, Iterable<Long>>> jpartitonOffset = 
      new JavaDStream<Tuple2<Integer, Iterable<Long>>>(partitonOffset, tuple2ClassTag);
  jpartitonOffset.foreachRDD(new VoidFunction<JavaRDD<Tuple2<Integer, Iterable<Long>>>>() {
    @Override
    public void call(JavaRDD<Tuple2<Integer, Iterable<Long>>> po) throws Exception {
      List<Tuple2<Integer, Iterable<Long>>> poList = po.collect();
      doPersists(poList, props);
    }
  });
}
 
Example 18
Project: spring-xd-samples   File: Logger.java   Source Code and License Vote up 5 votes
@Override
public JavaDStream<String> process(JavaDStream<String> input) {
	input.foreachRDD(rdd -> {
		rdd.foreachPartition(
				items -> {
					while (items.hasNext()) {
						System.out.println(items.next() + System.lineSeparator());
					}
				});
		return null;
	});
	return null;
}
 
Example 19
Project: Apache-Spark-2x-for-Java-Developers   File: KafkaExample.java   Source Code and License Vote up 4 votes
public static void main(String[] args) {
  	//Windows-specific property if Hadoop is not installed or HADOOP_HOME is not set
  	//Window Specific property if Hadoop is not instaalled or HADOOP_HOME is not set
 System.setProperty("hadoop.home.dir", "E:\\hadoop");
  	//Logger rootLogger = LogManager.getRootLogger();
 		//rootLogger.setLevel(Level.WARN); 
      SparkConf conf = new SparkConf().setAppName("KafkaExample").setMaster("local[*]");    
      JavaSparkContext sc = new JavaSparkContext(conf);
      JavaStreamingContext streamingContext = new JavaStreamingContext(sc, Durations.minutes(2));
      streamingContext.checkpoint("E:\\hadoop\\checkpoint");
      Logger rootLogger = LogManager.getRootLogger();
 		rootLogger.setLevel(Level.WARN); 
      Map<String, Object> kafkaParams = new HashMap<>();
      kafkaParams.put("bootstrap.servers", "10.0.75.1:9092");
      kafkaParams.put("key.deserializer", StringDeserializer.class);
      kafkaParams.put("value.deserializer", StringDeserializer.class);
      kafkaParams.put("group.id", "use_a_separate_group_id_for_each_strea");
      kafkaParams.put("auto.offset.reset", "latest");
     // kafkaParams.put("enable.auto.commit", false);

      Collection<String> topics = Arrays.asList("mytopic", "anothertopic");

      final JavaInputDStream<ConsumerRecord<String, String>> stream = KafkaUtils.createDirectStream(streamingContext,LocationStrategies.PreferConsistent(),
      				ConsumerStrategies.<String, String>Subscribe(topics, kafkaParams));

      JavaPairDStream<String, String> pairRDD = stream.mapToPair(record-> new Tuple2<>(record.key(), record.value()));
     
      pairRDD.foreachRDD(pRDD-> { pRDD.foreach(tuple-> System.out.println(new Date()+" :: Kafka msg key ::"+tuple._1() +" the val is ::"+tuple._2()));});
     
      JavaDStream<String> tweetRDD = pairRDD.map(x-> x._2()).map(new TweetText());
      
      tweetRDD.foreachRDD(tRDD -> tRDD.foreach(x->System.out.println(new Date()+" :: "+x)));
      
     JavaDStream<String> hashtagRDD = tweetRDD.flatMap(twt-> Arrays.stream(twt.split(" ")).filter(str-> str.contains("#")).collect(Collectors.toList()).iterator() );
 
      hashtagRDD.foreachRDD(tRDD -> tRDD.foreach(x->System.out.println(x)));
      
      JavaPairDStream<String, Long> cntByVal = hashtagRDD.countByValue();
      
      cntByVal.foreachRDD(tRDD -> tRDD.foreach(x->System.out.println(new Date()+" ::The count tag is ::"+x._1() +" and the val is ::"+x._2())));
      
     /* hashtagRDD.window(Durations.seconds(60), Durations.seconds(30))
                .countByValue()
               .foreachRDD(tRDD -> tRDD.foreach(x->System.out.println(new Date()+" ::The window count tag is ::"+x._1() +" and the val is ::"+x._2())));
      
     hashtagRDD.countByValueAndWindow(Durations.seconds(60), Durations.seconds(30))
               .foreachRDD(tRDD -> tRDD.foreach(x->System.out.println("The window&count tag is ::"+x._1() +" and the val is ::"+x._2())));
      */
     hashtagRDD.window(Durations.minutes(8)).countByValue()
     .foreachRDD(tRDD -> tRDD.foreach(x->System.out.println(new Date()+" ::The window count tag is ::"+x._1() +" and the val is ::"+x._2())));
     hashtagRDD.window(Durations.minutes(8),Durations.minutes(2)).countByValue()
     .foreachRDD(tRDD -> tRDD.foreach(x->System.out.println(new Date()+" ::The window count tag is ::"+x._1() +" and the val is ::"+x._2())));
     hashtagRDD.window(Durations.minutes(12),Durations.minutes(8)).countByValue()
     .foreachRDD(tRDD -> tRDD.foreach(x->System.out.println(new Date()+" ::The window count tag is ::"+x._1() +" and the val is ::"+x._2())));
     hashtagRDD.window(Durations.minutes(2),Durations.minutes(2)).countByValue()
     .foreachRDD(tRDD -> tRDD.foreach(x->System.out.println(new Date()+" ::The window count tag is ::"+x._1() +" and the val is ::"+x._2())));
     hashtagRDD.window(Durations.minutes(12),Durations.minutes(12)).countByValue()
     .foreachRDD(tRDD -> tRDD.foreach(x->System.out.println(new Date()+" ::The window count tag is ::"+x._1() +" and the val is ::"+x._2())));
     
     /*hashtagRDD.window(Durations.minutes(5),Durations.minutes(2)).countByValue()
     .foreachRDD(tRDD -> tRDD.foreach(x->System.out.println(new Date()+" ::The window count tag is ::"+x._1() +" and the val is ::"+x._2())));*/
     /* hashtagRDD.window(Durations.minutes(10),Durations.minutes(1)).countByValue()
     .foreachRDD(tRDD -> tRDD.foreach(x->System.out.println(new Date()+" ::The window count tag is ::"+x._1() +" and the val is ::"+x._2())));*/
     
      streamingContext.start();
      try {
	streamingContext.awaitTermination();
} catch (InterruptedException e) {
	// TODO Auto-generated catch block
	e.printStackTrace();
}
  }
 
Example 20
Project: Sparkathon   File: SQLonStreams.java   Source Code and License Vote up 4 votes
public static void main(String[] args) throws Exception {
    Logger.getLogger("org").setLevel(Level.WARN);
    Logger.getLogger("akka").setLevel(Level.WARN);

    final Pattern SPACE = Pattern.compile(" ");

    SparkConf conf = new SparkConf().setAppName("Big Apple").setMaster("local[2]");
    JavaStreamingContext ssc = new JavaStreamingContext(conf, Durations.seconds(1));

    JavaDStream<String> lines = ssc.textFileStream("src/main/resources/stream");
    lines.print();

    JavaDStream<String> words = lines.flatMap(new FlatMapFunction<String, String>() {
        @Override
        public Iterator<String> call(String x) {
            return Lists.newArrayList(SPACE.split(x)).iterator();
        }
    });

    words.foreachRDD(
            new VoidFunction2<JavaRDD<String>, Time>() {
                @Override
                public void call(JavaRDD<String> rdd, Time time) {

                    // Get the singleton instance of SQLContext
                    SQLContext sqlContext = SQLContext.getOrCreate(rdd.context());

                    // Convert RDD[String] to RDD[case class] to Dataset
                    JavaRDD<JavaRecord> rowRDD = rdd.map(new Function<String, JavaRecord>() {
                        public JavaRecord call(String word) {
                            JavaRecord record = new JavaRecord();
                            record.setWord(word);
                            return record;
                        }
                    });
                    Dataset<Row> wordsDataset = sqlContext.createDataFrame(rowRDD, JavaRecord.class);

                    // Register as table
                    wordsDataset.registerTempTable("words");

                    // Do word count on table using SQL and print it
                    Dataset wordCountsDataset =
                            sqlContext.sql("select word, count(*) as total from words group by word");
                    wordCountsDataset.show();
                }
            }
    );


    ssc.start();
    ssc.awaitTermination();

}
 
Example 21
Project: envelope   File: Runner.java   Source Code and License Vote up 4 votes
/**
 * Run the Envelope pipeline as a Spark Streaming job.
 * @param steps The full configuration of the Envelope pipeline
 */
@SuppressWarnings("unchecked")
private static void runStreaming(final Set<Step> steps) throws Exception {
  final Set<Step> independentNonStreamingSteps = StepUtils.getIndependentNonStreamingSteps(steps);
  runBatch(independentNonStreamingSteps);

  Set<StreamingStep> streamingSteps = StepUtils.getStreamingSteps(steps);
  for (final StreamingStep streamingStep : streamingSteps) {
    LOG.debug("Setting up streaming step: " + streamingStep.getName());

    @SuppressWarnings("rawtypes")
    JavaDStream stream = streamingStep.getStream();

    final StructType streamSchema = streamingStep.getSchema();
    LOG.debug("Stream schema: " + streamSchema);

    stream.foreachRDD(new VoidFunction<JavaRDD<?>>() {
      @Override
      public void call(JavaRDD<?> raw) throws Exception {
        // Some independent steps might be repeating steps that have been flagged for reload
        StepUtils.resetRepeatingSteps(steps);
        // This will run any batch steps (and dependents) that are not submitted
        runBatch(independentNonStreamingSteps);
        
        streamingStep.stageProgress(raw);
        
        JavaRDD<Row> translated = streamingStep.translate(raw);
        
        Dataset<Row> batchDF = Contexts.getSparkSession().createDataFrame(translated, streamSchema);
        streamingStep.setData(batchDF);
        streamingStep.setSubmitted(true);

        Set<Step> allDependentSteps = StepUtils.getAllDependentSteps(streamingStep, steps);
        runBatch(allDependentSteps);

        StepUtils.resetDataSteps(allDependentSteps);
        
        streamingStep.recordProgress();
      }
    });

    LOG.debug("Finished setting up streaming step: " + streamingStep.getName());
  }

  JavaStreamingContext jsc = Contexts.getJavaStreamingContext();
  jsc.start();
  LOG.debug("Streaming context started");
  jsc.awaitTermination();
  LOG.debug("Streaming context terminated");
}
 
Example 22
Project: net.jgp.labs.spark   File: StreamingIngestionFileSystemTextFileToDataframeApp.java   Source Code and License Vote up 4 votes
private void start() {
	// Create a local StreamingContext with two working threads and a batch interval of
	// 5 seconds
	SparkConf conf = new SparkConf().setMaster("local[2]").setAppName("Streaming Ingestion File System Text File to Dataframe");
	JavaStreamingContext jssc = new JavaStreamingContext(conf, Durations.seconds(5));

	JavaDStream<String> msgDataStream = jssc.textFileStream(StreamingUtils.getInputDirectory());

	msgDataStream.print();
	// Create JavaRDD<Row>
	msgDataStream.foreachRDD(new VoidFunction<JavaRDD<String>>() {
		private static final long serialVersionUID = -590010339928376829L;

		@Override
		public void call(JavaRDD<String> rdd) {
			JavaRDD<Row> rowRDD = rdd.map(new Function<String, Row>() {
				private static final long serialVersionUID = 5167089361335095997L;

				@Override
				public Row call(String msg) {
					Row row = RowFactory.create(msg);
					return row;
				}
			});
			// Create Schema
			StructType schema = DataTypes.createStructType(
					new StructField[] { DataTypes.createStructField("Message", DataTypes.StringType, true) });
			
			// Get Spark 2.0 session
			SparkSession spark = JavaSparkSessionSingleton.getInstance(rdd.context().getConf());
			Dataset<Row> msgDataFrame = spark.createDataFrame(rowRDD, schema);
			msgDataFrame.show();
		}
	});

	jssc.start();
	try {
		jssc.awaitTermination();
	} catch (InterruptedException e) {
		// TODO Auto-generated catch block
		e.printStackTrace();
	}
}
 
Example 23
Project: iote2e   File: Iote2eRequestSparkConsumer.java   Source Code and License Vote up 4 votes
/**
   * Process.
   *
   * @param masterConfig the master config
   * @throws Exception the exception
   */
  public void process(MasterConfig masterConfig) throws Exception {
  	logger.info(masterConfig.toString());
  	String sparkAppName = masterConfig.getSparkAppName();
  	String sparkMaster = masterConfig.getSparkMaster();
  	Integer kafkaConsumerNumThreads = masterConfig.getKafkaConsumerNumThreads();
  	Integer sparkStreamDurationMs = masterConfig.getSparkStreamDurationMs();
  	String kafkaGroup = masterConfig.getKafkaGroup();
  	String kafkaTopic = masterConfig.getKafkaTopic();
  	String kafkaZookeeperHosts = masterConfig.getKafkaZookeeperHosts();
  	Integer kafkaZookeeperPort = masterConfig.getKafkaZookeeperPort();
  	String kafkaZookeeperBrokerPath = masterConfig.getKafkaZookeeperBrokerPath();
  	String kafkaConsumerId = masterConfig.getKafkaConsumerId();
  	String kafkaZookeeperConsumerConnection = masterConfig.getKafkaZookeeperConsumerConnection();
  	String kafkaZookeeperConsumerPath = masterConfig.getKafkaZookeeperConsumerPath();

      conf = new SparkConf()
              .setAppName(sparkAppName);
      if( sparkMaster != null && sparkMaster.length() > 0 ) conf.setMaster( sparkMaster );
      ssc = new JavaStreamingContext(conf, new Duration(sparkStreamDurationMs));

      Map<String, Integer> topicCountMap = new HashMap<String, Integer>();
      topicCountMap.put(kafkaTopic, new Integer(kafkaConsumerNumThreads));
      Properties kafkaProps = new Properties();
      kafkaProps.put("group.id", kafkaGroup);
      // Spark Kafka Consumer https://github.com/dibbhatt/kafka-spark-consumer
      kafkaProps.put("zookeeper.hosts", kafkaZookeeperHosts);
      kafkaProps.put("zookeeper.port", String.valueOf(kafkaZookeeperPort) );
      kafkaProps.put("zookeeper.broker.path", kafkaZookeeperBrokerPath );
      kafkaProps.put("kafka.topic", kafkaTopic);
      kafkaProps.put("kafka.consumer.id", kafkaConsumerId );
      kafkaProps.put("zookeeper.consumer.connection", kafkaZookeeperConsumerConnection);
      kafkaProps.put("zookeeper.consumer.path", kafkaZookeeperConsumerPath);
      // consumer optional 
      kafkaProps.put("consumer.forcefromstart", "false");
      kafkaProps.put("consumer.fetchsizebytes", "1048576");
      kafkaProps.put("consumer.fillfreqms", "200" );
      // kafkaProps.put("consumer.fillfreqms", String.valueOf(sparkStreamDurationMs) );
      kafkaProps.put("consumer.backpressure.enabled", "true");
      //kafkaProps.put("consumer.num_fetch_to_buffer", "10");
              
      kafkaProps.put( Config.KAFKA_PARTITIONS_NUMBER, 4 );
      
      kafkaProps.put("zookeeper.session.timeout.ms", "400");
      kafkaProps.put("zookeeper.sync.time.ms", "200");
      kafkaProps.put("key.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
      kafkaProps.put("value.deserializer", "org.apache.kafka.common.serialization.ByteArrayDeserializer");

      Iote2eRequestSparkProcessor streamProcessor = new Iote2eRequestSparkProcessor(masterConfig);
      
      int numberOfReceivers = 6;	
      
try {
	JavaDStream<MessageAndMetadata> unionStreams = ReceiverLauncher.launch(
			ssc, kafkaProps, numberOfReceivers, StorageLevel.MEMORY_ONLY());		
	unionStreams.foreachRDD(streamProcessor::processIote2eRequestRDD);
	logger.info("Starting Iote2eRequestSparkConsumer");
	ssc.start();
} catch( Exception e ) {
	logger.error(e.getMessage(),e);
	System.exit(8);
}

try {
	logger.info("Started Iote2eRequestSparkConsumer");
	started = true;
	ssc.awaitTermination();
   	logger.info("Stopped Spark");
} catch( InterruptedException e1 ) {
	logger.warn(e1.getMessage());
} catch( Exception e2 ) {
	logger.error(e2.getMessage(),e2);
	System.exit(8);
}

  }
 
Example 24
Project: iote2e   File: BdbbSparkConsumer.java   Source Code and License Vote up 4 votes
/**
   * Process.
   *
   * @param masterConfig the master config
   * @throws Exception the exception
   */
  public void process(MasterConfig masterConfig) throws Exception {
  	logger.info(masterConfig.toString());
  	String sparkAppNameBdbb = masterConfig.getSparkAppNameBdbb();
  	String sparkMaster = masterConfig.getSparkMaster();
  	Integer kafkaConsumerNumThreads = masterConfig.getKafkaConsumerNumThreads();
  	Integer sparkStreamDurationMs = masterConfig.getSparkStreamDurationMs();
  	String kafkaGroupBdbb = masterConfig.getKafkaGroupBdbb();
  	String kafkaTopicBdbb = masterConfig.getKafkaTopicBdbb();
  	String kafkaZookeeperHosts = masterConfig.getKafkaZookeeperHosts();
  	Integer kafkaZookeeperPort = masterConfig.getKafkaZookeeperPort();
  	String kafkaZookeeperBrokerPath = masterConfig.getKafkaZookeeperBrokerPath();
  	String kafkaConsumerId = masterConfig.getKafkaConsumerId();
  	String kafkaZookeeperConsumerConnection = masterConfig.getKafkaZookeeperConsumerConnection();
  	String kafkaZookeeperConsumerPath = masterConfig.getKafkaZookeeperConsumerPath();

      conf = new SparkConf()
              .setAppName(sparkAppNameBdbb);
      if( sparkMaster != null && sparkMaster.length() > 0 ) conf.setMaster( sparkMaster );
      ssc = new JavaStreamingContext(conf, new Duration(sparkStreamDurationMs));

      Map<String, Integer> topicCountMap = new HashMap<String, Integer>();
      topicCountMap.put(kafkaTopicBdbb, new Integer(kafkaConsumerNumThreads));
      Properties kafkaProps = new Properties();
      kafkaProps.put("group.id", kafkaGroupBdbb);
      // Spark Kafka Consumer https://github.com/dibbhatt/kafka-spark-consumer
      kafkaProps.put("zookeeper.hosts", kafkaZookeeperHosts);
      kafkaProps.put("zookeeper.port", String.valueOf(kafkaZookeeperPort) );
      kafkaProps.put("zookeeper.broker.path", kafkaZookeeperBrokerPath );
      kafkaProps.put("kafka.topic", kafkaTopicBdbb);
      kafkaProps.put("kafka.consumer.id", kafkaConsumerId );
      kafkaProps.put("zookeeper.consumer.connection", kafkaZookeeperConsumerConnection);
      kafkaProps.put("zookeeper.consumer.path", kafkaZookeeperConsumerPath);
      // consumer optional 
      kafkaProps.put("consumer.forcefromstart", "false");
      kafkaProps.put("consumer.fetchsizebytes", "1048576");
      kafkaProps.put("consumer.fillfreqms", "200" );
      // kafkaProps.put("consumer.fillfreqms", String.valueOf(sparkStreamDurationMs) );
      kafkaProps.put("consumer.backpressure.enabled", "true");
      //kafkaProps.put("consumer.num_fetch_to_buffer", "10");
              
      kafkaProps.put( Config.KAFKA_PARTITIONS_NUMBER, 4 );
      
      kafkaProps.put("zookeeper.session.timeout.ms", "400");
      kafkaProps.put("zookeeper.sync.time.ms", "200");
      kafkaProps.put("key.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
      kafkaProps.put("value.deserializer", "org.apache.kafka.common.serialization.ByteArrayDeserializer");

      BdbbSparkProcessor streamProcessor = new BdbbSparkProcessor(masterConfig);
      
      int numberOfReceivers = 6;	
      
try {
	JavaDStream<MessageAndMetadata> unionStreams = ReceiverLauncher.launch(
			ssc, kafkaProps, numberOfReceivers, StorageLevel.MEMORY_ONLY());		
	unionStreams.foreachRDD(streamProcessor::processBdbbRDD);
	logger.info("Starting BdbbSparkConsumer");
	ssc.start();
} catch( Exception e ) {
	logger.error(e.getMessage(),e);
	System.exit(8);
}

try {
	logger.info("Started BdbbSparkConsumer");
	started = true;
	ssc.awaitTermination();
   	logger.info("Stopped Spark");
} catch( InterruptedException e1 ) {
	logger.warn(e1.getMessage());
} catch( Exception e2 ) {
	logger.error(e2.getMessage(),e2);
	System.exit(8);
}

  }
 
Example 25
Project: iote2e   File: OmhSparkConsumer.java   Source Code and License Vote up 4 votes
/**
   * Process.
   *
   * @param masterConfig the master config
   * @throws Exception the exception
   */
  public void process(MasterConfig masterConfig) throws Exception {
  	logger.info(masterConfig.toString());
  	String sparkAppNameOmh = masterConfig.getSparkAppNameOmh();
  	String sparkMaster = masterConfig.getSparkMaster();
  	Integer kafkaConsumerNumThreads = masterConfig.getKafkaConsumerNumThreads();
  	Integer sparkStreamDurationMs = masterConfig.getSparkStreamDurationMs();
  	String kafkaGroupOmh = masterConfig.getKafkaGroupOmh();
  	String kafkaTopicOmh = masterConfig.getKafkaTopicOmh();
  	String kafkaZookeeperHosts = masterConfig.getKafkaZookeeperHosts();
  	Integer kafkaZookeeperPort = masterConfig.getKafkaZookeeperPort();
  	String kafkaZookeeperBrokerPath = masterConfig.getKafkaZookeeperBrokerPath();
  	String kafkaConsumerId = masterConfig.getKafkaConsumerId();
  	String kafkaZookeeperConsumerConnection = masterConfig.getKafkaZookeeperConsumerConnection();
  	String kafkaZookeeperConsumerPath = masterConfig.getKafkaZookeeperConsumerPath();

      conf = new SparkConf()
              .setAppName(sparkAppNameOmh);
      if( sparkMaster != null && sparkMaster.length() > 0 ) conf.setMaster( sparkMaster );
      ssc = new JavaStreamingContext(conf, new Duration(sparkStreamDurationMs));

      Map<String, Integer> topicCountMap = new HashMap<String, Integer>();
      topicCountMap.put(kafkaTopicOmh, new Integer(kafkaConsumerNumThreads));
      Properties kafkaProps = new Properties();
      kafkaProps.put("group.id", kafkaGroupOmh);
      // Spark Kafka Consumer https://github.com/dibbhatt/kafka-spark-consumer
      kafkaProps.put("zookeeper.hosts", kafkaZookeeperHosts);
      kafkaProps.put("zookeeper.port", String.valueOf(kafkaZookeeperPort) );
      kafkaProps.put("zookeeper.broker.path", kafkaZookeeperBrokerPath );
      kafkaProps.put("kafka.topic", kafkaTopicOmh);
      kafkaProps.put("kafka.consumer.id", kafkaConsumerId );
      kafkaProps.put("zookeeper.consumer.connection", kafkaZookeeperConsumerConnection);
      kafkaProps.put("zookeeper.consumer.path", kafkaZookeeperConsumerPath);
      // consumer optional 
      kafkaProps.put("consumer.forcefromstart", "false");
      kafkaProps.put("consumer.fetchsizebytes", "1048576");
      kafkaProps.put("consumer.fillfreqms", "200" );
      // kafkaProps.put("consumer.fillfreqms", String.valueOf(sparkStreamDurationMs) );
      kafkaProps.put("consumer.backpressure.enabled", "true");
      //kafkaProps.put("consumer.num_fetch_to_buffer", "10");
              
      kafkaProps.put( Config.KAFKA_PARTITIONS_NUMBER, 4 );
      
      kafkaProps.put("zookeeper.session.timeout.ms", "400");
      kafkaProps.put("zookeeper.sync.time.ms", "200");
      kafkaProps.put("key.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
      kafkaProps.put("value.deserializer", "org.apache.kafka.common.serialization.ByteArrayDeserializer");

      OmhSparkProcessor streamProcessor = new OmhSparkProcessor(masterConfig);
      
      int numberOfReceivers = 6;	
      
try {
	JavaDStream<MessageAndMetadata> unionStreams = ReceiverLauncher.launch(
			ssc, kafkaProps, numberOfReceivers, StorageLevel.MEMORY_ONLY());		
	unionStreams.foreachRDD(streamProcessor::processOmhRDD);
	logger.info("Starting OmhSparkConsumer");
	ssc.start();
} catch( Exception e ) {
	logger.error(e.getMessage(),e);
	System.exit(8);
}

try {
	logger.info("Started OmhSparkConsumer");
	started = true;
	ssc.awaitTermination();
   	logger.info("Stopped Spark");
} catch( InterruptedException e1 ) {
	logger.warn(e1.getMessage());
} catch( Exception e2 ) {
	logger.error(e2.getMessage(),e2);
	System.exit(8);
}

  }
 
Example 26
Project: kafka-examples   File: StreamingAvg.java   Source Code and License Vote up 4 votes
public static void main(String[] args) {
    if (args.length < 4) {
        System.err.println("Usage: StreamingAvg <zkQuorum> <group> <topics> <numThreads>");
        System.exit(1);
    }

    //Configure the Streaming Context
    SparkConf sparkConf = new SparkConf().setAppName("StreamingAvg");

    JavaStreamingContext ssc = new JavaStreamingContext(sparkConf, new Duration(10000));

    int numThreads = Integer.parseInt(args[3]);
    Map<String, Integer> topicMap = new HashMap<String, Integer>();
    String[] topics = args[2].split(",");
    for (String topic: topics) {
        topicMap.put(topic, numThreads);
    }

    JavaPairReceiverInputDStream<String, String> messages =
            KafkaUtils.createStream(ssc, args[0], args[1], topicMap);


    System.out.println("Got my DStream! connecting to zookeeper "+ args[0] + " group " + args[1] + " topics" +
    topicMap);



    JavaPairDStream<Integer,Integer> nums = messages.mapToPair(new PairFunction<Tuple2<String,String>, Integer, Integer>()
    {
        @Override
        public Tuple2<Integer,Integer> call(Tuple2<String, String> tuple2) {
            return new Tuple2<Integer,Integer>(1,Integer.parseInt(tuple2._2()));
        }
    });

    JavaDStream<Tuple2<Integer,Integer>> countAndSum = nums.reduce(new Function2<Tuple2<Integer,Integer>, Tuple2<Integer,Integer>, Tuple2<Integer,Integer>>() {
        @Override
        public Tuple2<Integer, Integer> call(Tuple2<Integer, Integer> a, Tuple2<Integer, Integer> b) {
            return new Tuple2<Integer, Integer>(a._1() + b._1(), a._2() + b._2());
        }
    });

    countAndSum.foreachRDD(new Function<JavaRDD<Tuple2<Integer, Integer>>, Void>() {
        @Override
        public Void call(JavaRDD<Tuple2<Integer, Integer>> tuple2JavaRDD) throws Exception {
            if (tuple2JavaRDD.count() > 0) {
                System.out.println("Current avg: " + tuple2JavaRDD.first()._2() / tuple2JavaRDD.first()._1());
            } else {
                System.out.println("Got no data in this window");
            }
            return null;
        }
    });

    ssc.start();
    ssc.awaitTermination();

}
 
Example 27
Project: learning-spark-examples   File: LogAnalyzerAppMain.java   Source Code and License Vote up 4 votes
public static void main(String[] args) throws IOException {
  Flags.setFromCommandLineArgs(THE_OPTIONS, args);

  // Startup the Spark Conf.
  SparkConf conf = new SparkConf()
      .setAppName("A Databricks Reference Application: Logs Analysis with Spark");
  JavaStreamingContext jssc = new JavaStreamingContext(conf,
      Flags.getInstance().getSlideInterval());

  // Checkpointing must be enabled to use the updateStateByKey function & windowed operations.
  jssc.checkpoint(Flags.getInstance().getCheckpointDirectory());

  // This methods monitors a directory for new files to read in for streaming.
  JavaDStream<String> logData = jssc.textFileStream(Flags.getInstance().getLogsDirectory());

  JavaDStream<ApacheAccessLog> accessLogsDStream
    = logData.map(new Functions.ParseFromLogLine()).cache();

  final LogAnalyzerTotal logAnalyzerTotal = new LogAnalyzerTotal();
  final LogAnalyzerWindowed logAnalyzerWindowed = new LogAnalyzerWindowed();

  // Process the DStream which gathers stats for all of time.
  logAnalyzerTotal.processAccessLogs(Flags.getInstance().getOutputDirectory(), accessLogsDStream);

  // Calculate statistics for the last time interval.
  logAnalyzerWindowed.processAccessLogs(Flags.getInstance().getOutputDirectory(), accessLogsDStream);

  // Render the output each time there is a new RDD in the accessLogsDStream.
  final Renderer renderer = new Renderer();
  accessLogsDStream.foreachRDD(new Function<JavaRDD<ApacheAccessLog>, Void>() {
      public Void call(JavaRDD<ApacheAccessLog> rdd) {
        // Call this to output the stats.
        try {
          renderer.render(logAnalyzerTotal.getLogStatistics(),
                          logAnalyzerWindowed.getLogStatistics());
        } catch (Exception e) {
        }
        return null;
      }
    });

  // Start the streaming server.
  jssc.start();              // Start the computation
  jssc.awaitTermination();   // Wait for the computation to terminate
}
 
Example 28
Project: Test_Projects   File: StreamingKafkaDirect101.java   Source Code and License Vote up 4 votes
public static void main(String[] args) {
Logger.getLogger("org").setLevel(Level.WARN);
Logger.getLogger("akka").setLevel(Level.WARN);

SparkConf sparkConf = new SparkConf().setMaster("spark://10.204.100.206:7077").setAppName("StreamingKafkaDirect101");

//Only for running from eclipse
if(System.getProperty("dev") != null)
 sparkConf.setJars(new String[] { "target\\TestProjects-1.0-SNAPSHOT.jar" });
	
//sparkConf.setExecutorEnv("executor-memory", "8G");
//sparkConf.setExecutorEnv("spark.executor.memory", "8G");
sparkConf.set("spark.executor.memory", "4G");
//sparkConf.set("executor-memory", "8G");

int duration = 2;
if(args.length > 0){
 try{
  duration = Integer.parseInt(args[0]);
  System.out.println("duration changed to " + duration);
 }catch(Exception e){
  System.out.println("Duration reset to defaults");
 }
}
		
JavaStreamingContext ssc = new JavaStreamingContext(sparkConf, Durations.seconds(duration));
	
  
HashSet<String> topicsSet = new HashSet<String>();
topicsSet.add("loadtest");

HashMap<String, String> kafkaParams = new HashMap<String, String>();
kafkaParams.put("metadata.broker.list", "10.204.100.180:19092");

JavaPairInputDStream<String, String> messages = KafkaUtils.createDirectStream(
  ssc,
       String.class,
       String.class,
       StringDecoder.class,
       StringDecoder.class,
       kafkaParams,
       topicsSet
   );


JavaDStream<String> lines = messages.map(new Function<Tuple2<String, String>, String>() {
    @Override
    public String call(Tuple2<String, String> tuple2) {
      return tuple2._2();
    }
});
  
lines.foreachRDD(new Function<JavaRDD<String>, Void>() {
 @Override
 public Void call(JavaRDD<String> rdd) throws Exception {
  System.out.println(new Date() + "  Total records read: " + rdd.count() );
  return null;
 }
});
  	
ssc.start();
ssc.awaitTermination();
}
 
Example 29
Project: kafka-spark-consumer   File: SampleConsumer.java   Source Code and License Vote up 4 votes
@SuppressWarnings("deprecation")
private void run() {

  Properties props = new Properties();
  props.put("zookeeper.hosts", "localhost");
  props.put("zookeeper.port", "2181");
  props.put("kafka.topic", "mytopic");
  props.put("kafka.consumer.id", "kafka-consumer");
  // Optional Properties
  props.put("consumer.forcefromstart", "true");
  props.put("max.poll.records", "100");
  props.put("consumer.fillfreqms", "1000");
  props.put("consumer.backpressure.enabled", "true");
  //Kafka properties
  props.put("bootstrap.servers", "localhost:9093");
  props.put("security.protocol", "SSL");
  props.put("ssl.truststore.location","~/kafka-securitykafka.server.truststore.jks");
  props.put("ssl.truststore.password", "test1234");

  SparkConf _sparkConf = new SparkConf();
  JavaStreamingContext jsc = new JavaStreamingContext(_sparkConf, Durations.seconds(30));
  // Specify number of Receivers you need.
  int numberOfReceivers = 1;

  JavaDStream<MessageAndMetadata<byte[]>> unionStreams = ReceiverLauncher.launch(
      jsc, props, numberOfReceivers, StorageLevel.MEMORY_ONLY());

  //Get the Max offset from each RDD Partitions. Each RDD Partition belongs to One Kafka Partition
  JavaPairDStream<Integer, Iterable<Long>> partitonOffset = ProcessedOffsetManager
      .getPartitionOffset(unionStreams, props);
  

  //Start Application Logic
  unionStreams.foreachRDD(new VoidFunction<JavaRDD<MessageAndMetadata<byte[]>>>() {
    @Override
    public void call(JavaRDD<MessageAndMetadata<byte[]>> rdd) throws Exception {

  	rdd.foreachPartition(new VoidFunction<Iterator<MessageAndMetadata<byte[]>>>() {
	
	@Override
	public void call(Iterator<MessageAndMetadata<byte[]>> mmItr) throws Exception {
		while(mmItr.hasNext()) {
			MessageAndMetadata<byte[]> mm = mmItr.next();
			byte[] key = mm.getKey();
			byte[] value = mm.getPayload();
			Headers headers = mm.getHeaders();
			System.out.println("Key :" + new String(key) + " Value :" + new String(value));
			if(headers != null) {
				Header[] harry = headers.toArray();
				for(Header header : harry) {
					String hkey = header.key();
					byte[] hvalue = header.value();
					System.out.println("Header Key :" + hkey + " Header Value :" + new String(hvalue));
				}
			}
			
		}
		
	}
});
    }
  });
  //End Application Logic

  //Persists the Max Offset of given Kafka Partition to ZK
  ProcessedOffsetManager.persists(partitonOffset, props);

  try {
    jsc.start();
    jsc.awaitTermination();
  }catch (Exception ex ) {
    jsc.ssc().sc().cancelAllJobs();
    jsc.stop(true, false);
    System.exit(-1);
  }
}
 
Example 30
Project: deeplearning4j   File: StreamingContextUtils.java   Source Code and License Vote up 4 votes
public static <K> void foreach(JavaDStream<K> stream, VoidFunction<JavaRDD<K>> func) {
    stream.foreachRDD(func);
}
 
Example 31
Project: spark_log_data   File: LogDataWebinar.java   Source Code and License Vote up 3 votes
private static void processDStream(JavaDStream<String> dStream) {
    
    dStream.foreachRDD(new VoidFunction<JavaRDD<String>>() {

        @Override
        public void call(JavaRDD<String> rdd) throws Exception {

            if (rdd.isEmpty()) {
                return;
            }

            JavaRDD<LogLine> logRdd = rdd.map(new Function<String, LogLine>() {

                @Override
                public LogLine call(String logText) throws Exception {

                    LogLine logLine = LogParser.parseLog(logText);
                    logger.info(logLine);

                    return logLine;
                }

            });

            writeToHdfs(logRdd);

        }
    });
}