Java Code Examples for org.apache.spark.streaming.api.java.JavaDStream

The following are top-voted examples showing how to use org.apache.spark.streaming.api.java.JavaDStream. These examples are extracted from open source projects.
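Before the project examples, here is a minimal, self-contained sketch of the typical JavaDStream workflow: create a JavaStreamingContext, obtain a JavaDStream (backed by a queue here, so it runs without an external source), apply a transformation, and act on each micro-batch with foreachRDD. The class name and sample values are illustrative only and are not taken from any of the projects below.

import java.util.Arrays;
import java.util.LinkedList;
import java.util.Queue;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.streaming.Durations;
import org.apache.spark.streaming.api.java.JavaDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;

public class JavaDStreamSketch {
  public static void main(String[] args) throws InterruptedException {
    SparkConf conf = new SparkConf().setMaster("local[2]").setAppName("JavaDStream-Sketch");

    try (JavaStreamingContext jsc = new JavaStreamingContext(conf, Durations.seconds(1))) {
      // Queue-backed stream: each queued RDD becomes one micro-batch.
      Queue<JavaRDD<String>> rddQueue = new LinkedList<>();
      rddQueue.add(jsc.sparkContext().parallelize(
          Arrays.asList("error one", "info two", "error three")));
      JavaDStream<String> lines = jsc.queueStream(rddQueue);

      // Typical JavaDStream transformation followed by an output operation.
      JavaDStream<String> errors = lines.filter(line -> line.contains("error"));
      errors.foreachRDD(rdd -> System.out.printf("Errors in batch: %d%n", rdd.count()));

      jsc.start();
      jsc.awaitTerminationOrTimeout(5000); // stop after a few batches for this sketch
    }
  }
}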
Example 1
Project: gcp   File: Spark4KafkaNew.java
public static void main(String[] args) throws InterruptedException {
  SparkConf sc = new SparkConf().setAppName("POC-Kafka-New");
  
  try(JavaStreamingContext jsc = new JavaStreamingContext(sc, new Duration(2000))) {
    
    JavaPairInputDStream<String, String> stream = KafkaUtils.createDirectStream(
        jsc, String.class, String.class, StringDecoder.class, StringDecoder.class,
        Collections.singletonMap("metadata.broker.list", KAFKA_HOST_PORT),
        Collections.singleton(EXAMPLE_TOPIC));

    JavaDStream<ExampleXML> records = stream.map(t -> t._2()).map(new ParseXML());
    records.foreachRDD(rdd -> System.out.printf("Amount of XMLs: %d\n", rdd.count()));

    jsc.start();
    jsc.awaitTermination();
  }
}
 
Example 2
Project: gcp   File: Spark2Streaming.java
public static void main(String[] args) throws InterruptedException {
  SparkConf sc = new SparkConf().setAppName("POC-Streaming");
  try(JavaStreamingContext jsc = new JavaStreamingContext(sc, new Duration(2000))) {
    //JavaDStream<SampleXML> records = jsc.textFileStream("input/").map(new ParseXML());
    //textFileStream processes files line by line, so the XML has to be on a single line to work; alternative below

    JavaRDD<String> files = jsc.sparkContext().wholeTextFiles("input/").map(tuple -> tuple._2());
    Queue<JavaRDD<String>> rddQueue = new LinkedList<>();
    rddQueue.add(files);
    JavaDStream<String> records = jsc.queueStream(rddQueue);

    records.foreachRDD(rdd -> System.out.printf("Amount of XMLs: %d\n", rdd.count()));

    jsc.start();
    jsc.awaitTermination();
  }
}
 
Example 3
Project: gcp   File: Spark6BigQuery.java
public static void main(String[] args) throws InterruptedException, IOException {
  SparkConf sc = new SparkConf().setAppName("POC-BigQuery");
  
  try(JavaStreamingContext jsc = new JavaStreamingContext(sc, new Duration(60000))) {
    JavaPairInputDStream<String, String> stream = KafkaUtils.createDirectStream(
        jsc, String.class, String.class, StringDecoder.class, StringDecoder.class,
        Collections.singletonMap("metadata.broker.list", KAFKA_HOST_PORT), Collections.singleton(EXAMPLE_TOPIC));

    Configuration conf = new Configuration();
    BigQueryConfiguration.configureBigQueryOutput(conf, BQ_EXAMPLE_TABLE, BQ_EXAMPLE_SCHEMA);
    conf.set("mapreduce.job.outputformat.class", BigQueryOutputFormat.class.getName());

    JavaDStream<ExampleXML> records = stream.map(t -> t._2()).map(new ParseXML());
    records.foreachRDD(rdd -> {
      System.out.printf("Amount of XMLs: %d\n", rdd.count());
      long time = System.currentTimeMillis();
      rdd.mapToPair(new PrepToBQ()).saveAsNewAPIHadoopDataset(conf);
      System.out.printf("Sent to BQ in %fs\n", (System.currentTimeMillis()-time)/1000f);
    });
    
    jsc.start();
    jsc.awaitTermination();
  }
}
 
Example 4
Project: SparkToParquet   File: AppMain.java
public static void main(String[] args) throws IOException {
	Flags.setFromCommandLineArgs(THE_OPTIONS, args);

	// Initialize the Spark conf.
	SparkConf conf = new SparkConf().setAppName("A SECTONG Application: Apache Log Analysis with Spark");
	JavaSparkContext sc = new JavaSparkContext(conf);
	JavaStreamingContext jssc = new JavaStreamingContext(sc, Flags.getInstance().getSlideInterval());
	SQLContext sqlContext = new SQLContext(sc);

	// Initialize parameters
	HashSet<String> topicsSet = new HashSet<String>(Arrays.asList(Flags.getInstance().getKafka_topic().split(",")));
	HashMap<String, String> kafkaParams = new HashMap<String, String>();
	kafkaParams.put("metadata.broker.list", Flags.getInstance().getKafka_broker());

	// Fetch data from the Kafka stream
	JavaPairInputDStream<String, String> messages = KafkaUtils.createDirectStream(jssc, String.class, String.class,
			StringDecoder.class, StringDecoder.class, kafkaParams, topicsSet);

	JavaDStream<String> lines = messages.map(new Function<Tuple2<String, String>, String>() {
		private static final long serialVersionUID = 5266880065425088203L;

		public String call(Tuple2<String, String> tuple2) {
			return tuple2._2();
		}
	});

	JavaDStream<ApacheAccessLog> accessLogsDStream = lines.flatMap(line -> {
		List<ApacheAccessLog> list = new ArrayList<>();
		try {
			// Map each line
			list.add(ApacheAccessLog.parseFromLogLine(line));
			return list;
		} catch (RuntimeException e) {
			return list;
		}
	}).cache();

	accessLogsDStream.foreachRDD(rdd -> {

		// rdd to DataFrame
		DataFrame df = sqlContext.createDataFrame(rdd, ApacheAccessLog.class);
		// Write to Parquet files
		df.write().partitionBy("ipAddress", "method", "responseCode").mode(SaveMode.Append).parquet(Flags.getInstance().getParquetFile());

		return null;
	});

	// Start the Streaming job
	jssc.start(); // Start the computation
	jssc.awaitTermination(); // Wait for termination
}
 
Example 5
Project: arks-api   File: WordCount.java
public static void main(String[] args) 
{
 SparkConf conf = new SparkConf();
 conf.setAppName("Wordcount Background");
 conf.setMaster("local");
  
 
 JavaStreamingContext ssc = new JavaStreamingContext(conf, Durations.seconds(15));
 
 
 JavaDStream<String> lines = ssc.textFileStream("/home/rahul/DATASET");
 JavaDStream<String> words = lines.flatMap(WORDS_EXTRACTOR);
 JavaPairDStream<String, Integer> pairs = words.mapToPair(WORDS_MAPPER);
 JavaPairDStream<String, Integer> counter = pairs.reduceByKey(WORDS_REDUCER);
 
 counter.print();
 
 ssc.start();
 
 ssc.awaitTermination();
 

 /*JavaRDD<String> file = context.textFile("/home/rahul/Desktop/palestine.txt");
 JavaRDD<String> words = file.flatMap(WORDS_EXTRACTOR);
 JavaPairRDD<String, Integer> pairs = words.mapToPair(WORDS_MAPPER);
 JavaPairRDD<String, Integer> counter = pairs.reduceByKey(WORDS_REDUCER);
 counter.saveAsTextFile("/home/rahul/Desktop/wc"); 
 context.close();*/
}
 
Example 6
Project: iot-traffic-monitor   File: IoTTrafficDataProcessor.java
/**
 * Method to get windowed traffic counts of different types of vehicles for each route.
 * Window duration = 30 seconds and Slide interval = 10 seconds
 * 
 * @param filteredIotDataStream IoT data stream
 */
public void processWindowTrafficData(JavaDStream<IoTData> filteredIotDataStream) {

	// reduce by key and window (30 sec window and 10 sec slide).
	JavaPairDStream<AggregateKey, Long> countDStreamPair = filteredIotDataStream
			.mapToPair(iot -> new Tuple2<>(new AggregateKey(iot.getRouteId(), iot.getVehicleType()), 1L))
			.reduceByKeyAndWindow((a, b) -> a + b, Durations.seconds(30), Durations.seconds(10));

	// Transform to dstream of TrafficData
	JavaDStream<WindowTrafficData> trafficDStream = countDStreamPair.map(windowTrafficDataFunc);

	// Map Cassandra table column
	Map<String, String> columnNameMappings = new HashMap<String, String>();
	columnNameMappings.put("routeId", "routeid");
	columnNameMappings.put("vehicleType", "vehicletype");
	columnNameMappings.put("totalCount", "totalcount");
	columnNameMappings.put("timeStamp", "timestamp");
	columnNameMappings.put("recordDate", "recorddate");

	// call CassandraStreamingJavaUtil function to save in DB
	javaFunctions(trafficDStream).writerBuilder("traffickeyspace", "window_traffic",
			CassandraJavaUtil.mapToRow(WindowTrafficData.class, columnNameMappings)).saveToCassandra();
}
 
Example 7
Project: incubator-blur   File: BlurLoadSparkProcessor.java
public void run() throws IOException {
  SparkConf conf = new SparkConf();
  conf.setAppName(getAppName());
  conf.set(SPARK_SERIALIZER, ORG_APACHE_SPARK_SERIALIZER_KRYO_SERIALIZER);
  JavaSparkUtil.packProjectJars(conf);
  setupSparkConf(conf);

  JavaStreamingContext ssc = new JavaStreamingContext(conf, getDuration());
  List<JavaDStream<T>> streamsList = getStreamsList(ssc);

  // Union all the streams if there is more than 1 stream
  JavaDStream<T> streams = unionStreams(ssc, streamsList);

  JavaPairDStream<String, RowMutation> pairDStream = streams.mapToPair(new PairFunction<T, String, RowMutation>() {
    public Tuple2<String, RowMutation> call(T t) {
      RowMutation rowMutation = convert(t);
      return new Tuple2<String, RowMutation>(rowMutation.getRowId(), rowMutation);
    }
  });

  pairDStream.foreachRDD(getFunction());

  ssc.start();
  ssc.awaitTermination();
}
 
Example 8
Project: Apache-Spark-2x-for-Java-Developers   File: StateFulProcessingExample.java
public static void main(String[] args) throws InterruptedException {

		System.setProperty("hadoop.home.dir", "C:\\softwares\\Winutils");

		SparkSession sparkSession = SparkSession.builder().master("local[*]").appName("Stateful Streaming Example")
				.config("spark.sql.warehouse.dir", "file:////C:/Users/sgulati/spark-warehouse").getOrCreate();

		JavaStreamingContext jssc= new JavaStreamingContext(new JavaSparkContext(sparkSession.sparkContext()),
				Durations.milliseconds(1000));
		JavaReceiverInputDStream<String> inStream = jssc.socketTextStream("10.204.136.223", 9999);
		jssc.checkpoint("C:\\Users\\sgulati\\spark-checkpoint");

		JavaDStream<FlightDetails> flightDetailsStream = inStream.map(x -> {
			ObjectMapper mapper = new ObjectMapper();
			return mapper.readValue(x, FlightDetails.class);
		});
		
		

		JavaPairDStream<String, FlightDetails> flightDetailsPairStream = flightDetailsStream
				.mapToPair(f -> new Tuple2<String, FlightDetails>(f.getFlightId(), f));

		Function3<String, Optional<FlightDetails>, State<List<FlightDetails>>, Tuple2<String, Double>> mappingFunc = (
				flightId, curFlightDetail, state) -> {
			List<FlightDetails> details = state.exists() ? state.get() : new ArrayList<>();

			boolean isLanded = false;

			if (curFlightDetail.isPresent()) {
				details.add(curFlightDetail.get());
				if (curFlightDetail.get().isLanded()) {
					isLanded = true;
				}
			}
			Double avgSpeed = details.stream().mapToDouble(f -> f.getTemperature()).average().orElse(0.0);

			if (isLanded) {
				state.remove();
			} else {
				state.update(details);
			}
			return new Tuple2<String, Double>(flightId, avgSpeed);
		};

		JavaMapWithStateDStream<String, FlightDetails, List<FlightDetails>, Tuple2<String, Double>> streamWithState = flightDetailsPairStream
				.mapWithState(StateSpec.function(mappingFunc).timeout(Durations.minutes(5)));
		
		streamWithState.print();
		jssc.start();
		jssc.awaitTermination();
	}
 
Example 9
Project: zipkin-sparkstreaming   File: SparkStreamingJob.java
static void streamSpansToStorage(
    JavaDStream<byte[]> stream,
    ReadSpans readSpans,
    AdjustAndConsumeSpansSharingTraceId adjustAndConsumeSpansSharingTraceId
) {
  JavaDStream<Span> spans = stream.flatMap(readSpans);

  // TODO: plug in some filter to drop spans regardless of trace ID
  // spans = spans.filter(spanFilter);

  JavaPairDStream<String, Iterable<Span>> tracesById = spans
      .mapToPair(s -> new Tuple2<>(Util.toLowerHex(s.traceIdHigh, s.traceId), s))
      .groupByKey();

  tracesById.foreachRDD(rdd -> {
    rdd.values().foreachPartition(adjustAndConsumeSpansSharingTraceId);
  });
}
 
Example 10
Project: jMetalSP   File: SimpleSparkStreamingCounterDataSource.java
@Override
public void run() {
	JMetalLogger.logger.info("Run method in the streaming data source invoked") ;
   JMetalLogger.logger.info("Directory: " + directoryName) ;

	JavaDStream<Integer> time = streamingContext
					.textFileStream(directoryName)
					.map(line -> Integer.parseInt(line)) ;

	time.foreachRDD(numbers -> {
		List<Integer> numberList = numbers.collect() ;
		for (Integer number : numberList) {
		  System.out.println(number) ;
       observable.setChanged();
			observable.notifyObservers(new SingleObservedData<Integer>(number));
		}
	}) ;
}
 
Example 11
Project: incubator-pirk   File: ComputeStreamingResponse.java
/**
 * Method to read in data from an allowed input source/format and perform the query
 */
public void performQuery() throws IOException, PIRException
{
  logger.info("Performing query: ");

  JavaDStream<MapWritable> inputRDD = null;
  if (dataInputFormat.equals(InputFormatConst.BASE_FORMAT))
  {
    inputRDD = readData();
  }
  else if (dataInputFormat.equals(InputFormatConst.ES))
  {
    inputRDD = readDataES();
  }
  else
  {
    throw new PIRException("Unknown data input format " + dataInputFormat);
  }

  performQuery(inputRDD);
}
 
Example 12
Project: beam   File: StreamingTransformTranslator.java
private static <T> TransformEvaluator<ConsoleIO.Write.Unbound<T>> print() {
  return new TransformEvaluator<ConsoleIO.Write.Unbound<T>>() {
    @Override
    public void evaluate(ConsoleIO.Write.Unbound<T> transform, EvaluationContext context) {
      @SuppressWarnings("unchecked")
      JavaDStream<WindowedValue<T>> dstream =
          ((UnboundedDataset<T>) (context).borrowDataset(transform)).getDStream();
      dstream.map(WindowingHelpers.<T>unwindowFunction()).print(transform.getNum());
    }

    @Override
    public String toNativeString() {
      return ".print(...)";
    }
  };
}
 
Example 13
Project: net.jgp.labs.spark   File: StreamingIngestionFileSystemTextFileApp.java
private void start() {
	// Create a local StreamingContext with two working threads and a batch
	// interval of 5 seconds
	SparkConf conf = new SparkConf().setMaster("local[2]").setAppName("NetworkWordCount");
	JavaStreamingContext jssc = new JavaStreamingContext(conf, Durations.seconds(5));

	JavaDStream<String> msgDataStream = jssc.textFileStream(StreamingUtils.getInputDirectory());
	msgDataStream.print();

	jssc.start();
	try {
		jssc.awaitTermination();
	} catch (InterruptedException e) {
		// TODO Auto-generated catch block
		e.printStackTrace();
	}
}
 
Example 14
Project: net.jgp.labs.spark   File: StreamingIngestionFileSystemTextFileToDataframeMultipleClassesApp.java
private void start() {
	// Create a local StreamingContext with two working threads and a batch
	// interval of 5 seconds
	SparkConf conf = new SparkConf().setMaster("local[2]").setAppName("Streaming Ingestion File System Text File to Dataframe");
	JavaStreamingContext jssc = new JavaStreamingContext(conf, Durations.seconds(5));

	JavaDStream<String> msgDataStream = jssc.textFileStream(StreamingUtils.getInputDirectory());

	msgDataStream.print();
	// Create JavaRDD<Row>
	msgDataStream.foreachRDD(new RowProcessor());	

	jssc.start();
	try {
		jssc.awaitTermination();
	} catch (InterruptedException e) {
		// TODO Auto-generated catch block
		e.printStackTrace();
	}
}
 
Example 15
Project: spark_log_data   File: LogDataWebinar.java
private static JavaDStream<String> createDStream(JavaStreamingContext javaStreamingContext, String hostName, int port) {
        
        JavaReceiverInputDStream<SparkFlumeEvent> flumeEventStream = FlumeUtils.createStream(javaStreamingContext, hostName, port);
        
        // Set different storage level 
//        flumeEventStream.persist(StorageLevel.MEMORY_AND_DISK_SER());
        
        JavaDStream<String> dStream = flumeEventStream.map(new Function<SparkFlumeEvent, String>() {

            @Override
            public String call(SparkFlumeEvent sparkFlumeEvent) throws Exception {

                byte[] bodyArray = sparkFlumeEvent.event().getBody().array();
                String logTxt = new String(bodyArray, "UTF-8");
                logger.info(logTxt);

                return logTxt;
            }
        });
        // dStream.print();
        
        return dStream;
    }
 
Example 16
Project: nats-connector-spark   File: SparkToNatsConnectorPool.java
/**
 * @param stream, the Spark Stream to publish to NATS
 * @param dataEncoder, the function used to encode the Spark Stream Records into the NATS Message Payloads
 */
public <V extends Object> void publishToNats(final JavaDStream<V> stream, final Function<V, byte[]> dataEncoder) {
	logger.trace("publishToNats(JavaDStream<String> stream)");
	stream.foreachRDD((VoidFunction<JavaRDD<V>>) rdd -> {
		logger.trace("stream.foreachRDD");
		rdd.foreachPartitionAsync(objects -> {
			logger.trace("rdd.foreachPartition");
			final SparkToNatsConnector<?> connector = getConnector();
			while(objects.hasNext()) {
				final V obj = objects.next();
				logger.trace("Will publish {}", obj);
				connector.publishToNats(dataEncoder.apply(obj));
			}
			returnConnector(connector);  // return to the pool for future reuse
		});
	});
}
 
Example 17
Project: nats-connector-spark   File: KeyValueSparkToStandardNatsConnectorLifecycleTest.java
protected void publishToNats(final String subject1, final String subject2, final int partitionsNb) {
	final JavaDStream<String> lines = ssc.textFileStream(tempDir.getAbsolutePath()).repartition(partitionsNb);		
	
	JavaPairDStream<String, String> stream1 = 
			lines.mapToPair((PairFunction<String, String, String>) str -> {
								return new Tuple2<String, String>(subject1, str);
							});
	JavaPairDStream<String, String> stream2 = 
			lines.mapToPair((PairFunction<String, String, String>) str -> {
								return new Tuple2<String, String>(subject2, str);
							});
	final JavaPairDStream<String, String> stream = stream1.union(stream2);
	
	if (logger.isDebugEnabled()) {
		stream.print();
	}		
	
	SparkToNatsConnectorPool
		.newPool()
		.withNatsURL(NATS_SERVER_URL)
		.withConnectionTimeout(Duration.ofSeconds(2))
		.publishToNatsAsKeyValue(stream);
}
 
Example 18
Project: kite-apps   File: KafkaOutput.java
/**
 * Writes the content of the stream to the Kafka topic
 * behind this producer.
 */
@edu.umd.cs.findbugs.annotations.SuppressWarnings(
    value="SE_INNER_CLASS", justification="Uses state from outer class.")
public void write (JavaDStream<T> stream) {

  stream.foreachRDD(new Function<JavaRDD<T>, Void>() {
    @Override
    public Void call(JavaRDD<T> rdd) throws Exception {

      write(rdd);

      return null;
    }
  });
}
 
Example 19
Project: near-image-replica-detection   File: StreamingReplicaDetector.java
protected void queryTweets(JavaReceiverInputDStream<Status> tweets, int rank) {
	
	// Compute sketches
	JavaPairDStream<ImageInfo, ImageFeature> imFeatures = computeImageFeatures(tweets);
	
	JavaPairDStream<ImageInfo, ImageFeature> sketches = imFeatures.mapValues(new SketchProcedure(indParams.getSketchFunction(), 
			indParams.getNumTables()));
	
	// Query specific and filter by hamming distance
	JavaPairDStream<ImageFeature, ImageFeature> candidates = system.queryFeaturesStreaming(conn,indParams, sketches);
	JavaPairDStream<ImageFeature, ImageFeature> filteredHamming = 
			candidates.filter(new HammingFiltering(indParams.getHammingDistance()));
	
	// Group by image and assign weights
	JavaDStream<ImageMatch> matchedIds = filteredHamming.map(new MatchExtractorStreaming());
	JavaPairDStream<ImageMatch, Long> result = matchedIds.countByValue();

	// Filter by weight if requested
	if (rank > 0) {
		result = result.filter(new WeightFiltering(rank));
	}
	
	// Print results
	result.print();
}
 
Example 20
Project: near-image-replica-detection   File: StreamingReplicaDetector.java
protected JavaPairDStream<ImageInfo, ImageFeature> computeImageFeatures(JavaReceiverInputDStream<Status> tweets) {
	
	JavaDStream<ImageInfo> imgs = tweets.mapPartitions(new TweetsToImagesTask());
	
	JavaPairDStream<ImageInfo, ImageFeature> features = imgs.flatMapToPair(new ComputeFeatures(descParams, ProviderType.TWITTER));
	JavaPairDStream<ImageInfo, ImageFeature> filtered = features;
	
	// Filter descriptors if needed
	if (filtParams.getFilteringType().equals(FilteringType.ENTROPY)) {
		filtered = features.filter(new EntropyFiltering(filtParams.getThresh()));
	}
	else if (filtParams.getFilteringType().equals(FilteringType.VARIANCE)) {
		filtered = features.filter(new VarianceFiltering(filtParams.getThresh()));
	}
	
	// Logscale features if needed
	if (filtParams.isLogScaleEnabled()) {
		filtered = filtered.mapValues(new LogScaleFunction());
	}
	
	// Build sketches
	return filtered;
}
 
Example 21
Project: spark-dataflow   File: StreamingTransformTranslator.java
private static <K, V> TransformEvaluator<KafkaIO.Read.Unbound<K, V>> kafka() {
  return new TransformEvaluator<KafkaIO.Read.Unbound<K, V>>() {
    @Override
    public void evaluate(KafkaIO.Read.Unbound<K, V> transform, EvaluationContext context) {
      StreamingEvaluationContext sec = (StreamingEvaluationContext) context;
      JavaStreamingContext jssc = sec.getStreamingContext();
      Class<K> keyClazz = transform.getKeyClass();
      Class<V> valueClazz = transform.getValueClass();
      Class<? extends Decoder<K>> keyDecoderClazz = transform.getKeyDecoderClass();
      Class<? extends Decoder<V>> valueDecoderClazz = transform.getValueDecoderClass();
      Map<String, String> kafkaParams = transform.getKafkaParams();
      Set<String> topics = transform.getTopics();
      JavaPairInputDStream<K, V> inputPairStream = KafkaUtils.createDirectStream(jssc, keyClazz,
              valueClazz, keyDecoderClazz, valueDecoderClazz, kafkaParams, topics);
      JavaDStream<WindowedValue<KV<K, V>>> inputStream =
          inputPairStream.map(new Function<Tuple2<K, V>, KV<K, V>>() {
        @Override
        public KV<K, V> call(Tuple2<K, V> t2) throws Exception {
          return KV.of(t2._1(), t2._2());
        }
      }).map(WindowingHelpers.<KV<K, V>>windowFunction());
      sec.setStream(transform, inputStream);
    }
  };
}
 
Example 22
Project: spark-dataflow   File: StreamingTransformTranslator.java
private static <T> TransformEvaluator<Flatten.FlattenPCollectionList<T>> flattenPColl() {
  return new TransformEvaluator<Flatten.FlattenPCollectionList<T>>() {
    @SuppressWarnings("unchecked")
    @Override
    public void evaluate(Flatten.FlattenPCollectionList<T> transform, EvaluationContext context) {
      StreamingEvaluationContext sec = (StreamingEvaluationContext) context;
      PCollectionList<T> pcs = sec.getInput(transform);
      JavaDStream<WindowedValue<T>> first =
          (JavaDStream<WindowedValue<T>>) sec.getStream(pcs.get(0));
      List<JavaDStream<WindowedValue<T>>> rest = Lists.newArrayListWithCapacity(pcs.size() - 1);
      for (int i = 1; i < pcs.size(); i++) {
        rest.add((JavaDStream<WindowedValue<T>>) sec.getStream(pcs.get(i)));
      }
      JavaDStream<WindowedValue<T>> dstream = sec.getStreamingContext().union(first, rest);
      sec.setStream(transform, dstream);
    }
  };
}
 
Example 23
Project: laughing-octo-sansa   File: TestSparkKafkaReceiverApproach.java
public static void main(String[] args) {
    if (args.length < 4) {
        System.err.println("Usage: JavaKafkaWordCount <zkQuorum> <group> <topics> <numThreads>");
        System.exit(1);
    }

    SparkConf sparkConf = new SparkConf().setAppName("JavaKafkaWordCount");
    // Create the context with a 2 second batch size
    JavaStreamingContext jssc = new JavaStreamingContext(sparkConf, new Duration(2000));
    int numThreads = Integer.parseInt(args[3]);
    Map<String, Integer> topicMap = new HashMap<String, Integer>();
    String[] topics = args[2].split(",");
    for (String topic : topics) {
        topicMap.put(topic, numThreads);
    }
    JavaPairReceiverInputDStream<String, String> messages = KafkaUtils.createStream(jssc, args[0], args[1],
            topicMap);
    JavaDStream<String> lines = messages.map(tuple2 -> tuple2._2());
    JavaDStream<String> words = lines.flatMap(x -> Lists.newArrayList(SPACE.split(x)));
    JavaPairDStream<String, Integer> wordCounts = words.mapToPair(s -> new Tuple2<String, Integer>(s, 1)).reduceByKey(
            (i1, i2) -> i1 + i2);
    wordCounts.print();
    jssc.start();
    jssc.awaitTermination();
}
 
Example 24
Project: learning-spark-examples   File: StreamingLogInput.java
public static void main(String[] args) throws Exception {
	String master = args[0];
	JavaSparkContext sc = new JavaSparkContext(master, "StreamingLogInput");
   // Create a StreamingContext with a 1 second batch size
   JavaStreamingContext jssc = new JavaStreamingContext(sc, new Duration(1000));
   // Create a DStream from all the input on port 7777
   JavaDStream<String> lines = jssc.socketTextStream("localhost", 7777);
   // Filter our DStream for lines with "error"
   JavaDStream<String> errorLines = lines.filter(new Function<String, Boolean>() {
       public Boolean call(String line) {
         return line.contains("error");
       }});
   // Print out the lines with errors, which causes this DStream to be evaluated
   errorLines.print();
   // start our streaming context and wait for it to "finish"
   jssc.start();
   // Wait for 10 seconds then exit. To run forever call without a timeout
   jssc.awaitTermination(10000);
   // Stop the streaming context
   jssc.stop();
}
 
Example 25
Project: spork-streaming   File: SortConverter.java
@Override
public JavaDStream<Tuple> convert(List<JavaDStream<Tuple>> predecessors,
		final POSort sortOperator) throws IOException {		
       SparkUtil.assertPredecessorSize(predecessors, sortOperator, 1);
       JavaDStream<Tuple> rdd = predecessors.get(0);
       DStream<Tuple2<Tuple, Object>> rddPair =
               rdd.dstream().map(new ToKeyValueFunction(),
                       SparkUtil.<Tuple, Object>getTuple2Manifest());

       JavaPairDStream<Tuple, Object> r = new JavaPairDStream<Tuple, Object>(rddPair, SparkUtil.getManifest(Tuple.class),
                                                       SparkUtil.getManifest(Object.class));

       JavaPairDStream<Tuple, Object> sorted = r.transform(
       	     new Function<JavaPairRDD<Tuple, Object>, JavaPairRDD<Tuple, Object>>() {
       	         public JavaPairRDD<Tuple, Object> call(JavaPairRDD<Tuple, Object> in) throws Exception {
       	           return in.sortByKey(sortOperator.getmComparator(),false);
       	         }
       	       });
       JavaDStream<Tuple> mapped = new JavaDStream<Tuple>(sorted.dstream().map(new ToValueFunction(),SparkUtil.getManifest(Tuple.class)), SparkUtil.getManifest(Tuple.class));

       return mapped;
   }
 
Example 26
Project: searchanalytics-bigdata   File: SparkStreamServiceImpl.java
@Override
public void startHDFSTxtFileStreams() {
	String hdfsUri = hadoopClusterService.getHDFSUri() + "/searchevents"
			+ getCurrentStreamUri();

	QueryStringJDStreams queryStringJDStreams = new QueryStringJDStreams();

	JavaDStream<String> fileStream = jssc.textFileStream(hdfsUri);
	queryStringJDStreams.topQueryStringsCountInLastOneHour(fileStream);

	queryStringJDStreams.topProductViewsCountInLastOneHour(fileStream);

	LOG.debug("Starting streaming context!");
	jssc.start();
	LOG.debug("Streaming context running!");
}
 
Example 27
Project: searchanalytics-bigdata   File: SparkStreamServiceImpl.java
@Override
	public void startFlumeStream() {
		JavaDStream<SparkFlumeEvent> flumeStream = FlumeUtils.createStream(
				jssc, "localhost", 41111, StorageLevels.MEMORY_AND_DISK);

		QueryStringJDStreams queryStringJDStreams = new QueryStringJDStreams();

		// Run top search query string stream
		queryStringJDStreams
				.topQueryStringsCountInLastOneHourUsingSparkFlumeEvent(flumeStream);

		// Run top product view stream
		//TODO: uncomment to get both stats.
//		queryStringJDStreams
//				.topProductViewsCountInLastOneHourUsingSparkFlumeEvent(flumeStream);
		jssc.start();
	}
 
Example 28
Project: elasticsearch-hadoop   File: AbstractJavaEsSparkStreamingTest.java
@Test
public void testEsRDDWriteWIthMappingId() throws Exception {
    Map<String, Object> doc1 = new HashMap<>();
    doc1.put("number", 1);
    doc1.put("one", null);
    Set<String> values = new HashSet<>();
    values.add("2");
    doc1.put("two", values);
    doc1.put("three", ".");

    Map<String, Object> doc2 = new HashMap<>();
    doc2.put("number", 2);
    doc2.put("OTP", "Otopeni");
    doc2.put("SFO", "San Fran");

    List<Map<String, Object>> docs = new ArrayList<>();
    docs.add(doc1);
    docs.add(doc2);

    Map<String, String> localConf = new HashMap<>(cfg);
    localConf.put("es.mapping.id", "number");

    String target = wrapIndex("spark-streaming-test-scala-id-write/data");

    JavaRDD<Map<String,Object>> batch = sc.parallelize(docs);
    Queue<JavaRDD<Map<String, Object>>> rddQueue = new LinkedList<>();
    rddQueue.add(batch);
    JavaDStream<Map<String, Object>> dstream = ssc.queueStream(rddQueue);
    JavaEsSparkStreaming.saveToEs(dstream, target, localConf);
    ssc.start();
    TimeUnit.SECONDS.sleep(2);
    ssc.stop(false, true);

    assertEquals(2, JavaEsSpark.esRDD(sc, target).count());
    assertTrue(RestUtils.exists(target + "/1"));
    assertTrue(RestUtils.exists(target + "/2"));

    assertThat(RestUtils.get(target + "/_search?"), containsString("SFO"));
}
 
Example 29
Project: elasticsearch-hadoop   File: AbstractJavaEsSparkStreamingTest.java
@Test
public void testEsRDDWriteWithDynamicMapping() throws Exception {
    Map<String, Object> doc1 = new HashMap<>();
    doc1.put("number", 3);
    doc1.put("one", null);
    Set<String> values = new HashSet<>();
    values.add("2");
    doc1.put("two", values);
    doc1.put("three", ".");

    Map<String, Object> doc2 = new HashMap<>();
    doc2.put("number", 4);
    doc2.put("OTP", "Otopeni");
    doc2.put("SFO", "San Fran");

    List<Map<String, Object>> docs = new ArrayList<>();
    docs.add(doc1);
    docs.add(doc2);

    String target = wrapIndex("spark-streaming-test-scala-dyn-id-write/data");

    JavaRDD<Map<String,Object>> batch = sc.parallelize(docs);
    Queue<JavaRDD<Map<String, Object>>> rddQueue = new LinkedList<>();
    rddQueue.add(batch);
    JavaDStream<Map<String, Object>> dstream = ssc.queueStream(rddQueue);

    JavaPairDStream<Integer, Map<String, Object>> metaDstream = dstream.mapToPair(new ExtractIDFunction());

    JavaEsSparkStreaming.saveToEsWithMeta(metaDstream, target, cfg);
    ssc.start();
    TimeUnit.SECONDS.sleep(2);
    ssc.stop(false, true);

    assertEquals(2, JavaEsSpark.esRDD(sc, target).count());
    assertTrue(RestUtils.exists(target + "/3"));
    assertTrue(RestUtils.exists(target + "/4"));

    assertThat(RestUtils.get(target + "/_search?"), containsString("SFO"));
}
 
Example 30
Project: Apache-Spark-2x-for-Java-Developers   File: WordCountTransformOpEx.java
public static void main(String[] args) throws Exception {
  
      System.setProperty("hadoop.home.dir", "E:\\hadoop");
	
   SparkConf sparkConf = new SparkConf().setAppName("WordCountSocketEx").setMaster("local[*]");
   JavaStreamingContext streamingContext = new JavaStreamingContext(sparkConf, Durations.seconds(1));
   Logger rootLogger = LogManager.getRootLogger();
 		rootLogger.setLevel(Level.WARN); 
   List<Tuple2<String, Integer>> tuples = Arrays.asList(new Tuple2<>("hello", 10), new Tuple2<>("world", 10));
   JavaPairRDD<String, Integer> initialRDD = streamingContext.sparkContext().parallelizePairs(tuples);
	    

   JavaReceiverInputDStream<String> StreamingLines = streamingContext.socketTextStream( "10.0.75.1", Integer.parseInt("9000"), StorageLevels.MEMORY_AND_DISK_SER);
   
   JavaDStream<String> words = StreamingLines.flatMap( str -> Arrays.asList(str.split(" ")).iterator() );
  
   JavaPairDStream<String, Integer> wordCounts = words.mapToPair(str-> new Tuple2<>(str, 1)).reduceByKey((count1,count2) ->count1+count2 );
  
   wordCounts.print();
   
JavaPairDStream<String, Integer> joinedDstream = wordCounts
		.transformToPair(new Function<JavaPairRDD<String, Integer>, JavaPairRDD<String, Integer>>() {
			@Override
			public JavaPairRDD<String, Integer> call(JavaPairRDD<String, Integer> rdd) throws Exception {
				JavaPairRDD<String, Integer> modRDD = rdd.join(initialRDD).mapToPair(
						new PairFunction<Tuple2<String, Tuple2<Integer, Integer>>, String, Integer>() {
							@Override
							public Tuple2<String, Integer> call(
									Tuple2<String, Tuple2<Integer, Integer>> joinedTuple) throws Exception {
								return new Tuple2<>(joinedTuple._1(),(joinedTuple._2()._1() + joinedTuple._2()._2()));
							}
						});
				return modRDD;
			}
		});

   joinedDstream.print();
   streamingContext.start();
   streamingContext.awaitTermination();
 }
 
Example 31
Project: Apache-Spark-2x-for-Java-Developers   File: WordCountSocketStateful.java
public static void main(String[] args) throws Exception {
 System.setProperty("hadoop.home.dir", "E:\\hadoop");

   SparkConf sparkConf = new SparkConf().setAppName("WordCountSocketEx").setMaster("local[*]");
   JavaStreamingContext streamingContext = new JavaStreamingContext(sparkConf, Durations.seconds(1));
   streamingContext.checkpoint("E:\\hadoop\\checkpoint");
// Initial state RDD input to mapWithState
   @SuppressWarnings("unchecked")
   List<Tuple2<String, Integer>> tuples =Arrays.asList(new Tuple2<>("hello", 1), new Tuple2<>("world", 1));
   JavaPairRDD<String, Integer> initialRDD = streamingContext.sparkContext().parallelizePairs(tuples);
   
   JavaReceiverInputDStream<String> StreamingLines = streamingContext.socketTextStream( "10.0.75.1", Integer.parseInt("9000"), StorageLevels.MEMORY_AND_DISK_SER);
   
   JavaDStream<String> words = StreamingLines.flatMap( str -> Arrays.asList(str.split(" ")).iterator() );
  
   JavaPairDStream<String, Integer> wordCounts = words.mapToPair(str-> new Tuple2<>(str, 1)).reduceByKey((count1,count2) ->count1+count2 );
  


  // Update the cumulative count function
  Function3<String, Optional<Integer>, State<Integer>, Tuple2<String, Integer>> mappingFunc =
      new Function3<String, Optional<Integer>, State<Integer>, Tuple2<String, Integer>>() {
        @Override
        public Tuple2<String, Integer> call(String word, Optional<Integer> one,
            State<Integer> state) {
          int sum = one.orElse(0) + (state.exists() ? state.get() : 0);
          Tuple2<String, Integer> output = new Tuple2<>(word, sum);
          state.update(sum);
          return output;
        }
      };

  // DStream containing the cumulative counts that get updated in every batch
  JavaMapWithStateDStream<String, Integer, Integer, Tuple2<String, Integer>> stateDstream = wordCounts.mapWithState(StateSpec.function(mappingFunc).initialState(initialRDD));

  stateDstream.print();
  streamingContext.start();
  streamingContext.awaitTermination();
}
 
Example 32
Project: Apache-Spark-2x-for-Java-Developers   File: FileStreamingEx.java
public static void main(String[] args) {
   	//Windows-specific property if Hadoop is not installed or HADOOP_HOME is not set
	 System.setProperty("hadoop.home.dir", "E:\\hadoop");
   	//Logger rootLogger = LogManager.getRootLogger();
  		//rootLogger.setLevel(Level.WARN); 
       SparkConf conf = new SparkConf().setAppName("KafkaExample").setMaster("local[*]");
       String inputDirectory="E:\\hadoop\\streamFolder\\";
    
       JavaSparkContext sc = new JavaSparkContext(conf);
       JavaStreamingContext streamingContext = new JavaStreamingContext(sc, Durations.seconds(1));
      // streamingContext.checkpoint("E:\\hadoop\\checkpoint");
       Logger rootLogger = LogManager.getRootLogger();
  		rootLogger.setLevel(Level.WARN); 
  		
  		JavaDStream<String> streamfile = streamingContext.textFileStream(inputDirectory);
  		streamfile.print();
  		streamfile.foreachRDD(rdd-> rdd.foreach(x -> System.out.println(x)));
  		
  			   		
  		JavaPairDStream<LongWritable, Text> streamedFile = streamingContext.fileStream(inputDirectory, LongWritable.class, Text.class, TextInputFormat.class);
  	 streamedFile.print();
  		
  	 streamingContext.start();
  	 

       try {
		streamingContext.awaitTermination();
	} catch (InterruptedException e) {
		// TODO Auto-generated catch block
		e.printStackTrace();
	}
}
 
Example 33
Project: Apache-Spark-2x-for-Java-Developers   File: WordCountSocketJava8Ex.java
public static void main(String[] args) throws Exception {
 
     System.setProperty("hadoop.home.dir", "E:\\hadoop");
	
  SparkConf sparkConf = new SparkConf().setAppName("WordCountSocketEx").setMaster("local[*]");
  JavaStreamingContext streamingContext = new JavaStreamingContext(sparkConf, Durations.seconds(1));
  
  List<Tuple2<String, Integer>> tuples = Arrays.asList(new Tuple2<>("hello", 10), new Tuple2<>("world", 10));
  JavaPairRDD<String, Integer> initialRDD = streamingContext.sparkContext().parallelizePairs(tuples);
    

  JavaReceiverInputDStream<String> StreamingLines = streamingContext.socketTextStream( "10.0.75.1", Integer.parseInt("9000"), StorageLevels.MEMORY_AND_DISK_SER);
  
  JavaDStream<String> words = StreamingLines.flatMap( str -> Arrays.asList(str.split(" ")).iterator() );
 
  JavaPairDStream<String, Integer> wordCounts = words.mapToPair(str-> new Tuple2<>(str, 1)).reduceByKey((count1,count2) ->count1+count2 );
 
  wordCounts.print();
  
JavaPairDStream<String, Integer> joinedDstream = wordCounts.transformToPair(
   new Function<JavaPairRDD<String, Integer>, JavaPairRDD<String, Integer>>() {
	    @Override public JavaPairRDD<String, Integer> call(JavaPairRDD<String, Integer> rdd) throws Exception {
	    	JavaPairRDD<String, Integer> joinedRDD = rdd.join(initialRDD).mapToPair(new PairFunction<Tuple2<String,Tuple2<Integer,Integer>>, String, Integer>() {
				@Override
				public Tuple2<String, Integer> call(Tuple2<String, Tuple2<Integer, Integer>> joinedTuple)
						throws Exception {
					return new Tuple2<>( joinedTuple._1(), (joinedTuple._2()._1()+joinedTuple._2()._2()) );
				}
			});

		return joinedRDD;
	    }
	  });
 
joinedDstream.print();
  streamingContext.start();
  streamingContext.awaitTermination();
}
 
Example 34
Project: Apache-Spark-2x-for-Java-Developers   File: WordCountRecoverableEx.java
protected static JavaStreamingContext createContext(String ip, int port, String checkpointDirectory) {
	SparkConf sparkConf = new SparkConf().setAppName("WordCountRecoverableEx").setMaster("local[*]");
	JavaStreamingContext streamingContext = new JavaStreamingContext(sparkConf, Durations.seconds(1));
	streamingContext.checkpoint(checkpointDirectory);
	// Initial state RDD input to mapWithState
	@SuppressWarnings("unchecked")
	List<Tuple2<String, Integer>> tuples = Arrays.asList(new Tuple2<>("hello", 1), new Tuple2<>("world", 1));
	JavaPairRDD<String, Integer> initialRDD = streamingContext.sparkContext().parallelizePairs(tuples);

	JavaReceiverInputDStream<String> StreamingLines = streamingContext.socketTextStream(ip,port, StorageLevels.MEMORY_AND_DISK_SER);

	JavaDStream<String> words = StreamingLines.flatMap(str -> Arrays.asList(str.split(" ")).iterator());

	JavaPairDStream<String, Integer> wordCounts = words.mapToPair(str -> new Tuple2<>(str, 1))
			.reduceByKey((count1, count2) -> count1 + count2);

	// Update the cumulative count function
	Function3<String, Optional<Integer>, State<Integer>, Tuple2<String, Integer>> mappingFunc = new Function3<String, Optional<Integer>, State<Integer>, Tuple2<String, Integer>>() {
		@Override
		public Tuple2<String, Integer> call(String word, Optional<Integer> one, State<Integer> state) {
			int sum = one.orElse(0) + (state.exists() ? state.get() : 0);
			Tuple2<String, Integer> output = new Tuple2<>(word, sum);
			state.update(sum);
			return output;
		}
	};

	// DStream containing the cumulative counts that get updated in every batch
	JavaMapWithStateDStream<String, Integer, Integer, Tuple2<String, Integer>> stateDstream = wordCounts
			.mapWithState(StateSpec.function(mappingFunc).initialState(initialRDD));

	stateDstream.print();
	return streamingContext;
}
 
Example 35
Project: gcp   File: Spark7OffsetsToZK.java
private static JavaPairDStream<String, String> startFromOffsets(JavaStreamingContext jsc, String offsetsInput) {
  Map<TopicAndPartition, Long> map = new HashMap<>();
  for (String partition : offsetsInput.split(",")) {
    String[] offset = partition.split(":");
    map.put(new TopicAndPartition(EXAMPLE_TOPIC, Integer.parseInt(offset[0])), Long.parseLong(offset[1]));
  }

  JavaDStream<String> stream = KafkaUtils.createDirectStream(jsc, String.class, String.class, StringDecoder.class,
      StringDecoder.class, String.class, Collections.singletonMap("metadata.broker.list", KAFKA_HOST_PORT), map,
      msg -> msg.message());
  
  return stream.transformToPair(new ToPairWithOffset<>(str -> str));
}
 
Example 36
Project: gcp   File: Spark3Kafka.java
public static void main(String[] args) throws InterruptedException {
  SparkConf sc = new SparkConf().setAppName("POC-Kafka");
  
  try(JavaStreamingContext jsc = new JavaStreamingContext(sc, new Duration(2000))) {
    
    JavaPairReceiverInputDStream<String, String> stream = KafkaUtils.createStream(
        jsc, ZK_HOST_PORT, "a_group_id", Collections.singletonMap(EXAMPLE_TOPIC, 1));

    JavaDStream<ExampleXML> records = stream.map(t -> t._2()).map(new ParseXML());
    records.foreachRDD(rdd -> System.out.printf("Amount of XMLs: %d\n", rdd.count()));

    jsc.start();
    jsc.awaitTermination();
  }
}
 
Example 37
Project: zipkin-sparkstreaming   File: KafkaStreamFactory.java
@Override public JavaDStream<byte[]> create(JavaStreamingContext jsc) {
  return KafkaUtils.createDirectStream(
      jsc,
      byte[].class,
      byte[].class,
      DefaultDecoder.class,
      DefaultDecoder.class,
      kafkaParams(),
      Collections.singleton(topic()))
      .map(m -> m._2); // get value
}
 
Example 38
Project: assistance-platform-server   File: SparkService.java
@Override
public <T extends Event> JavaDStream<T> getEventReceiverStream(JavaStreamingContext sc,
    Class<T> eventType) {
  UserFilteredMessagingServiceReceiver<T> messagingReceiver =
      new UserFilteredMessagingServiceReceiver<T>(bundle.getModuleId(),
          PlatformClientFactory.getInstance().getUsedHost(), eventType);

  JavaDStream<T> stream = sc.receiverStream(messagingReceiver);

  return stream;
}
 
Example 39
Project: zipkin-aws   File: KinesisStreamFactory.java
@Override
public JavaDStream<byte[]> create(JavaStreamingContext jsc) {
  if (awsAccessKeyId != null) {
    return KinesisUtils.createStream(
        jsc,
        app,
        stream,
        endpoint,
        regionName,
        initialPositionInStream,
        checkpointInterval,
        storageLevel,
        awsAccessKeyId,
        awsSecretKey
    );
  }
  return KinesisUtils.createStream(
      jsc,
      app,
      stream,
      endpoint,
      regionName,
      initialPositionInStream,
      checkpointInterval,
      storageLevel
  );
}
 
Example 40
Project: incubator-pulsar   File: SparkStreamingPulsarReceiverExample.java
public static void main(String[] args) throws InterruptedException {
    SparkConf conf = new SparkConf().setMaster("local[*]").setAppName("pulsar-spark");
    JavaStreamingContext jssc = new JavaStreamingContext(conf, Durations.seconds(5));

    ClientConfiguration clientConf = new ClientConfiguration();
    ConsumerConfiguration consConf = new ConsumerConfiguration();
    String url = "pulsar://localhost:6650/";
    String topic = "persistent://sample/standalone/ns1/topic1";
    String subs = "sub1";

    JavaReceiverInputDStream<byte[]> msgs = jssc
            .receiverStream(new SparkStreamingPulsarReceiver(clientConf, consConf, url, topic, subs));

    JavaDStream<Integer> isContainingPulsar = msgs.flatMap(new FlatMapFunction<byte[], Integer>() {
        @Override
        public Iterator<Integer> call(byte[] msg) {
            return Arrays.asList(((new String(msg)).indexOf("Pulsar") != -1) ? 1 : 0).iterator();
        }
    });

    JavaDStream<Integer> numOfPulsar = isContainingPulsar.reduce(new Function2<Integer, Integer, Integer>() {
        @Override
        public Integer call(Integer i1, Integer i2) {
            return i1 + i2;
        }
    });

    numOfPulsar.print();

    jssc.start();
    jssc.awaitTermination();
}
 
Example 41
Project: incubator-pirk   File: ComputeStreamingResponse.java
/**
 * Method to perform the query given an input JavaDStream of JSON
 * 
 */
public void performQuery(JavaDStream<MapWritable> input)
{
  logger.info("Performing query: ");

  // Process non-overlapping windows of data of duration windowLength seconds
  // If we are using queue streams, there is no need to window
  if (!useQueueStream)
  {
    input = input.window(Durations.seconds(windowLength), Durations.seconds(windowLength));
  }

  // Extract the selectors for each dataElement based upon the query type
  // and perform a keyed hash of the selectors
  JavaPairDStream<Integer,List<BigInteger>> selectorHashToDocRDD = input.mapToPair(new HashSelectorsAndPartitionData(bVars));

  // Group by hashed selector (row) -- can combine with the line above, separating for testing and benchmarking...
  JavaPairDStream<Integer,Iterable<List<BigInteger>>> selectorGroupRDD = selectorHashToDocRDD.groupByKey();

  // Calculate the encrypted row values for each row, emit <colNum, colVal> for each row
  JavaPairDStream<Long,BigInteger> encRowRDD = selectorGroupRDD.flatMapToPair(new EncRowCalc(accum, bVars));

  // Multiply the column values by colNum: emit <colNum, finalColVal> and write the final result object
  encryptedColumnCalc(encRowRDD);

  // Start the streaming computation
  start();
}
 
Example 42
Project: envelope   File: StreamingStep.java
@SuppressWarnings("rawtypes")
public JavaDStream<?> getStream() throws Exception {
  JavaDStream stream = ((StreamInput)getInput()).getDStream();
  
  if (doesRepartition()) {
    stream = repartition(stream);
  }

  return stream;
}
 
Example 43
Project: envelope   File: DummyStreamInput.java
@Override
public JavaDStream<Long> getDStream() throws Exception {
  List<Long> list = Lists.newArrayList();
  for (int i = 0; i < rowsPerBatch; i++) {
    list.add(counter++);
  }
  JavaRDD<Long> longs = Contexts.getJavaStreamingContext().sparkContext().parallelize(list);
  Queue<JavaRDD<Long>> queue = Queues.newLinkedBlockingQueue();
  queue.add(longs);
  LOG.info("Created stream queue with {} rows", list.size());
  return Contexts.getJavaStreamingContext().queueStream(queue, true);
}
 
Example 44
Project: iot-traffic-monitor   File: IoTTrafficDataProcessor.java
/**
 * Method to get total traffic counts of different types of vehicles for each route.
 * 
 * @param filteredIotDataStream IoT data stream
 */
public void processTotalTrafficData(JavaDStream<IoTData> filteredIotDataStream) {

	// We need to get the count of vehicles grouped by routeId and vehicleType
	JavaPairDStream<AggregateKey, Long> countDStreamPair = filteredIotDataStream
			.mapToPair(iot -> new Tuple2<>(new AggregateKey(iot.getRouteId(), iot.getVehicleType()), 1L))
			.reduceByKey((a, b) -> a + b);
	
	// Need to keep state for total count
	JavaMapWithStateDStream<AggregateKey, Long, Long, Tuple2<AggregateKey, Long>> countDStreamWithStatePair = countDStreamPair
			.mapWithState(StateSpec.function(totalSumFunc).timeout(Durations.seconds(3600)));//maintain state for one hour

	// Transform to dstream of TrafficData
	JavaDStream<Tuple2<AggregateKey, Long>> countDStream = countDStreamWithStatePair.map(tuple2 -> tuple2);
	JavaDStream<TotalTrafficData> trafficDStream = countDStream.map(totalTrafficDataFunc);

	// Map Cassandra table column
	Map<String, String> columnNameMappings = new HashMap<String, String>();
	columnNameMappings.put("routeId", "routeid");
	columnNameMappings.put("vehicleType", "vehicletype");
	columnNameMappings.put("totalCount", "totalcount");
	columnNameMappings.put("timeStamp", "timestamp");
	columnNameMappings.put("recordDate", "recorddate");

	// call CassandraStreamingJavaUtil function to save in DB
	javaFunctions(trafficDStream).writerBuilder("traffickeyspace", "total_traffic",
			CassandraJavaUtil.mapToRow(TotalTrafficData.class, columnNameMappings)).saveToCassandra();
}
 
Example 45
Project: iot-traffic-monitor   File: IoTTrafficDataProcessor.java
/**
 * Method to get the vehicles which are within the radius of the POI and their distance from the POI.
 * 
 * @param nonFilteredIotDataStream original IoT data stream
 * @param broadcastPOIValues variable containing POI coordinates, route and vehicle types to monitor.
 */
public void processPOIData(JavaDStream<IoTData> nonFilteredIotDataStream,Broadcast<Tuple3<POIData, String, String>> broadcastPOIValues) {
	 
	// Filter by routeId,vehicleType and in POI range
	JavaDStream<IoTData> iotDataStreamFiltered = nonFilteredIotDataStream
			.filter(iot -> (iot.getRouteId().equals(broadcastPOIValues.value()._2())
					&& iot.getVehicleType().contains(broadcastPOIValues.value()._3())
					&& GeoDistanceCalculator.isInPOIRadius(Double.valueOf(iot.getLatitude()),
							Double.valueOf(iot.getLongitude()), broadcastPOIValues.value()._1().getLatitude(),
							broadcastPOIValues.value()._1().getLongitude(),
							broadcastPOIValues.value()._1().getRadius())));

	// pair with poi
	JavaPairDStream<IoTData, POIData> poiDStreamPair = iotDataStreamFiltered
			.mapToPair(iot -> new Tuple2<>(iot, broadcastPOIValues.value()._1()));

	// Transform to dstream of POITrafficData
	JavaDStream<POITrafficData> trafficDStream = poiDStreamPair.map(poiTrafficDataFunc);

	// Map Cassandra table column
	Map<String, String> columnNameMappings = new HashMap<String, String>();
	columnNameMappings.put("vehicleId", "vehicleid");
	columnNameMappings.put("distance", "distance");
	columnNameMappings.put("vehicleType", "vehicletype");
	columnNameMappings.put("timeStamp", "timestamp");

	// call CassandraStreamingJavaUtil function to save in DB
	javaFunctions(trafficDStream)
			.writerBuilder("traffickeyspace", "poi_traffic",CassandraJavaUtil.mapToRow(POITrafficData.class, columnNameMappings))
			.withConstantTTL(120)//keeping data for 2 minutes
			.saveToCassandra();
}
 
Example 46
Project: beam   File: SparkGroupAlsoByWindowViaWindowSet.java
public static <K, InputT, W extends BoundedWindow>
    JavaDStream<WindowedValue<KV<K, Iterable<InputT>>>> groupAlsoByWindow(
        final JavaDStream<WindowedValue<KV<K, Iterable<WindowedValue<InputT>>>>> inputDStream,
        final Coder<K> keyCoder,
        final Coder<WindowedValue<InputT>> wvCoder,
        final WindowingStrategy<?, W> windowingStrategy,
        final SerializablePipelineOptions options,
        final List<Integer> sourceIds,
        final String transformFullName) {

  final PairDStreamFunctions<ByteArray, byte[]> pairDStream =
      buildPairDStream(inputDStream, keyCoder, wvCoder);

  // use updateStateByKey to scan through the state and update elements and timers.
  final UpdateStateByKeyFunction<K, InputT, W> updateFunc =
      new UpdateStateByKeyFunction<>(
          sourceIds,
          windowingStrategy,
          (FullWindowedValueCoder<InputT>) wvCoder, keyCoder, options, transformFullName
      );

  final DStream<
          Tuple2</*K*/ ByteArray, Tuple2<StateAndTimers, /*WV<KV<K, Itr<I>>>*/ List<byte[]>>>>
      firedStream =
          pairDStream.updateStateByKey(
              updateFunc,
              pairDStream.defaultPartitioner(pairDStream.defaultPartitioner$default$1()),
              true,
              JavaSparkContext$.MODULE$.<Tuple2<StateAndTimers, List<byte[]>>>fakeClassTag());

  checkpointIfNeeded(firedStream, options);

  // filter state-only output (nothing to fire) and remove the state from the output.
  return stripStateValues(firedStream, keyCoder, (FullWindowedValueCoder<InputT>) wvCoder);
}
 
Example 47
Project: beam   File: TranslationUtils.java
/** Transform a pair stream into a value stream. */
public static <T1, T2> JavaDStream<T2> dStreamValues(JavaPairDStream<T1, T2> pairDStream) {
  return pairDStream.map(
      new Function<Tuple2<T1, T2>, T2>() {
        @Override
        public T2 call(Tuple2<T1, T2> v1) throws Exception {
          return v1._2();
        }
      });
}
 
Example 48
Project: beam   File: StreamingTransformTranslator.java
private static <T, W extends BoundedWindow> TransformEvaluator<Window.Assign<T>> window() {
  return new TransformEvaluator<Window.Assign<T>>() {
    @Override
    public void evaluate(final Window.Assign<T> transform, EvaluationContext context) {
      @SuppressWarnings("unchecked")
      UnboundedDataset<T> unboundedDataset =
          ((UnboundedDataset<T>) context.borrowDataset(transform));
      JavaDStream<WindowedValue<T>> dStream = unboundedDataset.getDStream();
      JavaDStream<WindowedValue<T>> outputStream;
      if (TranslationUtils.skipAssignWindows(transform, context)) {
        // do nothing.
        outputStream = dStream;
      } else {
        outputStream = dStream.transform(
            new Function<JavaRDD<WindowedValue<T>>, JavaRDD<WindowedValue<T>>>() {
          @Override
          public JavaRDD<WindowedValue<T>> call(JavaRDD<WindowedValue<T>> rdd) throws Exception {
            return rdd.map(new SparkAssignWindowFn<>(transform.getWindowFn()));
          }
        });
      }
      context.putDataset(transform,
          new UnboundedDataset<>(outputStream, unboundedDataset.getStreamSources()));
    }

    @Override
    public String toNativeString() {
      return "map(new <windowFn>())";
    }
  };
}
 
Example 49
Project: beam   File: StreamingTransformTranslator.java
private static <K, V, W extends BoundedWindow> TransformEvaluator<Reshuffle<K, V>> reshuffle() {
  return new TransformEvaluator<Reshuffle<K, V>>() {
    @Override
    public void evaluate(Reshuffle<K, V> transform, EvaluationContext context) {
      @SuppressWarnings("unchecked") UnboundedDataset<KV<K, V>> inputDataset =
          (UnboundedDataset<KV<K, V>>) context.borrowDataset(transform);
      List<Integer> streamSources = inputDataset.getStreamSources();
      JavaDStream<WindowedValue<KV<K, V>>> dStream = inputDataset.getDStream();
      @SuppressWarnings("unchecked")
      final KvCoder<K, V> coder = (KvCoder<K, V>) context.getInput(transform).getCoder();
      @SuppressWarnings("unchecked")
      final WindowingStrategy<?, W> windowingStrategy =
          (WindowingStrategy<?, W>) context.getInput(transform).getWindowingStrategy();
      @SuppressWarnings("unchecked")
      final WindowFn<Object, W> windowFn = (WindowFn<Object, W>) windowingStrategy.getWindowFn();

      final WindowedValue.WindowedValueCoder<V> wvCoder =
          WindowedValue.FullWindowedValueCoder.of(coder.getValueCoder(), windowFn.windowCoder());

      JavaDStream<WindowedValue<KV<K, V>>> reshuffledStream =
          dStream.transform(new Function<JavaRDD<WindowedValue<KV<K, V>>>,
              JavaRDD<WindowedValue<KV<K, V>>>>() {
            @Override
            public JavaRDD<WindowedValue<KV<K, V>>> call(
                JavaRDD<WindowedValue<KV<K, V>>> rdd) throws Exception {
              return GroupCombineFunctions.reshuffle(rdd, coder.getKeyCoder(), wvCoder);
            }
          });

      context.putDataset(transform, new UnboundedDataset<>(reshuffledStream, streamSources));
    }

    @Override public String toNativeString() {
      return "repartition(...)";
    }
  };
}
 
Example 50
Project: beam   File: SparkUnboundedSource.java   Source Code and License
private static void checkpointStream(JavaDStream<?> dStream,
                                     SparkPipelineOptions options) {
  long checkpointDurationMillis = options.getCheckpointDurationMillis();
  if (checkpointDurationMillis > 0) {
    dStream.checkpoint(new Duration(checkpointDurationMillis));
  }
}
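A DStream-level checkpoint interval only takes effect if the streaming context also has a checkpoint directory configured. A minimal, hypothetical call site is sketched below; the directory path and the 10-second interval are illustrative values, not taken from the Beam runner:

import org.apache.spark.streaming.Duration;
import org.apache.spark.streaming.api.java.JavaDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;

// Hypothetical sketch: pair a context-level checkpoint directory
// with a per-stream checkpoint interval.
static void enableCheckpointing(JavaStreamingContext jsc, JavaDStream<String> stream) {
  jsc.checkpoint("/tmp/spark-checkpoints"); // illustrative path
  stream.checkpoint(new Duration(10_000));  // interval in milliseconds
}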
 
Example 51
Project: nats-connector-spark   File: UnitTestUtilities.java   Source Code and License
public static JavaPairDStream<String, String> getJavaPairDStream(final File tempDir, final JavaStreamingContext ssc, final String subject1) {
	final JavaDStream<String> lines = ssc.textFileStream(tempDir.getAbsolutePath());
	JavaPairDStream<String, String> keyValues = lines.mapToPair(
			(PairFunction<String, String, String>) str -> new Tuple2<String, String>(subject1 + "." + str, str));
	return keyValues;
}
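In a test this helper is typically combined with writing a file into tempDir after the context has started, because textFileStream only picks up files created after start(). A hedged sketch of such a caller follows; the subject, file name and timeout are placeholders:

import java.io.File;
import java.nio.file.Files;
import org.apache.spark.streaming.api.java.JavaPairDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;

// Sketch only: build the keyed stream, start the context, then drop a file
// into the watched directory so textFileStream has something to read.
static void runKeyedFileStream(JavaStreamingContext ssc, File tempDir) throws Exception {
	JavaPairDStream<String, String> keyValues =
			UnitTestUtilities.getJavaPairDStream(tempDir, ssc, "nats.subject");
	keyValues.print();
	ssc.start();
	Files.write(new File(tempDir, "data.txt").toPath(), "hello".getBytes());
	ssc.awaitTerminationOrTimeout(5_000);
}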
 
Example 52
Project: nats-connector-spark   File: AbstractNatsToSparkTest.java   Source Code and License
protected void validateTheReceptionOfMessages(JavaStreamingContext ssc,
		JavaReceiverInputDStream<String> stream) throws InterruptedException {
	JavaDStream<String> messages = stream.repartition(3);

	ExecutorService executor = Executors.newFixedThreadPool(6);

	final int nbOfMessages = 5;
	NatsPublisher np = getNatsPublisher(nbOfMessages);
	
	if (logger.isDebugEnabled()) {
		messages.print();
	}
	
	messages.foreachRDD(new VoidFunction<JavaRDD<String>>() {
		private static final long serialVersionUID = 1L;

		@Override
		public void call(JavaRDD<String> rdd) throws Exception {
			logger.debug("RDD received: {}", rdd.collect());
			
			final long count = rdd.count();
			if ((count != 0) && (count != nbOfMessages)) {
				rightNumber = false;
				logger.error("The number of messages received should have been {} instead of {}.", nbOfMessages, count);
			}
			
			TOTAL_COUNT.getAndAdd((int) count);
			
			atLeastSomeData = atLeastSomeData || (count > 0);
			
			for (String str : rdd.collect()) {
				if (!str.startsWith(NatsPublisher.NATS_PAYLOAD)) {
					payload = str;
				}
			}
		}			
	});
	
	closeTheValidation(ssc, executor, nbOfMessages, np);		
}
 
Example 53
Project: nats-connector-spark   File: AbstractNatsToSparkTest.java   Source Code and License
protected void validateTheReceptionOfIntegerMessages(JavaStreamingContext ssc, 
		JavaReceiverInputDStream<Integer> stream) throws InterruptedException {
	JavaDStream<Integer> messages = stream.repartition(3);

	ExecutorService executor = Executors.newFixedThreadPool(6);

	final int nbOfMessages = 5;
	NatsPublisher np = getNatsPublisher(nbOfMessages);
	
	if (logger.isDebugEnabled()) {
		messages.print();
	}
	
	messages.foreachRDD(new VoidFunction<JavaRDD<Integer>>() {
		private static final long serialVersionUID = 1L;

		@Override
		public void call(JavaRDD<Integer> rdd) throws Exception {
			logger.debug("RDD received: {}", rdd.collect());
			
			final long count = rdd.count();
			if ((count != 0) && (count != nbOfMessages)) {
				rightNumber = false;
				logger.error("The number of messages received should have been {} instead of {}.", nbOfMessages, count);
			}
			
			TOTAL_COUNT.getAndAdd((int) count);
			
			atLeastSomeData = atLeastSomeData || (count > 0);
			
			for (Integer value : rdd.collect()) {
				if (value < NatsPublisher.NATS_PAYLOAD_INT) {
					payload = value.toString();
				}
			}
		}			
	});
	
	closeTheValidation(ssc, executor, nbOfMessages, np);
}
 
Example 54
Project: nats-connector-spark   File: SparkToStandardNatsConnectorLifecycleTest.java   Source Code and License
protected void publishToNats(final String subject1, final String subject2, final int partitionsNb) {
	final JavaDStream<String> lines = ssc.textFileStream(tempDir.getAbsolutePath()).repartition(partitionsNb);		
	
	SparkToNatsConnectorPool
		.newPool()
		.withNatsURL(NATS_SERVER_URL)
		.withConnectionTimeout(Duration.ofSeconds(2))
		.withSubjects(DEFAULT_SUBJECT, subject1, subject2)
		.publishToNats(lines);
}
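The same connector-pool builder accepts any JavaDStream<String>, not only file streams. The sketch below assumes a local NATS server and a socket text source; the host, port, URL and subject are illustrative, and SparkToNatsConnectorPool is the same class used above:

// Sketch only: publish lines read from a socket instead of a watched directory.
void publishSocketLinesToNats(JavaStreamingContext ssc) {
	final JavaDStream<String> lines = ssc.socketTextStream("localhost", 9999);
	SparkToNatsConnectorPool
		.newPool()
		.withNatsURL("nats://localhost:4222")
		.withConnectionTimeout(Duration.ofSeconds(2))
		.withSubjects("spark.output")
		.publishToNats(lines);
}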
 
Example 55
Project: federator   File: SparkStreaming.java   Source Code and License
@Override
public String insert(Entity entity, final Set<Value> values) throws ParseException, Exception {

	JavaDStream<Value> cache = lines.flatMap(new FlatMapFunction<String, Value>() {

		@Override
		public Iterable<Value> call(String x) {
			return values;
		}
	});

	cache.persist();
	
	return entity.getId();
}
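persist() with no arguments caches the stream at the DStream default storage level; when the cached values are large, an explicit level can be passed instead. A one-line sketch continuing the method above (the storage-level choice is illustrative):

	// Sketch: replace the bare persist() call with an explicit storage level.
	// StorageLevels is org.apache.spark.api.java.StorageLevels.
	cache.persist(StorageLevels.MEMORY_AND_DISK);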
 
Example 56
Project: federator   File: SparkRedisStreaming.java   Source Code and License
@Override
public String insert(Entity entity, final Set<Value> values) throws ParseException, Exception {

	JavaDStream<Value> cache = lines.flatMap(new FlatMapFunction<String, Value>() {

		@Override
		public Iterable<Value> call(String x) {
			return values;
		}
	});

	cache.persist();
			
	return entity.getId();
}
 
Example 57
Project: StreamBench   File: StreamKMeans.java   Source Code and License
public static void main(String[] args) {

//        String inputFile = StreamKMeans.class.getClassLoader().getResource("centroids.txt").getFile();
        SparkConf sparkConf = new SparkConf().setMaster("spark://master:7077").setAppName("JavaKMeans");

        JavaStreamingContext jssc = new JavaStreamingContext(sparkConf, Durations.milliseconds(1000));

        HashSet<String> topicsSet = new HashSet<>();
        topicsSet.add("KMeans");
        HashMap<String, String> kafkaParams = new HashMap<>();
//        kafkaParams.put("metadata.broker.list", "kafka1:9092,kafka2:9092,kafka3:9092");
        kafkaParams.put("metadata.broker.list", "localhost:9092");
        kafkaParams.put("auto.offset.reset", "largest");
        kafkaParams.put("zookeeper.connect", "zoo1:2181");
        kafkaParams.put("group.id", "spark");

        // Create direct kafka stream with brokers and topics
        JavaPairInputDStream<String, String> lines = KafkaUtils.createDirectStream(
                jssc,
                String.class,
                String.class,
                StringDecoder.class,
                StringDecoder.class,
                kafkaParams,
                topicsSet
        );

        JavaDStream<Vector> points = lines.map(new ParseKafkaString()).map(new ParsePoint());

        Vector[] initCentroids = loadInitCentroids();
        double[] weights = new double[96];
        for (int i = 0; i < 96; i++) {
            weights[i] = 1.0 / 96;
        }

        final StreamingKMeans model = new StreamingKMeans()
                .setK(96)
                .setDecayFactor(0)
                .setInitialCenters(initCentroids, weights);

        model.trainOn(points);

        points.foreachRDD(new Function2<JavaRDD<Vector>, Time, Void>() {
            @Override
            public Void call(JavaRDD<Vector> vectorJavaRDD, Time time) throws Exception {
                Vector[] vector = model.latestModel().clusterCenters();
                for (int i = 0; i < vector.length; i++) {
                    logger.warn(vector[i].toArray()[0] + "\t" + vector[i].toArray()[1]);
                }
                return null;
            }
        });

        jssc.addStreamingListener(new PerformanceStreamingListener());
        jssc.start();
        jssc.awaitTermination();
    }
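The foreachRDD(Function2<JavaRDD<Vector>, Time, Void>) form used above is the older Java API that requires returning null. On Spark 2.x the same logging can be written with the VoidFunction2 overload and a lambda; a behaviour-equivalent sketch, reusing points, model and logger from the example:

        // Sketch (Spark 2.x API): no Void return value is needed.
        points.foreachRDD((rdd, time) -> {
            for (Vector center : model.latestModel().clusterCenters()) {
                logger.warn(center.toArray()[0] + "\t" + center.toArray()[1]);
            }
        });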
 
Example 58
Project: StreamBench   File: SparkWorkloadOperator.java   Source Code and License
@Override
public WindowedWorkloadOperator<T> window(TimeDurations windowDuration,
                                          TimeDurations slideDuration) {
    Duration windowDurations = Utils.timeDurationsToSparkDuration(windowDuration);
    Duration slideDurations = Utils.timeDurationsToSparkDuration(slideDuration);

    JavaDStream<T> windowedStream = dStream.window(windowDurations, slideDurations);
    return new SparkWindowedWorkloadOperator<>(windowedStream, parallelism);
}
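The two-argument window(windowDuration, slideDuration) used above builds sliding windows directly on the DStream. A small illustrative sketch on a plain string stream; the durations and the 'lines' stream are placeholders:

import org.apache.spark.streaming.Durations;
import org.apache.spark.streaming.api.java.JavaDStream;

// Sketch: a 30-second window recomputed every 10 seconds, counted per window.
// Both durations must be multiples of the batch interval.
JavaDStream<String> windowed = lines.window(Durations.seconds(30), Durations.seconds(10));
windowed.count().print();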
 
Example 59
Project: StreamBench   File: SparkOperatorCreater.java   Source Code and License
@Override
public SparkWorkloadOperator<WithTime<String>> stringStreamFromKafkaWithTime(String zkConStr,
                                                                             String kafkaServers,
                                                                             String group,
                                                                             String topics,
                                                                             String offset,
                                                                             String componentId,
                                                                             int parallelism) {
    HashSet<String> topicsSet = new HashSet<>(Arrays.asList(topics.split(",")));
    HashMap<String, String> kafkaParams = new HashMap<>();
    kafkaParams.put("metadata.broker.list", kafkaServers);
    kafkaParams.put("auto.offset.reset", offset);
    kafkaParams.put("zookeeper.connect", zkConStr);
    kafkaParams.put("group.id", group);

    // Create direct kafka stream with brokers and topics
    JavaPairInputDStream<String, String> messages = KafkaUtils.createDirectStream(
            jssc,
            String.class,
            String.class,
            StringDecoder.class,
            StringDecoder.class,
            kafkaParams,
            topicsSet
    );

    JavaDStream<WithTime<String>> lines = messages.map(mapFunctionWithTime);

    return new SparkWorkloadOperator<>(lines, parallelism);
}
 
Example 60
Project: StreamBench   File: SparkOperatorCreater.java   Source Code and License
@Override
public WorkloadOperator<String> stringStreamFromKafka(String zkConStr,
                                                      String kafkaServers,
                                                      String group,
                                                      String topics,
                                                      String offset,
                                                      String componentId,
                                                      int parallelism) {
    HashSet<String> topicsSet = new HashSet<>(Arrays.asList(topics.split(",")));
    HashMap<String, String> kafkaParams = new HashMap<>();
    kafkaParams.put("metadata.broker.list", kafkaServers);
    kafkaParams.put("auto.offset.reset", offset);
    kafkaParams.put("zookeeper.connect", zkConStr);
    kafkaParams.put("group.id", group);

    // Create direct kafka stream with brokers and topics
    JavaPairDStream<String, String> messages = KafkaUtils.createDirectStream(
            jssc,
            String.class,
            String.class,
            StringDecoder.class,
            StringDecoder.class,
            kafkaParams,
            topicsSet
    );

    JavaDStream<String> lines = messages.map(mapFunction);

    return new SparkWorkloadOperator<>(lines, parallelism);
}
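mapFunction here, like mapFunctionWithTime in Example 59, is a field of SparkOperatorCreater that is not shown in the snippet. A purely hypothetical definition that would type-check against messages.map(mapFunction) is sketched below; the real implementation may differ:

import org.apache.spark.api.java.function.Function;
import scala.Tuple2;

// Hypothetical sketch only; the actual field is defined elsewhere in SparkOperatorCreater.
// It keeps the Kafka message value and drops the key.
private static final Function<Tuple2<String, String>, String> mapFunction =
    new Function<Tuple2<String, String>, String>() {
      @Override
      public String call(Tuple2<String, String> record) {
        return record._2();
      }
    };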