org.apache.spark.streaming.api.java.JavaStreamingContext Java Examples

The following examples show how to use org.apache.spark.streaming.api.java.JavaStreamingContext. Each example is taken from an open-source project; the source file and license are listed above the code.
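Before diving into the examples, it helps to keep the common lifecycle in mind: build a SparkConf, wrap it in a JavaStreamingContext with a batch interval, declare input DStreams and at least one output operation, then call start() and awaitTermination(). A minimal, self-contained sketch (the host, port, and app name are placeholders, not taken from any example below):

import java.util.Arrays;

import org.apache.spark.SparkConf;
import org.apache.spark.streaming.Durations;
import org.apache.spark.streaming.api.java.JavaDStream;
import org.apache.spark.streaming.api.java.JavaReceiverInputDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;

public class MinimalStreamingApp {
    public static void main(String[] args) throws InterruptedException {
        // Two local threads: one for the socket receiver, one for processing.
        SparkConf conf = new SparkConf().setMaster("local[2]").setAppName("MinimalStreamingApp");
        JavaStreamingContext jssc = new JavaStreamingContext(conf, Durations.seconds(5));

        // Declare the input DStream and at least one output operation before start().
        JavaReceiverInputDStream<String> lines = jssc.socketTextStream("localhost", 9999);
        JavaDStream<String> words = lines.flatMap(line -> Arrays.asList(line.split(" ")).iterator());
        words.print();

        jssc.start();             // begin receiving and processing data
        jssc.awaitTermination();  // block until the streaming job is stopped or fails
    }
}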
Example #1
Source File: KafkaStreaming.java    From sparkResearch with Apache License 2.0
public static void main(String[] args) {
    SparkConf sparkConf = new SparkConf().setAppName("KafkaWordCount").setMaster("local[2]");
    JavaStreamingContext streamingContext = new JavaStreamingContext(sparkConf, Durations.seconds(10000));
    //set the checkpoint directory
    streamingContext.checkpoint("HDFS URL");
    Map<String, Integer> topicThread = new HashMap<>(1);
    topicThread.put(TOPIC, THREAD);
    JavaPairInputDStream<String, String> dStream = KafkaUtils.createStream(streamingContext, HOST, GROP, topicThread);

    JavaDStream<String> words = dStream.flatMap((FlatMapFunction<Tuple2<String, String>, String>) stringStringTuple2 -> Arrays.asList(SPACE.split(stringStringTuple2._2)).iterator());

    //count the words
    JavaPairDStream<String, Integer> result = words.mapToPair((PairFunction<String, String, Integer>) s -> new Tuple2<>(s, 1)).reduceByKey((Function2<Integer, Integer, Integer>) (v1, v2) -> v1 + v2);

    try {
        result.print();
        streamingContext.start();
        streamingContext.awaitTermination();
    } catch (InterruptedException e) {
        e.printStackTrace();
    }
}
 
Example #2
Source File: Window.java    From sparkResearch with Apache License 2.0
public static void main(String[] args) {
    SparkConf sparkConf = new SparkConf().setAppName("window").setMaster("local[2]");
    JavaStreamingContext streamingContext = new JavaStreamingContext(sparkConf, Durations.seconds(10));
    //set the checkpoint directory
    streamingContext.checkpoint("hdfs://localhost:9300");

    JavaDStream<String> dStream = streamingContext.socketTextStream("localhost", 8080);

    JavaDStream<String> winDstream = dStream.window(Durations.seconds(30), Durations.seconds(20));

    JavaDStream<Long> result = winDstream.count();
    //at least one output operation is required before start(); print the windowed counts
    result.print();

    try {
        streamingContext.start();
        streamingContext.awaitTermination();
    } catch (InterruptedException e) {
        e.printStackTrace();
    }
}
 
Example #3
Source File: StreamingRsvpsDStreamCountWindow.java    From -Data-Stream-Development-with-Apache-Spark-Kafka-and-Spring-Boot with MIT License
public static void main(String[] args) throws InterruptedException {

        System.setProperty("hadoop.home.dir", HADOOP_HOME_DIR_VALUE);

        final SparkConf conf = new SparkConf()
                .setMaster(RUN_LOCAL_WITH_AVAILABLE_CORES)
                .setAppName(APPLICATION_NAME)
                .set("spark.mongodb.output.uri", MONGODB_OUTPUT_URI)
                .set("spark.streaming.kafka.consumer.cache.enabled", "false");

        final JavaStreamingContext streamingContext
                = new JavaStreamingContext(conf, new Duration(BATCH_DURATION_INTERVAL_MS));

        streamingContext.checkpoint(CHECKPOINT_FOLDER);

        final JavaInputDStream<ConsumerRecord<String, String>> meetupStream =
                KafkaUtils.createDirectStream(
                        streamingContext,
                        LocationStrategies.PreferConsistent(),
                        ConsumerStrategies.<String, String>Subscribe(TOPICS, KAFKA_CONSUMER_PROPERTIES)
                );
                
        // transformations, streaming algorithms, etc
        JavaDStream<Long> countStream  
            = meetupStream.countByWindow(
                 new Duration(WINDOW_LENGTH_MS), 
                 new Duration(SLIDING_INTERVAL_MS));        

        countStream.foreachRDD((JavaRDD<Long> countRDD) -> {                
            MongoSpark.save(        
                    countRDD.map(
                        r -> Document.parse("{\"rsvps_count\":\"" + String.valueOf(r) + "\"}")
                    )
            );            
        });
        
        // some time later, after outputs have completed
        meetupStream.foreachRDD((JavaRDD<ConsumerRecord<String, String>> meetupRDD) -> {        
            OffsetRange[] offsetRanges = ((HasOffsetRanges) meetupRDD.rdd()).offsetRanges();            

            ((CanCommitOffsets) meetupStream.inputDStream())
                .commitAsync(offsetRanges, new MeetupOffsetCommitCallback());
        });
        
        streamingContext.start();
        streamingContext.awaitTermination();    
    }
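Example #3 registers a MeetupOffsetCommitCallback with commitAsync but does not include the class itself. The callback type is Kafka's org.apache.kafka.clients.consumer.OffsetCommitCallback; a minimal sketch of what such a class might look like (the body is an assumption, the original implementation is not shown in this listing):

import java.util.Map;

import org.apache.kafka.clients.consumer.OffsetAndMetadata;
import org.apache.kafka.clients.consumer.OffsetCommitCallback;
import org.apache.kafka.common.TopicPartition;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class MeetupOffsetCommitCallback implements OffsetCommitCallback {

    private static final Logger LOG = LoggerFactory.getLogger(MeetupOffsetCommitCallback.class);

    @Override
    public void onComplete(Map<TopicPartition, OffsetAndMetadata> offsets, Exception exception) {
        if (exception != null) {
            // The async commit failed; the offsets will simply be committed again on a later batch.
            LOG.error("Kafka offset commit failed", exception);
        } else {
            LOG.debug("Committed Kafka offsets: {}", offsets);
        }
    }
}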
 
Example #4
Source File: SparkRunnerStreamingContextFactory.java    From beam with Apache License 2.0
private void checkpoint(JavaStreamingContext jssc, CheckpointDir checkpointDir) {
  Path rootCheckpointPath = checkpointDir.getRootCheckpointDir();
  Path sparkCheckpointPath = checkpointDir.getSparkCheckpointDir();
  Path beamCheckpointPath = checkpointDir.getBeamCheckpointDir();

  try {
    FileSystem fileSystem =
        rootCheckpointPath.getFileSystem(jssc.sparkContext().hadoopConfiguration());
    if (!fileSystem.exists(rootCheckpointPath)) {
      fileSystem.mkdirs(rootCheckpointPath);
    }
    if (!fileSystem.exists(sparkCheckpointPath)) {
      fileSystem.mkdirs(sparkCheckpointPath);
    }
    if (!fileSystem.exists(beamCheckpointPath)) {
      fileSystem.mkdirs(beamCheckpointPath);
    }
  } catch (IOException e) {
    throw new RuntimeException("Failed to create checkpoint dir", e);
  }

  jssc.checkpoint(sparkCheckpointPath.toString());
}
 
Example #5
Source File: SparkStreamingBinding.java    From datacollector with Apache License 2.0
@Override
@SuppressWarnings("unchecked")
public JavaStreamingContext create() {
  sparkConf.set("spark.streaming.kafka.maxRatePerPartition", String.valueOf(maxRatePerPartition));
  // Use our classpath first, since we ship a newer version of Jackson and possibly other deps in the future.
  sparkConf.set("spark.driver.userClassPathFirst", "true");
  sparkConf.set("spark.executor.userClassPathFirst", "true");

  session = SparkSession.builder().config(sparkConf).getOrCreate();

  JavaStreamingContext result =
      new JavaStreamingContext(new JavaSparkContext(session.sparkContext()), new Duration(duration));
  Map<String, Object> props = new HashMap<>();

  props.put("group.id", groupId);
  props.put("key.deserializer", "org.apache.kafka.common.serialization.ByteArrayDeserializer");
  props.put("value.deserializer", "org.apache.kafka.common.serialization.ByteArrayDeserializer");
  for (Map.Entry<String, Object> map : props.entrySet()) {
    logMessage(Utils.format("Adding extra kafka config, {}:{}", map.getKey(), map.getValue()), isRunningInMesos);
  }

  logMessage("Meta data broker list " + metaDataBrokerList, isRunningInMesos);
  logMessage("Topic is " + topic, isRunningInMesos);
  logMessage("Auto offset reset is set to " + autoOffsetValue, isRunningInMesos);
  return createDStream(result, props);
}
 
Example #6
Source File: ReduceByKeyAndWindow.java    From sparkResearch with Apache License 2.0
public static void main(String[] args) {
    SparkConf sparkConf = new SparkConf().setAppName("reduceByKeyAndWindow").setMaster("local[2]");
    JavaStreamingContext streamingContext = new JavaStreamingContext(sparkConf, Durations.seconds(10));
    //set the checkpoint directory
    streamingContext.checkpoint("hdfs://localhost:9300");
    //data source
    JavaDStream<String> dStream = streamingContext.socketTextStream("localhost", 8080);

    JavaPairDStream<String, Long> ipPairDstream = dStream.mapToPair(new GetIp());

    JavaPairDStream<String, Long> result = ipPairDstream.reduceByKeyAndWindow(new AddLongs(),
            new SubtractLongs(), Durations.seconds(30), Durations.seconds(10));

    try {
        streamingContext.start();
        streamingContext.awaitTermination();
    } catch (InterruptedException e) {
        e.printStackTrace();
    }
}
 
Example #7
Source File: SparkStreamDemo.java    From sparkResearch with Apache License 2.0
public static void main(String[] args) {
    //run locally with two cores; batch interval of 1 second
    SparkConf conf = new SparkConf().setMaster("local[2]").setAppName("sparkStreamIng");
    JavaStreamingContext javaStreamingContext = new JavaStreamingContext(conf, Durations.seconds(1));
    //create a DStream connected to localhost:8080
    JavaReceiverInputDStream<String> dStream = javaStreamingContext.socketTextStream("localhost", 8080);
    JavaDStream<String> errorLine = dStream.filter(new Function<String, Boolean>() {
        @Override
        public Boolean call(String v1) throws Exception {
            return v1.contains("error");
        }
    });
    //print the lines that contain "error"
    errorLine.print();
    try {
        //start the computation
        javaStreamingContext.start();
        //wait for the computation to terminate
        javaStreamingContext.awaitTermination();
    } catch (InterruptedException e) {
        e.printStackTrace();
    }
}
 
Example #8
Source File: StreamingIngestionFileSystemTextFileToDataframeMultipleClassesApp.java    From net.jgp.labs.spark with Apache License 2.0
private void start() {
  // Create a local StreamingContext with two working threads and a batch
  // interval of 5 seconds
  SparkConf conf = new SparkConf().setMaster("local[2]").setAppName(
      "Streaming Ingestion File System Text File to Dataframe");
  JavaStreamingContext jssc = new JavaStreamingContext(conf, Durations
      .seconds(5));

  JavaDStream<String> msgDataStream = jssc.textFileStream(StreamingUtils
      .getInputDirectory());

  msgDataStream.print();
  // Create JavaRDD<Row>
  msgDataStream.foreachRDD(new RowProcessor());

  jssc.start();
  try {
    jssc.awaitTermination();
  } catch (InterruptedException e) {
    // TODO Auto-generated catch block
    e.printStackTrace();
  }
}
 
Example #9
Source File: StreamingIngestionFileSystemTextFileApp.java    From net.jgp.labs.spark with Apache License 2.0
private void start() {
  // Create a local StreamingContext with two working threads and a batch
  // interval of 5 seconds
  SparkConf conf = new SparkConf().setMaster("local[2]").setAppName(
      "NetworkWordCount");
  JavaStreamingContext jssc = new JavaStreamingContext(conf, Durations
      .seconds(5));

  JavaDStream<String> msgDataStream = jssc.textFileStream(StreamingUtils
      .getInputDirectory());
  msgDataStream.print();

  jssc.start();
  try {
    jssc.awaitTermination();
  } catch (InterruptedException e) {
    // TODO Auto-generated catch block
    e.printStackTrace();
  }
}
 
Example #10
Source File: WordCountRecoverableEx.java    From Apache-Spark-2x-for-Java-Developers with MIT License
public static void main(String[] args) throws Exception {
	System.setProperty("hadoop.home.dir", "E:\\hadoop");

	final String ip = "10.0.75.1";
	final int port = Integer.parseInt("9000");
	final String checkpointDirectory = "E:\\hadoop\\checkpoint";
	// Function to create JavaStreamingContext without any output operations
	// (used to detect the new context)
	Function0<JavaStreamingContext> createContextFunc = new Function0<JavaStreamingContext>() {
		@Override
		public JavaStreamingContext call() {
			return createContext(ip, port, checkpointDirectory);
		}
	};

	JavaStreamingContext ssc = JavaStreamingContext.getOrCreate(checkpointDirectory, createContextFunc);
	ssc.start();
	ssc.awaitTermination();
}
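Note: the createContext(ip, port, checkpointDirectory) factory referenced here is defined in the same source file and appears as Example #23 below. JavaStreamingContext.getOrCreate only invokes the factory when no checkpoint data exists in checkpointDirectory; otherwise the context, including its DStream graph, is rebuilt from the checkpoint.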
 
Example #11
Source File: StreamingService.java    From cxf with Apache License 2.0
private void processStreamOneWay(List<String> inputStrings) {
    try {
        SparkConf sparkConf = new SparkConf().setMaster("local[*]")
            .setAppName("JAX-RS Spark Connect OneWay " + SparkUtils.getRandomId());
        JavaStreamingContext jssc = new JavaStreamingContext(sparkConf, Durations.seconds(1));

        JavaDStream<String> receiverStream = null;
        if ("queue".equals(receiverType)) {
            Queue<JavaRDD<String>> rddQueue = new LinkedList<>();
            for (int i = 0; i < 30; i++) {
                rddQueue.add(jssc.sparkContext().parallelize(inputStrings));
            }
            receiverStream = jssc.queueStream(rddQueue);
        } else {
            receiverStream = jssc.receiverStream(new StringListReceiver(inputStrings));
        }

        JavaPairDStream<String, Integer> wordCounts = SparkUtils.createOutputDStream(receiverStream, false);
        wordCounts.foreachRDD(new PrintOutputFunction(jssc));
        jssc.start();
    } catch (Exception ex) {
        // ignore
    }
}
 
Example #12
Source File: SparkStreamServiceImpl.java    From searchanalytics-bigdata with MIT License
@Override
public void setup() {
	// Create a StreamingContext with a SparkConf configuration
	SparkConf sparkConf = new SparkConf(false)
			.setAppName("JaiSpark")
			.setSparkHome("target/sparkhome")
			.setMaster("local")
			.set("spark.executor.memory", "128m")
			.set("spark.local.dir",
					new File("target/sparkhome/tmp").getAbsolutePath())
			.set("spark.cores.max", "2").set("spark.akka.threads", "2")
			.set("spark.akka.timeout", "60").set("spark.logConf", "true")
			.set("spark.cleaner.delay", "3700")
			.set("spark.cleaner.ttl", "86400")
			.set("spark.shuffle.spill", "false")
			.set("spark.driver.host", "localhost")
			.set("spark.driver.port", "43214");
	jssc = new JavaStreamingContext(sparkConf, new Duration(5000));

	String checkpointDir = hadoopClusterService.getHDFSUri()
			+ "/sparkcheckpoint";
	jssc.checkpoint(checkpointDir);
	startFlumeStream();
}
 
Example #13
Source File: BatchUpdateFunction.java    From oryx with Apache License 2.0
BatchUpdateFunction(Config config,
                    Class<K> keyClass,
                    Class<M> messageClass,
                    Class<? extends Writable> keyWritableClass,
                    Class<? extends Writable> messageWritableClass,
                    String dataDirString,
                    String modelDirString,
                    BatchLayerUpdate<K,M,U> updateInstance,
                    JavaStreamingContext streamingContext) {
  this.keyClass = keyClass;
  this.messageClass = messageClass;
  this.keyWritableClass = keyWritableClass;
  this.messageWritableClass = messageWritableClass;
  this.dataDirString = dataDirString;
  this.modelDirString = modelDirString;
  this.updateBroker = ConfigUtils.getOptionalString(config, "oryx.update-topic.broker");
  this.updateTopic = ConfigUtils.getOptionalString(config, "oryx.update-topic.message.topic");
  this.updateInstance = updateInstance;
  this.sparkContext = streamingContext.sparkContext();
}
 
Example #14
Source File: SparkStreaming.java    From kafka-spark-avro-example with Apache License 2.0
private static void processStream(JavaStreamingContext ssc, JavaSparkContext sc) {
  System.out.println("--> Processing stream");

  Map<String, String> props = new HashMap<>();
  props.put("bootstrap.servers", "localhost:9092");
  props.put("schema.registry.url", "http://localhost:8081");
  props.put("group.id", "spark");
  props.put("specific.avro.reader", "true");

  props.put("value.deserializer", "io.confluent.kafka.serializers.KafkaAvroDeserializer");
  props.put("key.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");

  Set<String> topicsSet = new HashSet<>(Collections.singletonList("test"));

  JavaPairInputDStream<String, Object> stream = KafkaUtils.createDirectStream(ssc, String.class, Object.class,
    StringDecoder.class, KafkaAvroDecoder.class, props, topicsSet);

  stream.foreachRDD(rdd -> {
    rdd.foreachPartition(iterator -> {
        while (iterator.hasNext()) {
          Tuple2<String, Object> next = iterator.next();
          Model model = (Model) next._2();
          System.out.println(next._1() + " --> " + model);
        }
      }
    );
  });
}
 
Example #15
Source File: StreamingEngine.java    From spark-streaming-direct-kafka with Apache License 2.0
public void start() {
    SparkConf sparkConf = getSparkConf();
    streamingContext = new JavaStreamingContext(sparkConf,
            Durations.seconds(Long.parseLong(config.getStreamingBatchIntervalInSec())));
    JavaInputDStream<MessageAndMetadata<String, byte[]>> dStream = buildInputDStream(streamingContext);
    JavaPairDStream<String, byte[]> pairDStream = dStream.mapToPair(km -> new Tuple2<>(km.key(), km.message()));

    pairDStream.foreachRDD(new ProcessStreamingData<>(config)); // process data
    dStream.foreachRDD(new UpdateOffsetsFn<>(config.getKafkaGroupId(), config.getZkOffsetManager()));
    streamingContext.start();
}
 
Example #16
Source File: StreamingContextConfiguration.java    From Decision with Apache License 2.0
private void configureDataContext(JavaStreamingContext context) {
    Map<String, Integer> baseTopicMap = new HashMap<>();


    configurationContext.getDataTopics().forEach( dataTopic -> baseTopicMap.put(dataTopic, 1));

    kafkaTopicService.createTopicsIfNotExist(configurationContext.getDataTopics(), configurationContext
            .getKafkaReplicationFactor(), configurationContext.getKafkaPartitions());

    HashMap<String, String> kafkaParams = new HashMap<>();
    kafkaParams.put("zookeeper.connect", configurationContext.getZookeeperHostsQuorumWithPath());
    kafkaParams.put("group.id", configurationContext.getGroupId());
     /*
      groupId must be the cluster groupId. Kafka assigns each partition of a topic to one, and only one, consumer of
      the group.
      Decision topics have only one partition (by default), so if we have two or more Decision instances (consumers)
      reading the same topic with the same groupId, only one instance will be able to read from the topic.
      */
    JavaPairDStream<String, byte[]> messages = KafkaUtils.createStream(context, String.class, byte[].class,
            kafka.serializer.StringDecoder.class, kafka.serializer.DefaultDecoder.class, kafkaParams, baseTopicMap,
            StorageLevel.MEMORY_AND_DISK_SER());

    AvroDeserializeMessageFunction avroDeserializeMessageFunction = new AvroDeserializeMessageFunction();
    JavaDStream<StratioStreamingMessage>  insertRequests = messages.filter(
            new FilterAvroMessagesByOperationFunction(STREAM_OPERATIONS.MANIPULATION.INSERT))
            .map(avroDeserializeMessageFunction);

    InsertIntoStreamFunction insertIntoStreamFunction = new InsertIntoStreamFunction(streamOperationService,
            configurationContext.getZookeeperHostsQuorum());
    insertRequests.foreachRDD(insertIntoStreamFunction);

}
 
Example #17
Source File: AbstractSparkLayer.java    From oryx with Apache License 2.0
protected final JavaInputDStream<ConsumerRecord<K,M>> buildInputDStream(
    JavaStreamingContext streamingContext) {

  Preconditions.checkArgument(
      KafkaUtils.topicExists(inputTopicLockMaster, inputTopic),
      "Topic %s does not exist; did you create it?", inputTopic);
  if (updateTopic != null && updateTopicLockMaster != null) {
    Preconditions.checkArgument(
        KafkaUtils.topicExists(updateTopicLockMaster, updateTopic),
        "Topic %s does not exist; did you create it?", updateTopic);
  }

  String groupID = getGroupID();

  Map<String,Object> kafkaParams = new HashMap<>();
  kafkaParams.put("group.id", groupID);
  // Don't re-consume old messages from input by default
  kafkaParams.put("auto.offset.reset", "latest"); // Ignored by Kafka 0.10 Spark integration
  kafkaParams.put("bootstrap.servers", inputBroker);
  kafkaParams.put("key.deserializer", keyDecoderClass.getName());
  kafkaParams.put("value.deserializer", messageDecoderClass.getName());

  LocationStrategy locationStrategy = LocationStrategies.PreferConsistent();
  ConsumerStrategy<K,M> consumerStrategy = ConsumerStrategies.Subscribe(
      Collections.singleton(inputTopic), kafkaParams, Collections.emptyMap());
  return org.apache.spark.streaming.kafka010.KafkaUtils.createDirectStream(
      streamingContext,
      locationStrategy,
      consumerStrategy);
}
 
Example #18
Source File: StreamingContextConfigurationTests.java    From Decision with Apache License 2.0
@Test
public void testActionBaseFunctionCall() throws Exception {
    //        sc.sparkContext().emptyRDD().rdd().first();
    //        ssc.sparkContext().emptyRDD().rdd().first();
    assertEquals(sc instanceof JavaStreamingContext, false);
    assertEquals(ssc.sparkContext().appName(), "magic");

}
 
Example #19
Source File: ReceiverLauncher.java    From kafka-spark-consumer with Apache License 2.0
private static <E> void assignReceiversToPartitions(int numberOfReceivers, 
        int numberOfPartition, List<JavaDStream<MessageAndMetadata<E>>> streamsList, 
        KafkaConfig config, StorageLevel storageLevel, KafkaMessageHandler<E> messageHandler, JavaStreamingContext jsc ) {

    // Create as many Receiver as Partition
    if (numberOfReceivers >= numberOfPartition) {
        for (int i = 0; i < numberOfPartition; i++) {
            streamsList.add(jsc.receiverStream(new KafkaReceiver(
                    config, i, storageLevel, messageHandler)));
        }
    } else {
        // create Range Receivers..
        Map<Integer, Set<Integer>> rMap = new HashMap<Integer, Set<Integer>>();

        for (int i = 0; i < numberOfPartition; i++) {
            int j = i % numberOfReceivers;
            Set<Integer> pSet = rMap.get(j);
            if (pSet == null) {
                pSet = new HashSet<Integer>();
                pSet.add(i);
            } else {
                pSet.add(i);
            }
            rMap.put(j, pSet);
        }
        for (int i = 0; i < numberOfReceivers; i++) {
            streamsList.add(jsc.receiverStream(new KafkaRangeReceiver(config, rMap
                    .get(i), storageLevel, messageHandler)));
        }
    }
}
 
Example #20
Source File: BatchLayer.java    From oryx with Apache License 2.0
public void await() throws InterruptedException {
  JavaStreamingContext theStreamingContext;
  synchronized (this) {
    theStreamingContext = streamingContext;
    Preconditions.checkState(theStreamingContext != null);
  }
  log.info("Spark Streaming is running");
  theStreamingContext.awaitTermination(); // Can't do this with lock
}
 
Example #21
Source File: TrackStreamingSourcesTest.java    From beam with Apache License 2.0
private StreamingSourceTracker(
    JavaStreamingContext jssc,
    Pipeline pipeline,
    Class<? extends PTransform> transformClassToAssert,
    Integer... expected) {
  this.ctxt = new EvaluationContext(jssc.sparkContext(), pipeline, options, jssc);
  this.evaluator =
      new SparkRunner.Evaluator(
          new StreamingTransformTranslator.Translator(new TransformTranslator.Translator()),
          ctxt);
  this.transformClassToAssert = transformClassToAssert;
  this.expected = expected;
}
 
Example #22
Source File: StreamingContextConfiguration.java    From Decision with Apache License 2.0
@Bean(name = "streamingContext", destroyMethod = "stop")
public JavaStreamingContext streamingContext() {
    JavaStreamingContext context = this.create("stratio-streaming-context", 4040,
            configurationContext.getInternalStreamingBatchTime(), configurationContext.getInternalSparkHost());

    configureRequestContext(context);
    configureActionContext(context);
    configureDataContext(context);

    return context;
}
 
Example #23
Source File: WordCountRecoverableEx.java    From Apache-Spark-2x-for-Java-Developers with MIT License
protected static JavaStreamingContext createContext(String ip, int port, String checkpointDirectory) {
	SparkConf sparkConf = new SparkConf().setAppName("WordCountRecoverableEx").setMaster("local[*]");
	JavaStreamingContext streamingContext = new JavaStreamingContext(sparkConf, Durations.seconds(1));
	streamingContext.checkpoint(checkpointDirectory);
	// Initial state RDD input to mapWithState
	@SuppressWarnings("unchecked")
	List<Tuple2<String, Integer>> tuples = Arrays.asList(new Tuple2<>("hello", 1), new Tuple2<>("world", 1));
	JavaPairRDD<String, Integer> initialRDD = streamingContext.sparkContext().parallelizePairs(tuples);

	JavaReceiverInputDStream<String> StreamingLines = streamingContext.socketTextStream(ip,port, StorageLevels.MEMORY_AND_DISK_SER);

	JavaDStream<String> words = StreamingLines.flatMap(str -> Arrays.asList(str.split(" ")).iterator());

	JavaPairDStream<String, Integer> wordCounts = words.mapToPair(str -> new Tuple2<>(str, 1))
			.reduceByKey((count1, count2) -> count1 + count2);

	// Update the cumulative count function
	Function3<String, Optional<Integer>, State<Integer>, Tuple2<String, Integer>> mappingFunc = new Function3<String, Optional<Integer>, State<Integer>, Tuple2<String, Integer>>() {
		@Override
		public Tuple2<String, Integer> call(String word, Optional<Integer> one, State<Integer> state) {
			int sum = one.orElse(0) + (state.exists() ? state.get() : 0);
			Tuple2<String, Integer> output = new Tuple2<>(word, sum);
			state.update(sum);
			return output;
		}
	};

	// DStream of cumulative counts that get updated in every batch
	JavaMapWithStateDStream<String, Integer, Integer, Tuple2<String, Integer>> stateDstream = wordCounts
			.mapWithState(StateSpec.function(mappingFunc).initialState(initialRDD));

	stateDstream.print();
	return streamingContext;
}
 
Example #24
Source File: Kafka010SparkStreamingBinding.java    From datacollector with Apache License 2.0
@Override
public JavaStreamingContext createDStream(JavaStreamingContext result, Map<String, Object> props) {
  props.put("bootstrap.servers", metaDataBrokerList);
  if (!autoOffsetValue.isEmpty()) {
    autoOffsetValue = getConfigurableAutoOffsetResetIfNonEmpty(autoOffsetValue);
    props.put(AUTO_OFFSET_RESET, autoOffsetValue);
  }
  props.putAll(extraKafkaConfigs);

  List<String> topics = ImmutableList.of(topic);
  JavaInputDStream<ConsumerRecord<byte[], byte[]>> stream;

  if (offsetHelper.isSDCCheckPointing()) {
    Map<TopicPartition, Long> fromOffsets = KafkaOffsetManagerImpl.get().getOffsetForDStream(topic, numberOfPartitions);
    stream =
        KafkaUtils.createDirectStream(
            result,
            LocationStrategies.PreferConsistent(),
            ConsumerStrategies.<byte[], byte[]>Assign(new ArrayList<TopicPartition>(fromOffsets.keySet()), props, fromOffsets)
        );
  } else {
    stream  = KafkaUtils.createDirectStream(
        result,
        LocationStrategies.PreferConsistent(),
        ConsumerStrategies.<byte[], byte[]>Subscribe(topics, props)
    );

  }
  Driver$.MODULE$.foreach(stream.dstream(), KafkaOffsetManagerImpl.get());
  return result;
}
 
Example #25
Source File: SparkStreamingJob.java    From zipkin-sparkstreaming with Apache License 2.0
@Memoized
JavaStreamingContext jsc() {
  SparkConf conf = new SparkConf(true)
      .setMaster(master())
      .setAppName(getClass().getName());
  if (!jars().isEmpty()) conf.setJars(jars().toArray(new String[0]));
  for (Map.Entry<String, String> entry : conf().entrySet()) {
    conf.set(entry.getKey(), entry.getValue());
  }
  return new JavaStreamingContext(conf, new Duration(batchDuration()));
}
 
Example #26
Source File: MapRStreamingBinding.java    From datacollector with Apache License 2.0
@Override
public JavaStreamingContext createDStream(JavaStreamingContext result, Map<String, Object> props) {
  List<String> topics = ImmutableList.of(topic);
  if (!autoOffsetValue.isEmpty()) {
    props.put(SparkStreamingBinding.AUTO_OFFSET_RESET, autoOffsetValue);
  }
  props.putAll(extraKafkaConfigs);

  JavaInputDStream<ConsumerRecord<byte[], byte[]>> stream;

  if (offsetHelper.isSDCCheckPointing()) {
    Map<TopicPartition, Long> fromOffsets = MaprStreamsOffsetManagerImpl.get().getOffsetForDStream(topic, numberOfPartitions);
    stream =
        KafkaUtils.createDirectStream(
            result,
            LocationStrategies.PreferConsistent(),
            ConsumerStrategies.<byte[], byte[]>Assign(new ArrayList<TopicPartition>(fromOffsets.keySet()), props, fromOffsets)
        );
  } else {
    stream  = KafkaUtils.createDirectStream(
        result,
        LocationStrategies.PreferConsistent(),
        ConsumerStrategies.<byte[], byte[]>Subscribe(topics, props)
    );

  }
  Driver$.MODULE$.foreach(stream.dstream(), MaprStreamsOffsetManagerImpl.get());
  return result;
}
 
Example #27
Source File: ReaderWriterExample.java    From spliceengine with GNU Affero General Public License v3.0
public static void main(String[] args) throws Exception {

        final String dbUrl = args[0];
        final String hostname = args[1];
        final String port = args[2];
        final String inTargetSchema = args[3];
        final String inTargetTable = args[4];

        SparkConf conf = new SparkConf();

        JavaStreamingContext ssc = new JavaStreamingContext(conf, new Duration(500));

        JavaReceiverInputDStream<String> stream = ssc.socketTextStream(hostname, Integer.parseInt(port));

        SparkSession spark = SparkSession.builder().getOrCreate();

        // Create a SplicemachineContext based on the provided DB connection
        SplicemachineContext splicemachineContext = new SplicemachineContext(dbUrl);

        // Set target tablename and schemaname
        final String table = inTargetSchema + "." + inTargetTable;

        stream.foreachRDD((VoidFunction<JavaRDD<String>>) rdd -> {
            JavaRDD<Row> rowRDD = rdd.map((Function<String, Row>) s -> RowFactory.create(s));
            Dataset<Row> df = spark.createDataFrame(rowRDD, splicemachineContext.getSchema(table));

            splicemachineContext.insert(df, table);
        });

        ssc.start();
        ssc.awaitTermination();
    }
 
Example #28
Source File: SparkStreaming.java    From kafka-spark-avro-example with Apache License 2.0
public static void main(String... args) {
  SparkConf conf = new SparkConf();
  conf.setMaster("local[2]");
  conf.setAppName("Spark Streaming Test Java");

  JavaSparkContext sc = new JavaSparkContext(conf);
  JavaStreamingContext ssc = new JavaStreamingContext(sc, Durations.seconds(10));

  processStream(ssc, sc);

  ssc.start();
  ssc.awaitTermination();
}
 
Example #29
Source File: SparkStreamingSqlAnalyse.java    From sylph with Apache License 2.0
public SparkStreamingSqlAnalyse(StreamingContext ssc,
        ConnectorStore connectorStore,
        boolean isCompile)
{
    this.ssc = ssc;
    this.connectorStore = connectorStore;
    this.sparkBean = binder -> {
        binder.bind(StreamingContext.class, ssc);
        binder.bind(JavaStreamingContext.class, new JavaStreamingContext(ssc));
    };
    this.isCompile = isCompile;
}
 
Example #30
Source File: WordCountSocketJava8Ex.java    From Apache-Spark-2x-for-Java-Developers with MIT License
public static void main(String[] args) throws Exception {
  System.setProperty("hadoop.home.dir", "E:\\hadoop");

  SparkConf sparkConf = new SparkConf().setAppName("WordCountSocketEx").setMaster("local[*]");
  JavaStreamingContext streamingContext = new JavaStreamingContext(sparkConf, Durations.seconds(1));

  List<Tuple2<String, Integer>> tuples = Arrays.asList(new Tuple2<>("hello", 10), new Tuple2<>("world", 10));
  JavaPairRDD<String, Integer> initialRDD = streamingContext.sparkContext().parallelizePairs(tuples);

  JavaReceiverInputDStream<String> StreamingLines = streamingContext.socketTextStream("10.0.75.1", Integer.parseInt("9000"), StorageLevels.MEMORY_AND_DISK_SER);

  JavaDStream<String> words = StreamingLines.flatMap(str -> Arrays.asList(str.split(" ")).iterator());

  JavaPairDStream<String, Integer> wordCounts = words.mapToPair(str -> new Tuple2<>(str, 1)).reduceByKey((count1, count2) -> count1 + count2);

  wordCounts.print();

  JavaPairDStream<String, Integer> joinedDstream = wordCounts.transformToPair(
      new Function<JavaPairRDD<String, Integer>, JavaPairRDD<String, Integer>>() {
        @Override
        public JavaPairRDD<String, Integer> call(JavaPairRDD<String, Integer> rdd) throws Exception {
          // Join each batch with the initial RDD and sum the two counts per word.
          return rdd.join(initialRDD).mapToPair(
              new PairFunction<Tuple2<String, Tuple2<Integer, Integer>>, String, Integer>() {
                @Override
                public Tuple2<String, Integer> call(Tuple2<String, Tuple2<Integer, Integer>> joinedTuple)
                    throws Exception {
                  return new Tuple2<>(joinedTuple._1(), joinedTuple._2()._1() + joinedTuple._2()._2());
                }
              });
        }
      });

  joinedDstream.print();
  streamingContext.start();
  streamingContext.awaitTermination();
}