org.apache.spark.streaming.Time Java Examples

The following examples show how to use org.apache.spark.streaming.Time. Each example is taken from an open source project; the source file, project, and license are noted above the snippet.
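
Before the project examples, here is a minimal, self-contained sketch of the pattern most of them share: Spark Streaming passes a Time value into per-batch callbacks such as foreachRDD, and time.milliseconds() identifies the batch, which the examples below use to name output files, tag log lines, and report metrics. This sketch assumes the Spark 2.x Java API; the class name, host, and port are illustrative placeholders rather than code from any of the projects below.

import java.util.Arrays;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.streaming.Durations;
import org.apache.spark.streaming.Time;
import org.apache.spark.streaming.api.java.JavaPairDStream;
import org.apache.spark.streaming.api.java.JavaReceiverInputDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;

import scala.Tuple2;

public class TimeUsageSketch {
  public static void main(String[] args) throws Exception {
    // Master is supplied via spark-submit (or add .setMaster("local[2]") for a local run).
    SparkConf conf = new SparkConf().setAppName("TimeUsageSketch");
    JavaStreamingContext ssc = new JavaStreamingContext(conf, Durations.seconds(1));

    // Placeholder source; any DStream is handled the same way.
    JavaReceiverInputDStream<String> lines = ssc.socketTextStream("localhost", 9999);
    JavaPairDStream<String, Integer> counts = lines
        .flatMap(x -> Arrays.asList(x.split(" ")).iterator())
        .mapToPair(w -> new Tuple2<>(w, 1))
        .reduceByKey(Integer::sum);

    // The Time argument is supplied once per batch; milliseconds() is the batch time.
    counts.foreachRDD((JavaPairRDD<String, Integer> rdd, Time time) ->
        System.out.println("Batch " + time.milliseconds() + " has " + rdd.count() + " distinct words"));

    ssc.start();
    ssc.awaitTermination();
  }
}
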
Example #1
Source File: SaveToHDFSFunction.java    From oryx with Apache License 2.0
@Override
public void call(JavaPairRDD<K,M> rdd, Time time) throws IOException {
  if (rdd.isEmpty()) {
    log.info("RDD was empty, not saving to HDFS");
  } else {
    String file = prefix + '-' + time.milliseconds() + '.' + suffix;
    Path path = new Path(file);
    FileSystem fs = FileSystem.get(path.toUri(), hadoopConf);
    if (fs.exists(path)) {
      log.warn("Saved data already existed, possibly from a failed job. Deleting {}", path);
      fs.delete(path, true);
    }
    log.info("Saving RDD to HDFS at {}", file);
    rdd.mapToPair(
        new ValueToWritableFunction<>(keyClass, messageClass, keyWritableClass, messageWritableClass)
    ).saveAsNewAPIHadoopFile(
        file,
        keyWritableClass,
        messageWritableClass,
        SequenceFileOutputFormat.class,
        hadoopConf);
  }
}
 
Example #2
Source File: BlurBulkLoadSparkProcessor.java    From incubator-retired-blur with Apache License 2.0
@Override
protected Function2<JavaPairRDD<String, RowMutation>, Time, Void> getFunction() {
  return new Function2<JavaPairRDD<String, RowMutation>, Time, Void>() {
    // Blur Thrift Client
    @Override
    public Void call(JavaPairRDD<String, RowMutation> rdd, Time time) throws Exception {
      Iface client = getBlurClient();
      for (Tuple2<String, RowMutation> tuple : rdd.collect()) {
        if (tuple != null) {
          try {
            RowMutation rm = tuple._2;
            // Index using enqueue mutate call
            client.enqueueMutate(rm);
          } catch (Exception ex) {
            LOG.error("Unknown error while trying to call enqueueMutate.", ex);
            throw ex;
          }
        }
      }
      return null;
    }
  };
}
 
Example #3
Source File: WatermarkSyncedDStream.java    From beam with Apache License 2.0
@Override
public scala.Option<RDD<WindowedValue<T>>> compute(final Time validTime) {
  final long batchTime = validTime.milliseconds();

  LOG.trace(
      "BEFORE waiting for watermark sync, "
          + "LastWatermarkedBatchTime: {}, current batch time: {}",
      GlobalWatermarkHolder.getLastWatermarkedBatchTime(),
      batchTime);

  final Stopwatch stopwatch = Stopwatch.createStarted();

  awaitWatermarkSyncWith(batchTime);

  stopwatch.stop();

  LOG.info(
      "Waited {} millis for watermarks to sync up with the current batch ({})",
      stopwatch.elapsed(TimeUnit.MILLISECONDS),
      batchTime);

  LOG.info("Watermarks are now: {}", GlobalWatermarkHolder.get(batchDuration));

  LOG.trace(
      "AFTER waiting for watermark sync, "
          + "LastWatermarkedBatchTime: {}, current batch time: {}",
      GlobalWatermarkHolder.getLastWatermarkedBatchTime(),
      batchTime);

  final RDD<WindowedValue<T>> rdd = generateRdd();
  isFirst = false;
  return scala.Option.apply(rdd);
}
 
Example #4
Source File: SourceDStream.java    From beam with Apache License 2.0
@Override
public scala.Option<RDD<Tuple2<Source<T>, CheckpointMarkT>>> compute(Time validTime) {
  RDD<Tuple2<Source<T>, CheckpointMarkT>> rdd =
      new SourceRDD.Unbounded<>(
          ssc().sparkContext(), options, createMicrobatchSource(), numPartitions);
  return scala.Option.apply(rdd);
}
 
Example #5
Source File: SparkUnboundedSource.java    From beam with Apache License 2.0
private void report(Time batchTime, long count, SparkWatermarks sparkWatermark) {
  // metadata - #records read and a description.
  scala.collection.immutable.Map<String, Object> metadata =
      new scala.collection.immutable.Map.Map1<>(
          StreamInputInfo.METADATA_KEY_DESCRIPTION(),
          String.format(
              "Read %d records with observed watermarks %s, from %s for batch time: %s",
              count, sparkWatermark == null ? "N/A" : sparkWatermark, sourceName, batchTime));
  StreamInputInfo streamInputInfo = new StreamInputInfo(inputDStreamId, count, metadata);
  ssc().scheduler().inputInfoTracker().reportInfo(batchTime, streamInputInfo);
}
 
Example #6
Source File: SylphKafkaOffset.java    From sylph with Apache License 2.0
@Override
public Option<RDD<T>> compute(Time validTime)
{
    return parent.getOrCompute(validTime);
}
 
Example #7
Source File: KafkaSource08.java    From sylph with Apache License 2.0
private static JavaDStream<ConsumerRecord<byte[], byte[]>> settingCommit(
        JavaInputDStream<ConsumerRecord<byte[], byte[]>> inputStream,
        Map<String, String> kafkaParams,
        KafkaCluster kafkaCluster,
        String groupId)
{
    if (kafkaParams.getOrDefault("auto.commit.enable", "true").equals("false")) {
        return inputStream;
    }

    int commitInterval = Integer.parseInt(kafkaParams.getOrDefault(ConsumerConfig.AUTO_COMMIT_INTERVAL_MS_CONFIG, "90000"));

    DStream<ConsumerRecord<byte[], byte[]>> sylphKafkaOffset = new SylphKafkaOffset<ConsumerRecord<byte[], byte[]>>(inputStream.inputDStream())
    {
        private final KafkaOffsetCommitter kafkaOffsetCommitter = new KafkaOffsetCommitter(
                kafkaCluster,
                groupId,
                commitInterval);

        @Override
        public void initialize(Time time)
        {
            super.initialize(time);
            kafkaOffsetCommitter.setName("Kafka_Offset_Committer");
            kafkaOffsetCommitter.start();
        }

        @Override
        public void commitOffsets(RDD<?> kafkaRdd)
        {
            OffsetRange[] offsets = ((HasOffsetRanges) kafkaRdd).offsetRanges();
//                Map<TopicAndPartition, Long> internalOffsets = Arrays.stream(offsets)
//                        .collect(Collectors.toMap(k -> k.topicAndPartition(), v -> v.fromOffset()));
            //log().info("commit Kafka Offsets {}", internalOffsets);
            kafkaOffsetCommitter.addAll(offsets);
        }
    };
    JavaDStream<ConsumerRecord<byte[], byte[]>> dStream = new JavaDStream<>(
            sylphKafkaOffset, ClassTag$.MODULE$.<ConsumerRecord<byte[], byte[]>>apply(ConsumerRecord.class));
    return dStream;
//        inputStream = inputStream.transform(rdd -> {
//            OffsetRange[] offsets = ((HasOffsetRanges) rdd.rdd()).offsetRanges();
//            Map<TopicAndPartition, Long> internalOffsets = Arrays.stream(offsets)
//                    .collect(Collectors.toMap(k -> k.topicAndPartition(), v -> v.fromOffset()));
//            commitKafkaOffsets(kafkaCluster, groupId, internalOffsets);
//            return rdd;
//        });
}
 
Example #8
Source File: JavaRecoverableNetworkWordCount.java    From SparkDemo with MIT License
private static JavaStreamingContext createContext(String ip,
                                                  int port,
                                                  String checkpointDirectory,
                                                  String outputPath) {

  // If you do not see this printed, it means the StreamingContext has been loaded
  // from an existing checkpoint
  System.out.println("Creating new context");
  final File outputFile = new File(outputPath);
  if (outputFile.exists()) {
    outputFile.delete();
  }
  SparkConf sparkConf = new SparkConf().setAppName("JavaRecoverableNetworkWordCount");
  // Create the context with a 1 second batch size
  JavaStreamingContext ssc = new JavaStreamingContext(sparkConf, Durations.seconds(1));
  ssc.checkpoint(checkpointDirectory);

  // Create a socket stream on target ip:port and count the
  // words in the input stream of \n delimited text (e.g. generated by 'nc')
  JavaReceiverInputDStream<String> lines = ssc.socketTextStream(ip, port);
  JavaDStream<String> words = lines.flatMap(new FlatMapFunction<String, String>() {
    @Override
    public Iterator<String> call(String x) {
      return Arrays.asList(SPACE.split(x)).iterator();
    }
  });
  JavaPairDStream<String, Integer> wordCounts = words.mapToPair(
    new PairFunction<String, String, Integer>() {
      @Override
      public Tuple2<String, Integer> call(String s) {
        return new Tuple2<>(s, 1);
      }
    }).reduceByKey(new Function2<Integer, Integer, Integer>() {
      @Override
      public Integer call(Integer i1, Integer i2) {
        return i1 + i2;
      }
    });

  wordCounts.foreachRDD(new VoidFunction2<JavaPairRDD<String, Integer>, Time>() {
    @Override
    public void call(JavaPairRDD<String, Integer> rdd, Time time) throws IOException {
      // Get or register the blacklist Broadcast
      final Broadcast<List<String>> blacklist =
          JavaWordBlacklist.getInstance(new JavaSparkContext(rdd.context()));
      // Get or register the droppedWordsCounter Accumulator
      final LongAccumulator droppedWordsCounter =
          JavaDroppedWordsCounter.getInstance(new JavaSparkContext(rdd.context()));
      // Use blacklist to drop words and use droppedWordsCounter to count them
      String counts = rdd.filter(new Function<Tuple2<String, Integer>, Boolean>() {
        @Override
        public Boolean call(Tuple2<String, Integer> wordCount) {
          if (blacklist.value().contains(wordCount._1())) {
            droppedWordsCounter.add(wordCount._2());
            return false;
          } else {
            return true;
          }
        }
      }).collect().toString();
      String output = "Counts at time " + time + " " + counts;
      System.out.println(output);
      System.out.println("Dropped " + droppedWordsCounter.value() + " word(s) totally");
      System.out.println("Appending to " + outputFile.getAbsolutePath());
      Files.append(output + "\n", outputFile, Charset.defaultCharset());
    }
  });

  return ssc;
}
 
Example #9
Source File: JavaSqlNetworkWordCount.java    From SparkDemo with MIT License
public static void main(String[] args) throws Exception {
  if (args.length < 2) {
    System.err.println("Usage: JavaNetworkWordCount <hostname> <port>");
    System.exit(1);
  }

  StreamingExamples.setStreamingLogLevels();

  // Create the context with a 1 second batch size
  SparkConf sparkConf = new SparkConf().setAppName("JavaSqlNetworkWordCount");
  JavaStreamingContext ssc = new JavaStreamingContext(sparkConf, Durations.seconds(1));

  // Create a JavaReceiverInputDStream on target ip:port and count the
  // words in the input stream of \n delimited text (e.g. generated by 'nc').
  // Note that the storage level uses no replication only because this runs locally;
  // replication is necessary in a distributed scenario for fault tolerance.
  JavaReceiverInputDStream<String> lines = ssc.socketTextStream(
      args[0], Integer.parseInt(args[1]), StorageLevels.MEMORY_AND_DISK_SER);
  JavaDStream<String> words = lines.flatMap(new FlatMapFunction<String, String>() {
    @Override
    public Iterator<String> call(String x) {
      return Arrays.asList(SPACE.split(x)).iterator();
    }
  });

  // Convert RDDs of the words DStream to DataFrame and run SQL query
  words.foreachRDD(new VoidFunction2<JavaRDD<String>, Time>() {
    @Override
    public void call(JavaRDD<String> rdd, Time time) {
      SparkSession spark = JavaSparkSessionSingleton.getInstance(rdd.context().getConf());

      // Convert JavaRDD[String] to JavaRDD[bean class] to DataFrame
      JavaRDD<JavaRecord> rowRDD = rdd.map(new Function<String, JavaRecord>() {
        @Override
        public JavaRecord call(String word) {
          JavaRecord record = new JavaRecord();
          record.setWord(word);
          return record;
        }
      });
      Dataset<Row> wordsDataFrame = spark.createDataFrame(rowRDD, JavaRecord.class);

      // Creates a temporary view using the DataFrame
      wordsDataFrame.createOrReplaceTempView("words");

      // Do word count on table using SQL and print it
      Dataset<Row> wordCountsDataFrame =
          spark.sql("select word, count(*) as total from words group by word");
      System.out.println("========= " + time + "=========");
      wordCountsDataFrame.show();
    }
  });

  ssc.start();
  ssc.awaitTermination();
}
 
Example #10
Source File: SparkUnboundedSource.java    From beam with Apache License 2.0
@Override
public scala.Option<RDD<BoxedUnit>> compute(Time validTime) {
  // compute parent.
  scala.Option<RDD<Metadata>> parentRDDOpt = parent.getOrCompute(validTime);
  final MetricsContainerStepMapAccumulator metricsAccum = MetricsAccumulator.getInstance();
  long count = 0;
  SparkWatermarks sparkWatermark = null;
  Instant globalLowWatermarkForBatch = BoundedWindow.TIMESTAMP_MIN_VALUE;
  Instant globalHighWatermarkForBatch = BoundedWindow.TIMESTAMP_MIN_VALUE;
  long maxReadDuration = 0;
  if (parentRDDOpt.isDefined()) {
    JavaRDD<Metadata> parentRDD = parentRDDOpt.get().toJavaRDD();
    for (Metadata metadata : parentRDD.collect()) {
      count += metadata.getNumRecords();
      // compute the global input watermark - advance to latest of all partitions.
      Instant partitionLowWatermark = metadata.getLowWatermark();
      globalLowWatermarkForBatch =
          globalLowWatermarkForBatch.isBefore(partitionLowWatermark)
              ? partitionLowWatermark
              : globalLowWatermarkForBatch;
      Instant partitionHighWatermark = metadata.getHighWatermark();
      globalHighWatermarkForBatch =
          globalHighWatermarkForBatch.isBefore(partitionHighWatermark)
              ? partitionHighWatermark
              : globalHighWatermarkForBatch;
      // Update metrics reported in the read
      final Gauge gauge = Metrics.gauge(NAMESPACE, READ_DURATION_MILLIS);
      final MetricsContainer container = metadata.getMetricsContainers().getContainer(stepName);
      try (Closeable ignored = MetricsEnvironment.scopedMetricsContainer(container)) {
        final long readDurationMillis = metadata.getReadDurationMillis();
        if (readDurationMillis > maxReadDuration) {
          // track the largest read duration seen so far in this batch
          maxReadDuration = readDurationMillis;
          gauge.set(readDurationMillis);
        }
      } catch (IOException e) {
        throw new RuntimeException(e);
      }
      metricsAccum.value().updateAll(metadata.getMetricsContainers());
    }

    sparkWatermark =
        new SparkWatermarks(
            globalLowWatermarkForBatch,
            globalHighWatermarkForBatch,
            new Instant(validTime.milliseconds()));
    // add to watermark queue.
    GlobalWatermarkHolder.add(inputDStreamId, sparkWatermark);
  }
  // report - for RateEstimator and visibility.
  report(validTime, count, sparkWatermark);
  return scala.Option.empty();
}
 
Example #11
Source File: SparkStreamingFromFlumeToHBaseWindowingExample.java    From SparkOnALog with Apache License 2.0
public static void main(String[] args) {
	if (args.length == 0) {
		System.err
				.println("Usage: SparkStreamingFromFlumeToHBaseWindowingExample {master} {host} {port} {table} {columnFamily} {windowInSeconds} {slideInSeconds");
		System.exit(1);
	}

	String master = args[0];
	String host = args[1];
	int port = Integer.parseInt(args[2]);
	String tableName = args[3];
	String columnFamily = args[4];
	int windowInSeconds = Integer.parseInt(args[5]);
	int slideInSeconds = Integer.parseInt(args[6]);
	
	Duration batchInterval = new Duration(2000);
	Duration windowInterval = new Duration(windowInSeconds * 1000);
	Duration slideInterval = new Duration(slideInSeconds * 1000);

	JavaStreamingContext sc = new JavaStreamingContext(master,
			"FlumeEventCount", batchInterval,
			System.getenv("SPARK_HOME"), "/home/cloudera/SparkOnALog.jar");
	
	final Broadcast<String> broadcastTableName = sc.sparkContext().broadcast(tableName);
	final Broadcast<String> broadcastColumnFamily = sc.sparkContext().broadcast(columnFamily);
	
	//JavaDStream<SparkFlumeEvent> flumeStream = sc.flumeStream(host, port);
	
	JavaDStream<SparkFlumeEvent> flumeStream = FlumeUtils.createStream(sc, host, port);
	
	
	JavaPairDStream<String, Integer> lastCounts = flumeStream
			.flatMap(new FlatMapFunction<SparkFlumeEvent, String>() {

				@Override
				public Iterable<String> call(SparkFlumeEvent event)
						throws Exception {
					String bodyString = new String(event.event().getBody()
							.array(), "UTF-8");
					return Arrays.asList(bodyString.split(" "));
				}
			}).map(new PairFunction<String, String, Integer>() {
				@Override
				public Tuple2<String, Integer> call(String str)
						throws Exception {
					return new Tuple2(str, 1);
				}
			}).reduceByKeyAndWindow(new Function2<Integer, Integer, Integer>() {

				@Override
				public Integer call(Integer x, Integer y) throws Exception {
					// TODO Auto-generated method stub
					return x.intValue() + y.intValue();
				}
			}, windowInterval, slideInterval);
			
			
			lastCounts.foreach(new Function2<JavaPairRDD<String,Integer>, Time, Void>() {

				@Override
				public Void call(JavaPairRDD<String, Integer> values,
						Time time) throws Exception {
					
					values.foreach(new VoidFunction<Tuple2<String, Integer>> () {

						@Override
						public void call(Tuple2<String, Integer> tuple)
								throws Exception {
							HBaseCounterIncrementor incrementor = 
									HBaseCounterIncrementor.getInstance(broadcastTableName.value(), broadcastColumnFamily.value());
							incrementor.incerment("Counter", tuple._1(), tuple._2());
							System.out.println("Counter:" + tuple._1() + "," + tuple._2());
							
						}} );
					
					return null;
				}});
	
	

	sc.start();

}
 
Example #12
Source File: SparkStreamingFromFlumeToHBaseExample.java    From SparkOnALog with Apache License 2.0
public static void main(String[] args) {
	if (args.length == 0) {
		System.err
				.println("Usage: SparkStreamingFromFlumeToHBaseExample {master} {host} {port} {table} {columnFamily}");
		System.exit(1);
	}

	String master = args[0];
	String host = args[1];
	int port = Integer.parseInt(args[2]);
	String tableName = args[3];
	String columnFamily = args[4];
	
	Duration batchInterval = new Duration(2000);

	JavaStreamingContext sc = new JavaStreamingContext(master,
			"FlumeEventCount", batchInterval,
			System.getenv("SPARK_HOME"), "/home/cloudera/SparkOnALog.jar");
	
	final Broadcast<String> broadcastTableName = sc.sparkContext().broadcast(tableName);
	final Broadcast<String> broadcastColumnFamily = sc.sparkContext().broadcast(columnFamily);
	
	//JavaDStream<SparkFlumeEvent> flumeStream = sc.flumeStream(host, port);
	
	JavaDStream<SparkFlumeEvent> flumeStream = FlumeUtils.createStream(sc, host, port);
	
	JavaPairDStream<String, Integer> lastCounts = flumeStream
			.flatMap(new FlatMapFunction<SparkFlumeEvent, String>() {

				@Override
				public Iterable<String> call(SparkFlumeEvent event)
						throws Exception {
					String bodyString = new String(event.event().getBody()
							.array(), "UTF-8");
					return Arrays.asList(bodyString.split(" "));
				}
			}).map(new PairFunction<String, String, Integer>() {
				@Override
				public Tuple2<String, Integer> call(String str)
						throws Exception {
					return new Tuple2(str, 1);
				}
			}).reduceByKey(new Function2<Integer, Integer, Integer>() {

				@Override
				public Integer call(Integer x, Integer y) throws Exception {
					// TODO Auto-generated method stub
					return x.intValue() + y.intValue();
				}
			});
			
			
			lastCounts.foreach(new Function2<JavaPairRDD<String,Integer>, Time, Void>() {

				@Override
				public Void call(JavaPairRDD<String, Integer> values,
						Time time) throws Exception {
					
					values.foreach(new VoidFunction<Tuple2<String, Integer>> () {

						@Override
						public void call(Tuple2<String, Integer> tuple)
								throws Exception {
							HBaseCounterIncrementor incrementor = 
									HBaseCounterIncrementor.getInstance(broadcastTableName.value(), broadcastColumnFamily.value());
							incrementor.incerment("Counter", tuple._1(), tuple._2());
							System.out.println("Counter:" + tuple._1() + "," + tuple._2());
							
						}} );
					
					return null;
				}});
	
	

	sc.start();

}
 
Example #13
Source File: BatchUpdateFunction.java    From oryx with Apache License 2.0
@Override
public void call(JavaPairRDD<K,M> newData, Time timestamp)
    throws IOException, InterruptedException {

  if (newData.isEmpty()) {
    log.info("No data in current generation's RDD; nothing to do");
    return;
  }

  log.info("Beginning update at {}", timestamp);

  Configuration hadoopConf = sparkContext.hadoopConfiguration();
  if (hadoopConf.getResource("core-site.xml") == null) {
    log.warn("Hadoop config like core-site.xml was not found; " +
             "is the Hadoop config directory on the classpath?");
  }

  JavaPairRDD<K,M> pastData;
  Path inputPathPattern = new Path(dataDirString + "/*/part-*");
  FileSystem fs = FileSystem.get(inputPathPattern.toUri(), hadoopConf);
  FileStatus[] inputPathStatuses = fs.globStatus(inputPathPattern);
  if (inputPathStatuses == null || inputPathStatuses.length == 0) {

    log.info("No past data at path(s) {}", inputPathPattern);
    pastData = null;

  } else {

    log.info("Found past data at path(s) like {}", inputPathStatuses[0].getPath());
    Configuration updatedConf = new Configuration(hadoopConf);
    updatedConf.set(FileInputFormat.INPUT_DIR, joinFSPaths(fs, inputPathStatuses));

    @SuppressWarnings("unchecked")
    JavaPairRDD<Writable,Writable> pastWritableData = (JavaPairRDD<Writable,Writable>)
        sparkContext.newAPIHadoopRDD(updatedConf,
                                     SequenceFileInputFormat.class,
                                     keyWritableClass,
                                     messageWritableClass);

    pastData = pastWritableData.mapToPair(
        new WritableToValueFunction<>(keyClass,
                                      messageClass,
                                      keyWritableClass,
                                      messageWritableClass));
  }

  if (updateTopic == null || updateBroker == null) {
    log.info("Not producing updates to update topic since none was configured");
    updateInstance.runUpdate(sparkContext,
                             timestamp.milliseconds(),
                             newData,
                             pastData,
                             modelDirString,
                             null);
  } else {
    // This TopicProducer should not be async; it generally sends one big model and
    // needs to be delivered before other updates reliably, rather than being buffered
    try (TopicProducer<String,U> producer =
             new TopicProducerImpl<>(updateBroker, updateTopic, false)) {
      updateInstance.runUpdate(sparkContext,
                               timestamp.milliseconds(),
                               newData,
                               pastData,
                               modelDirString,
                               producer);
    }
  }
}
 
Example #14
Source File: BlurLoadSparkProcessor.java    From incubator-retired-blur with Apache License 2.0
protected abstract Function2<JavaPairRDD<String, RowMutation>, Time, Void> getFunction();