Java Code Examples for org.apache.spark.api.java.JavaPairRDD#isEmpty()

The following examples show how to use org.apache.spark.api.java.JavaPairRDD#isEmpty(). You can vote up the examples you like or vote down the ones you don't, and go to the original project or source file by following the links above each example. You can also check out related API usage in the sidebar.
Example 1
Source File: SaveToHDFSFunction.java    From oryx with Apache License 2.0 6 votes vote down vote up
/**
 * Writes the given RDD out as a sequence file, unless the RDD is empty.
 *
 * <p>The output location is {@code prefix-<time in ms>.suffix}. If something already
 * exists at that location it is deleted recursively before the new data is written.
 *
 * @param rdd  key/message pairs to persist
 * @param time time whose millisecond value becomes part of the output file name
 * @throws IOException if the file system cannot be accessed or the existing path
 *                     cannot be checked or deleted
 */
@Override
public void call(JavaPairRDD<K,M> rdd, Time time) throws IOException {
  // Nothing to write: bail out before touching the file system.
  if (rdd.isEmpty()) {
    log.info("RDD was empty, not saving to HDFS");
    return;
  }

  String outputFile = prefix + '-' + time.milliseconds() + '.' + suffix;
  Path outputPath = new Path(outputFile);
  FileSystem fileSystem = FileSystem.get(outputPath.toUri(), hadoopConf);

  // Clear out any leftover output at the same location before saving.
  if (fileSystem.exists(outputPath)) {
    log.warn("Saved data already existed, possibly from a failed job. Deleting {}", outputPath);
    fileSystem.delete(outputPath, true);
  }

  log.info("Saving RDD to HDFS at {}", outputFile);
  rdd.mapToPair(
      new ValueToWritableFunction<>(keyClass, messageClass, keyWritableClass, messageWritableClass)
  ).saveAsNewAPIHadoopFile(
      outputFile,
      keyWritableClass,
      messageWritableClass,
      SequenceFileOutputFormat.class,
      hadoopConf);
}
 
Example 2
Source File: Algorithm.java    From predictionio-template-java-ecom-recommender with Apache License 2.0 5 votes vote down vote up
/**
 * Produces a prediction for the given query.
 *
 * <p>Resolution order:
 * <ol>
 *   <li>If the queried user is present in the model's user index, use that user's
 *       feature vector to rank items.</li>
 *   <li>Otherwise, if recent product features are available for the query, return
 *       items similar to those.</li>
 *   <li>Otherwise, fall back to the most popular items.</li>
 * </ol>
 *
 * @param model trained model supplying the user index and user features
 * @param query the request; its user entity id selects the user
 * @return the predicted result for the query
 */
@Override
public PredictedResult predict(Model model, final Query query) {
    // Predicate selecting the user-index entry whose key is the queried user id.
    final Function<Tuple2<String, Integer>, Boolean> isQueriedUser =
            new Function<Tuple2<String, Integer>, Boolean>() {
                @Override
                public Boolean call(Tuple2<String, Integer> indexEntry) throws Exception {
                    return indexEntry._1().equals(query.getUserEntityId());
                }
            };
    final JavaPairRDD<String, Integer> userMatches = model.getUserIndex().filter(isQueriedUser);

    double[] features = null;
    if (!userMatches.isEmpty()) {
        final Integer userIdx = userMatches.first()._2();
        // Look up the feature vector stored under the matched user's index.
        features = model.getUserFeatures().filter(new Function<Tuple2<Integer, double[]>, Boolean>() {
            @Override
            public Boolean call(Tuple2<Integer, double[]> featureEntry) throws Exception {
                return featureEntry._1().equals(userIdx);
            }
        }).first()._2();
    }

    // Known user: rank directly from the user's features.
    if (features != null) {
        return new PredictedResult(topItemsForUser(features, model, query));
    }

    // Unknown user: fall back to recent-product similarity, then to popularity.
    List<double[]> recentFeatures = getRecentProductFeatures(query, model);
    if (recentFeatures.isEmpty()) {
        return new PredictedResult(mostPopularItems(model, query));
    }
    return new PredictedResult(similarItems(recentFeatures, model, query));
}
 
Example 3
Source File: SpeedLayerUpdate.java    From oryx with Apache License 2.0 5 votes vote down vote up
/**
 * Builds model updates from newly arrived data and sends each one to the update topic
 * under the key {@code "UP"}.
 *
 * <p>Does nothing (beyond a debug log) when the RDD is empty, and sends nothing when
 * the model manager returns {@code null} updates.
 *
 * @param newData newly arrived key/message pairs
 * @throws IOException declared by the overridden method; may surface from building
 *                     updates or from closing the producer
 */
@Override
public void call(JavaPairRDD<K,M> newData) throws IOException {
  if (newData.isEmpty()) {
    log.debug("RDD was empty");
    return;
  }

  Iterable<U> pendingUpdates = modelManager.buildUpdates(newData);
  if (pendingUpdates == null) {
    return;
  }

  // The producer is scoped to this batch and closed once all updates are sent.
  try (TopicProducer<String, U> topicProducer =
           new TopicProducerImpl<>(updateBroker, updateTopic, true)) {
    pendingUpdates.forEach(pending -> topicProducer.send("UP", pending));
  }
}
 
Example 4
Source File: BatchUpdateFunction.java    From oryx with Apache License 2.0 4 votes vote down vote up
/**
 * Runs one batch update over the current generation's data plus any previously
 * stored data.
 *
 * <p>Steps:
 * <ol>
 *   <li>Return immediately if {@code newData} is empty.</li>
 *   <li>Warn if {@code core-site.xml} cannot be found via the Hadoop configuration.</li>
 *   <li>Glob {@code dataDirString/*&#47;part-*} for past data; if any files match, load
 *       them as sequence files and convert them back to key/message values. Otherwise
 *       past data is {@code null}.</li>
 *   <li>Invoke the update instance, with a topic producer when both an update topic
 *       and broker are configured, or with a {@code null} producer otherwise.</li>
 * </ol>
 *
 * @param newData   key/message pairs from the current generation
 * @param timestamp time of this update; its millisecond value is passed to the update
 * @throws IOException          if the file system or producer fails
 * @throws InterruptedException declared by the overridden method
 */
@Override
public void call(JavaPairRDD<K,M> newData, Time timestamp)
    throws IOException, InterruptedException {

  if (newData.isEmpty()) {
    log.info("No data in current generation's RDD; nothing to do");
    return;
  }

  log.info("Beginning update at {}", timestamp);

  Configuration hadoopConf = sparkContext.hadoopConfiguration();
  if (hadoopConf.getResource("core-site.xml") == null) {
    log.warn("Hadoop config like core-site.xml was not found; " +
             "is the Hadoop config directory on the classpath?");
  }

  // Locate previously written part files, if there are any.
  Path pastDataGlob = new Path(dataDirString + "/*/part-*");
  FileSystem fs = FileSystem.get(pastDataGlob.toUri(), hadoopConf);
  FileStatus[] pastFiles = fs.globStatus(pastDataGlob);
  boolean havePastData = pastFiles != null && pastFiles.length != 0;

  JavaPairRDD<K,M> pastData = null;
  if (havePastData) {

    log.info("Found past data at path(s) like {}", pastFiles[0].getPath());
    // Work on a copy so the shared Hadoop configuration is not modified.
    Configuration inputConf = new Configuration(hadoopConf);
    inputConf.set(FileInputFormat.INPUT_DIR, joinFSPaths(fs, pastFiles));

    @SuppressWarnings("unchecked")
    JavaPairRDD<Writable,Writable> pastWritables = (JavaPairRDD<Writable,Writable>)
        sparkContext.newAPIHadoopRDD(inputConf,
                                     SequenceFileInputFormat.class,
                                     keyWritableClass,
                                     messageWritableClass);

    pastData = pastWritables.mapToPair(
        new WritableToValueFunction<>(keyClass,
                                      messageClass,
                                      keyWritableClass,
                                      messageWritableClass));

  } else {

    log.info("No past data at path(s) {}", pastDataGlob);

  }

  boolean canPublishUpdates = updateTopic != null && updateBroker != null;
  if (canPublishUpdates) {
    // This TopicProducer should not be async; sends one big model generally and
    // needs to occur before other updates reliably rather than be buffered
    try (TopicProducer<String,U> producer =
             new TopicProducerImpl<>(updateBroker, updateTopic, false)) {
      updateInstance.runUpdate(sparkContext,
                               timestamp.milliseconds(),
                               newData,
                               pastData,
                               modelDirString,
                               producer);
    }
  } else {
    log.info("Not producing updates to update topic since none was configured");
    updateInstance.runUpdate(sparkContext,
                             timestamp.milliseconds(),
                             newData,
                             pastData,
                             modelDirString,
                             null);
  }
}
 
Example 5
Source File: BaseActionExecutionFunction.java    From Decision with Apache License 2.0 4 votes vote down vote up
/**
 * Passes the messages of every (action, messages) pair in the RDD to {@code process},
 * one partition at a time.
 *
 * <p>Nothing is done for an empty RDD. The partition function returns an empty list;
 * {@code count()} is invoked on the mapped RDD only to run it, and its result is
 * discarded.
 *
 * @param rdd pairs of stream action and the messages associated with it
 * @return always {@code null}
 * @throws Exception declared by the overridden method
 */
@Override
public Void call(JavaPairRDD<StreamAction, Iterable<StratioStreamingMessage>> rdd) throws Exception {

    if (rdd.isEmpty()) {
        return null;
    }

    // Consumes a whole partition, handing each pair's messages to process().
    FlatMapFunction<Iterator<Tuple2<StreamAction, Iterable<StratioStreamingMessage>>>, Object> processPartition =
            new FlatMapFunction<Iterator<Tuple2<StreamAction, Iterable<StratioStreamingMessage>>>, Object>() {

                @Override public Iterable<Object> call(
                        Iterator<Tuple2<StreamAction, Iterable<StratioStreamingMessage>>> partition)
                        throws Exception {

                    while (partition.hasNext()) {
                        Tuple2<StreamAction, Iterable<StratioStreamingMessage>> pair = partition.next();
                        process(pair._2());
                    }

                    // No output elements are produced; only the side effect matters.
                    return new ArrayList<Object>();
                }
            };

    rdd.mapPartitions(processPartition).count();

    return null;
}