Java Code Examples for org.apache.spark.api.java.JavaSparkContext#newAPIHadoopRDD()

The following examples show how to use org.apache.spark.api.java.JavaSparkContext#newAPIHadoopRDD(). Each example is taken from an open-source project; the source file, project, and license are listed above the code.
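Before the project examples, here is a minimal, self-contained sketch of the call itself, reading plain text files through the new (org.apache.hadoop.mapreduce) API. The class name and input path are placeholders, not code from any of the projects below.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;

public class NewAPIHadoopRDDSketch {
    public static void main(String[] args) {
        JavaSparkContext sc = new JavaSparkContext(new SparkConf().setAppName("newAPIHadoopRDD-sketch"));

        // Point the new-API FileInputFormat at an input directory (placeholder path).
        Configuration conf = new Configuration();
        conf.set(FileInputFormat.INPUT_DIR, "hdfs:///tmp/newapi-input");

        // newAPIHadoopRDD takes the Configuration, the InputFormat class, and the
        // key/value Writable classes, and returns a JavaPairRDD<K, V> of the records.
        JavaPairRDD<LongWritable, Text> lines =
                sc.newAPIHadoopRDD(conf, TextInputFormat.class, LongWritable.class, Text.class);

        // With TextInputFormat each pair is (byte offset, line of text).
        lines.take(5).forEach(pair -> System.out.println(pair._2));
        sc.stop();
    }
}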
Example 1
Source File: BigQueryWordCountToBigQuery.java    From spark-on-k8s-gcp-examples with Apache License 2.0
private static void compute(JavaSparkContext javaSparkContext, Configuration conf) {
  JavaPairRDD<LongWritable, JsonObject> tableData = javaSparkContext.newAPIHadoopRDD(
      conf,
      GsonBigQueryInputFormat.class,
      LongWritable.class,
      JsonObject.class);
  JavaPairRDD<String, Long> wordCounts = tableData
      .map(entry -> toTuple(entry._2))
      .keyBy(tuple -> tuple._1)
      .mapValues(tuple -> tuple._2)
      .reduceByKey((count1, count2) -> count1 + count2);
  wordCounts
      .map(tuple -> new Text(toJson(tuple).toString()))
      .keyBy(jsonText -> jsonText)
      .mapValues(jsonText -> NullWritable.get()) // Values do not matter.
      .saveAsNewAPIHadoopDataset(conf);
}
 
Example 2
Source File: BigQueryWordCountToGCS.java    From spark-on-k8s-gcp-examples with Apache License 2.0
private static void compute(JavaSparkContext javaSparkContext, Configuration conf,
    Path gcsOutputPath) {
  JavaPairRDD<LongWritable, JsonObject> tableData = javaSparkContext.newAPIHadoopRDD(
      conf,
      GsonBigQueryInputFormat.class,
      LongWritable.class,
      JsonObject.class);
  JavaPairRDD<String, Long> wordCount = tableData
      .map(entry -> toTuple(entry._2))
      .keyBy(tuple -> tuple._1)
      .mapValues(tuple -> tuple._2)
      .reduceByKey((count1, count2) -> count1 + count2)
      .cache();

  // First write to GCS.
  wordCount
      .mapToPair(tuple -> new Tuple2<>(new Text(tuple._1), new LongWritable(tuple._2)))
      .saveAsNewAPIHadoopFile(
          gcsOutputPath.toString(), Text.class, LongWritable.class, TextOutputFormat.class);
}
 
Example 3
Source File: WordCount.java    From pravega-samples with Apache License 2.0
public static void main(String[] args) throws Exception {

        Configuration conf = new Configuration();
        GenericOptionsParser optionParser = new GenericOptionsParser(conf, args);
        String[] remainingArgs = optionParser.getRemainingArgs();

        if (remainingArgs.length != 3) {
            System.err.println("Usage: WordCount <url> <scope> <stream>");
            System.exit(2);
        }

        conf.setStrings(PravegaConfig.INPUT_URI_STRING, remainingArgs[0]);
        conf.setStrings(PravegaConfig.INPUT_SCOPE_NAME, remainingArgs[1]);
        conf.setStrings(PravegaConfig.INPUT_STREAM_NAME, remainingArgs[2]);
        conf.setStrings(PravegaConfig.INPUT_DESERIALIZER, TextSerializer.class.getName());

        JavaSparkContext sc = new JavaSparkContext(new SparkConf());

        JavaPairRDD<EventKey, Text> lines = sc.newAPIHadoopRDD(conf, PravegaInputFormat.class, EventKey.class, Text.class);
        JavaRDD<String> words = lines.map(x -> x._2).flatMap(s -> Arrays.asList(SPACE.split(s.toString())).iterator());
        JavaPairRDD<String, Integer> ones = words.mapToPair(s -> new Tuple2<>(s, 1));
        JavaPairRDD<String, Integer> counts = ones.reduceByKey((i1, i2) -> i1 + i2);

        System.out.println("RESULT :" + counts.collect());
    }
 
Example 4
Source File: SparkStorageUtils.java    From DataVec with Apache License 2.0
/**
 * Restore a {@code JavaPairRDD<Long,List<Writable>>} previously saved with {@link #saveMapFile(String, JavaRDD)}<br>
 * Note that if the keys are not required, simply use {@code restoreMapFile(...).values()}
 *
 * @param path Path of the MapFile
 * @param sc   Spark context
 * @return The restored RDD, keyed by each record's unique index
 */
public static JavaPairRDD<Long, List<Writable>> restoreMapFile(String path, JavaSparkContext sc) {
    Configuration c = new Configuration();
    c.set(FileInputFormat.INPUT_DIR, FilenameUtils.normalize(path, true));
    JavaPairRDD<LongWritable, RecordWritable> pairRDD =
                    sc.newAPIHadoopRDD(c, SequenceFileInputFormat.class, LongWritable.class, RecordWritable.class);

    return pairRDD.mapToPair(new RecordLoadPairFunction());
}
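For context, a hypothetical call site for the helper above might look like the following. The class name, MapFile path, and application name are placeholders, and the import packages are assumed from DataVec's usual layout rather than taken from the example.

import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.datavec.api.writable.Writable;
import org.datavec.spark.storage.SparkStorageUtils;

public class RestoreMapFileUsage {
    public static void main(String[] args) {
        JavaSparkContext sc = new JavaSparkContext(new SparkConf().setAppName("restore-mapfile-usage"));

        // Restore the (index, record) pairs written by saveMapFile; the path is a placeholder.
        JavaPairRDD<Long, List<Writable>> records =
                SparkStorageUtils.restoreMapFile("hdfs:///data/records-mapfile", sc);
        System.out.println("records restored: " + records.count());

        // If the unique indices are not needed, drop the keys as the javadoc suggests.
        JavaRDD<List<Writable>> values = records.values();
        System.out.println("first record: " + values.first());

        sc.stop();
    }
}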
 
Example 5
Source File: SparkStorageUtils.java    From DataVec with Apache License 2.0
/**
 * Restore a {@code JavaPairRDD<Long,List<List<Writable>>>} previously saved with {@link #saveMapFileSequences(String, JavaRDD)}<br>
 * Note that if the keys are not required, simply use {@code restoreMapFileSequences(...).values()}
 *
 * @param path Path of the MapFile
 * @param sc   Spark context
 * @return The restored RDD, keyed by each sequence's unique index
 */
public static JavaPairRDD<Long, List<List<Writable>>> restoreMapFileSequences(String path, JavaSparkContext sc) {
    Configuration c = new Configuration();
    c.set(FileInputFormat.INPUT_DIR, FilenameUtils.normalize(path, true));
    JavaPairRDD<LongWritable, SequenceRecordWritable> pairRDD = sc.newAPIHadoopRDD(c, SequenceFileInputFormat.class,
                    LongWritable.class, SequenceRecordWritable.class);

    return pairRDD.mapToPair(new SequenceRecordLoadPairFunction());
}
 
Example 6
Source File: SparkStorageUtils.java    From deeplearning4j with Apache License 2.0
/**
 * Restore a {@code JavaPairRDD<Long,List<Writable>>} previously saved with {@link #saveMapFile(String, JavaRDD)}<br>
 * Note that if the keys are not required, simply use {@code restoreMapFile(...).values()}
 *
 * @param path Path of the MapFile
 * @param sc   Spark context
 * @return The restored RDD, keyed by each record's unique index
 */
public static JavaPairRDD<Long, List<Writable>> restoreMapFile(String path, JavaSparkContext sc) {
    Configuration c = new Configuration();
    c.set(FileInputFormat.INPUT_DIR, FilenameUtils.normalize(path, true));
    JavaPairRDD<LongWritable, RecordWritable> pairRDD =
                    sc.newAPIHadoopRDD(c, SequenceFileInputFormat.class, LongWritable.class, RecordWritable.class);

    return pairRDD.mapToPair(new RecordLoadPairFunction());
}
 
Example 7
Source File: SparkStorageUtils.java    From deeplearning4j with Apache License 2.0
/**
 * Restore a {@code JavaPairRDD<Long,List<List<Writable>>>} previously saved with {@link #saveMapFileSequences(String, JavaRDD)}<br>
 * Note that if the keys are not required, simply use {@code restoreMapFileSequences(...).values()}
 *
 * @param path Path of the MapFile
 * @param sc   Spark context
 * @return The restored RDD, keyed by each sequence's unique index
 */
public static JavaPairRDD<Long, List<List<Writable>>> restoreMapFileSequences(String path, JavaSparkContext sc) {
    Configuration c = new Configuration();
    c.set(FileInputFormat.INPUT_DIR, FilenameUtils.normalize(path, true));
    JavaPairRDD<LongWritable, SequenceRecordWritable> pairRDD = sc.newAPIHadoopRDD(c, SequenceFileInputFormat.class,
                    LongWritable.class, SequenceRecordWritable.class);

    return pairRDD.mapToPair(new SequenceRecordLoadPairFunction());
}
 
Example 8
Source File: CompactionJob.java    From spliceengine with GNU Affero General Public License v3.0
@Override
public Void call() throws Exception {
    if(!status.markRunning()){
        //the client has already cancelled us or has died before we could get started, so stop now
        return null;
    }
    int order = concurrentCompactions.incrementAndGet();
    try {
        int maxConcurrentCompactions = HConfiguration.getConfiguration().getOlapCompactionMaximumConcurrent();
        if (order > maxConcurrentCompactions) {
            status.markCompleted(new FailedOlapResult(
                    new CancellationException("Maximum number of concurrent compactions already running")));
            return null;
        }
        
        initializeJob();
        Configuration conf = new Configuration(HConfiguration.unwrapDelegate());
        if (LOG.isTraceEnabled()) {
            LOG.trace("regionLocation = " + compactionRequest.regionLocation);
        }
        conf.set(MRConstants.REGION_LOCATION, compactionRequest.regionLocation);
        conf.set(MRConstants.COMPACTION_FILES, getCompactionFilesBase64String());

        SpliceSpark.pushScope(compactionRequest.scope + ": Parallelize");
        //JavaRDD rdd1 = SpliceSpark.getContext().parallelize(files, 1);
        //ParallelCollectionRDD rdd1 = getCompactionRDD();

        JavaSparkContext context = SpliceSpark.getContext();
        JavaPairRDD<Integer, Iterator> rdd1 = context.newAPIHadoopRDD(conf,
                CompactionInputFormat.class,
                Integer.class,
                Iterator.class);
        rdd1.setName("Distribute Compaction Load");
        SpliceSpark.popScope();

        compactionRequest.compactionFunction.setContext(new SparkCompactionContext());
        SpliceSpark.pushScope(compactionRequest.scope + ": Compact files");
        JavaRDD<String> rdd2 = rdd1.mapPartitions(new SparkFlatMapFunction<>(compactionRequest.compactionFunction));
        rdd2.setName(compactionRequest.jobDetails);
        SpliceSpark.popScope();

        SpliceSpark.pushScope("Compaction");
        if (!status.isRunning()) {
            //the client timed out during our setup, so it's time to stop
            return null;
        }
        long startTime = clock.currentTimeMillis();
        JavaFutureAction<List<String>> collectFuture = rdd2.collectAsync();
        while (!collectFuture.isDone()) {
            try {
                collectFuture.get(tickTime, TimeUnit.MILLISECONDS);
            } catch (TimeoutException te) {
                /*
                 * A TimeoutException just means that tickTime expired. That's okay, we just stick our
                 * head up and make sure that the client is still operating
                 */
            }
            if (!status.isRunning()) {
                /*
                 * The client timed out, so cancel the compaction and terminate
                 */
                collectFuture.cancel(true);
                context.cancelJobGroup(compactionRequest.jobGroup);
                return null;
            }
            if (clock.currentTimeMillis() - startTime > compactionRequest.maxWait) {
                // Make sure compaction is scheduled in Spark and running, otherwise cancel it and fallback to in-HBase compaction
                if (!compactionRunning(collectFuture.jobIds())) {
                    collectFuture.cancel(true);
                    context.cancelJobGroup(compactionRequest.jobGroup);
                    status.markCompleted(new FailedOlapResult(
                            new RejectedExecutionException("No resources available for running compaction in Spark")));
                    return null;
                }
            }
        }
        //the compaction completed
        List<String> sPaths = collectFuture.get();
        status.markCompleted(new CompactionResult(sPaths));
        SpliceSpark.popScope();

        if (LOG.isTraceEnabled())
            SpliceLogUtils.trace(LOG, "Paths Returned: %s", sPaths);
        return null;
    } finally {
        concurrentCompactions.decrementAndGet();
    }
}
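The wait loop above distills to a reusable pattern: start the Spark action with collectAsync(), then wake up every tick to re-check the client's status and cancel the job if the client has gone away. Below is a minimal sketch of just that pattern; the class and method names, the rdd/tickTimeMs/stillWanted parameters, and the String element type are placeholders for this sketch, not spliceengine APIs.

import java.util.List;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;
import java.util.function.BooleanSupplier;

import org.apache.spark.api.java.JavaFutureAction;
import org.apache.spark.api.java.JavaRDD;

public class AsyncCollectPolling {
    /**
     * Minimal sketch of the collectAsync + timed-get polling pattern used above.
     * Returns the collected results, or null if the caller stopped wanting them.
     */
    static List<String> collectWhileWanted(JavaRDD<String> rdd, long tickTimeMs,
                                           BooleanSupplier stillWanted) throws Exception {
        JavaFutureAction<List<String>> future = rdd.collectAsync();
        while (!future.isDone()) {
            try {
                future.get(tickTimeMs, TimeUnit.MILLISECONDS);
            } catch (TimeoutException te) {
                // tickTimeMs expired; fall through and re-check the caller's status.
            }
            if (!stillWanted.getAsBoolean()) {
                future.cancel(true); // the caller gave up, so cancel the Spark job
                return null;
            }
        }
        return future.get(); // the action finished, so this get() returns immediately
    }
}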