Java Code Examples for org.apache.spark.api.java.JavaPairRDD#reduceByKey()

The following examples show how to use org.apache.spark.api.java.JavaPairRDD#reduceByKey(). Each example is taken from an open-source project; the source file and license are noted above the code.
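Before the project examples, here is a minimal, self-contained sketch of the pattern most of them share: map each record to a (key, 1) pair, then sum the values per key with reduceByKey(). The class name, local master setting, and input strings are illustrative placeholders rather than code from any of the projects below.

import java.util.Arrays;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

import scala.Tuple2;

// Minimal word-count sketch; names and data are placeholders.
public class ReduceByKeySketch {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("ReduceByKeySketch").setMaster("local[2]");
        JavaSparkContext sc = new JavaSparkContext(conf);

        JavaRDD<String> lines = sc.parallelize(Arrays.asList("a b a", "b c"));
        JavaRDD<String> words = lines.flatMap(line -> Arrays.asList(line.split(" ")).iterator());

        // Pair each word with 1, then sum the values for each key.
        JavaPairRDD<String, Integer> counts = words
                .mapToPair(word -> new Tuple2<>(word, 1))
                .reduceByKey((a, b) -> a + b);

        List<Tuple2<String, Integer>> output = counts.collect();
        for (Tuple2<String, Integer> t : output) {
            System.out.println(t._1() + ": " + t._2());
        }

        sc.stop();
    }
}

Run locally, this prints each word with its count (a: 2, b: 2, c: 1), though the order of the pairs is not guaranteed.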
Example 1
Source File: WordCount.java    From pravega-samples with Apache License 2.0
public static void main(String[] args) throws Exception {

        Configuration conf = new Configuration();
        GenericOptionsParser optionParser = new GenericOptionsParser(conf, args);
        String[] remainingArgs = optionParser.getRemainingArgs();

        if (remainingArgs.length != 3) {
            System.err.println("Usage: WordCount <url> <scope> <stream>");
            System.exit(2);
        }

        conf.setStrings(PravegaConfig.INPUT_URI_STRING, remainingArgs[0]);
        conf.setStrings(PravegaConfig.INPUT_SCOPE_NAME, remainingArgs[1]);
        conf.setStrings(PravegaConfig.INPUT_STREAM_NAME, remainingArgs[2]);
        conf.setStrings(PravegaConfig.INPUT_DESERIALIZER, TextSerializer.class.getName());

        JavaSparkContext sc = new JavaSparkContext(new SparkConf());

        JavaPairRDD<EventKey, Text> lines = sc.newAPIHadoopRDD(conf, PravegaInputFormat.class, EventKey.class, Text.class);
        JavaRDD<String> words = lines.map(x -> x._2).flatMap(s -> Arrays.asList(SPACE.split(s.toString())).iterator());
        JavaPairRDD<String, Integer> ones = words.mapToPair(s -> new Tuple2<>(s, 1));
        JavaPairRDD<String, Integer> counts = ones.reduceByKey((i1, i2) -> i1 + i2);

        System.out.println("RESULT :" + counts.collect());
    }
 
Example 2
Source File: AnalyzeSpark.java    From DataVec with Apache License 2.0
/**
 * Sample the N most frequently occurring values in the specified column
 *
 * @param nMostFrequent    Top N values to sample
 * @param columnName       Name of the column to sample from
 * @param schema           Schema of the data
 * @param data             RDD containing the data
 * @return                 Map of the most frequently occurring Writable objects in that column to their counts
 */
public static Map<Writable, Long> sampleMostFrequentFromColumn(int nMostFrequent, String columnName, Schema schema,
                JavaRDD<List<Writable>> data) {
    int columnIdx = schema.getIndexOfColumn(columnName);

    JavaPairRDD<Writable, Long> keyedByWritable = data.mapToPair(new ColumnToKeyPairTransform(columnIdx));
    JavaPairRDD<Writable, Long> reducedByWritable = keyedByWritable.reduceByKey(new SumLongsFunction2());

    List<Tuple2<Writable, Long>> list =
                    reducedByWritable.takeOrdered(nMostFrequent, new Tuple2Comparator<Writable>(false));

    List<Tuple2<Writable, Long>> sorted = new ArrayList<>(list);
    Collections.sort(sorted, new Tuple2Comparator<Writable>(false));

    Map<Writable, Long> map = new LinkedHashMap<>();
    for (Tuple2<Writable, Long> t2 : sorted) {
        map.put(t2._1(), t2._2());
    }

    return map;
}
 
Example 3
Source File: AnalyzeSpark.java    From deeplearning4j with Apache License 2.0
/**
 * Sample the N most frequently occurring values in the specified column
 *
 * @param nMostFrequent    Top N values to sample
 * @param columnName       Name of the column to sample from
 * @param schema           Schema of the data
 * @param data             RDD containing the data
 * @return                 Map of the most frequently occurring Writable objects in that column to their counts
 */
public static Map<Writable, Long> sampleMostFrequentFromColumn(int nMostFrequent, String columnName, Schema schema,
                JavaRDD<List<Writable>> data) {
    int columnIdx = schema.getIndexOfColumn(columnName);

    JavaPairRDD<Writable, Long> keyedByWritable = data.mapToPair(new ColumnToKeyPairTransform(columnIdx));
    JavaPairRDD<Writable, Long> reducedByWritable = keyedByWritable.reduceByKey(new SumLongsFunction2());

    List<Tuple2<Writable, Long>> list =
                    reducedByWritable.takeOrdered(nMostFrequent, new Tuple2Comparator<Writable>(false));

    List<Tuple2<Writable, Long>> sorted = new ArrayList<>(list);
    Collections.sort(sorted, new Tuple2Comparator<Writable>(false));

    Map<Writable, Long> map = new LinkedHashMap<>();
    for (Tuple2<Writable, Long> t2 : sorted) {
        map.put(t2._1(), t2._2());
    }

    return map;
}
 
Example 4
Source File: WordCount.java    From tutorials with MIT License
public static void main(String[] args) throws Exception {
    if (args.length < 1) {
        System.err.println("Usage: JavaWordCount <file>");
        System.exit(1);
    }
    SparkConf sparkConf = new SparkConf().setAppName("JavaWordCount")
        .setMaster("local");
    JavaSparkContext ctx = new JavaSparkContext(sparkConf);
    JavaRDD<String> lines = ctx.textFile(args[0], 1);

    JavaRDD<String> words = lines.flatMap(s -> Arrays.asList(SPACE.split(s)).iterator());
    JavaPairRDD<String, Integer> wordAsTuple = words.mapToPair(word -> new Tuple2<>(word, 1));
    JavaPairRDD<String, Integer> wordWithCount = wordAsTuple.reduceByKey((Integer i1, Integer i2)->i1 + i2);
    List<Tuple2<String, Integer>> output = wordWithCount.collect();
    for (Tuple2<?, ?> tuple : output) {
         System.out.println(tuple._1() + ": " + tuple._2());
    }
    ctx.stop();
}
 
Example 5
Source File: JavaLogQuery.java    From SparkDemo with MIT License
public static void main(String[] args) {
  SparkSession spark = SparkSession
    .builder()
    .appName("JavaLogQuery")
    .getOrCreate();

  JavaSparkContext jsc = new JavaSparkContext(spark.sparkContext());

  JavaRDD<String> dataSet = (args.length == 1) ? jsc.textFile(args[0]) : jsc.parallelize(exampleApacheLogs);

  JavaPairRDD<Tuple3<String, String, String>, Stats> extracted = dataSet.mapToPair(new PairFunction<String, Tuple3<String, String, String>, Stats>() {
    @Override
    public Tuple2<Tuple3<String, String, String>, Stats> call(String s) {
      return new Tuple2<>(extractKey(s), extractStats(s));
    }
  });

  JavaPairRDD<Tuple3<String, String, String>, Stats> counts = extracted.reduceByKey(new Function2<Stats, Stats, Stats>() {
    @Override
    public Stats call(Stats stats, Stats stats2) {
      return stats.merge(stats2);
    }
  });

  List<Tuple2<Tuple3<String, String, String>, Stats>> output = counts.collect();
  for (Tuple2<?,?> t : output) {
    System.out.println(t._1() + "\t" + t._2());
  }
  spark.stop();
}
 
Example 6
Source File: ComputeResponse.java    From incubator-retired-pirk with Apache License 2.0
private void encryptedColumnCalc(JavaPairRDD<Long,BigInteger> encRowRDD) throws PIRException
{
  // Multiply the column values by colNum: emit <colNum, finalColVal>
  JavaPairRDD<Long,BigInteger> encColRDD;
  if (colMultReduceByKey)
  {
    encColRDD = encRowRDD.reduceByKey(new EncColMultReducer(bVars), numColMultPartitions);
  }
  else
  {
    encColRDD = encRowRDD.groupByKey(numColMultPartitions).mapToPair(new EncColMultGroupedMapper(bVars));
  }

  // Form the final response object
  Response response = new Response(queryInfo);
  Map<Long,BigInteger> encColResults = encColRDD.collectAsMap();
  logger.debug("encColResults.size() = " + encColResults.size());

  for (Entry<Long,BigInteger> entry : encColResults.entrySet())
  {
    int colVal = entry.getKey().intValue();
    response.addElement(colVal, entry.getValue());
    logger.debug("colNum = " + colVal + " column = " + entry.getValue().toString());
  }

  try
  {
    storage.store(outputFile, response);
  } catch (IOException e)
  {
    throw new RuntimeException(e);
  }
  accum.printAll();
}
 
Example 7
Source File: CountLines.java    From examples with Apache License 2.0
@SuppressWarnings("serial")
public static void main(String[] args) {
  SparkConf sparkConf = new SparkConf().setAppName("JavaHBaseBulkGetExample ").setMaster("local[2]");
  JavaSparkContext jsc = new JavaSparkContext(sparkConf);
  JavaRDD<String> textFile = jsc.textFile("hdfs://localhost/user/cloudera/data.txt");
  JavaPairRDD<String, Integer> pairs = textFile.mapToPair(new PairFunction<String, String, Integer>() {
    public Tuple2<String, Integer> call(String s) { return new Tuple2<String, Integer>(s.substring(0, s.indexOf("|")), 1); }
  });
  JavaPairRDD<String, Integer> counts = pairs.reduceByKey(new Function2<Integer, Integer, Integer>() {
    public Integer call(Integer a, Integer b) { return a + b; }
  });
  System.out.println("We have generated " + counts.count() + " users");
  jsc.close();
}
 
Example 8
Source File: WordCountJava.java    From BigDataArchitect with Apache License 2.0
public static void main(String[] args) throws FileNotFoundException {

        SparkConf conf = new SparkConf();
        conf.setAppName("java-wordcount");
        conf.setMaster("local");

        JavaSparkContext jsc = new JavaSparkContext(conf);

        JavaRDD<String> fileRDD = jsc.textFile("bigdata-spark/data/testdata.txt");

        JavaRDD<String> words = fileRDD.flatMap(new FlatMapFunction<String, String>() {
            public Iterator<String> call(String line) throws Exception {
                return Arrays.asList(line.split(" ")).iterator();
            }
        });

        JavaPairRDD<String, Integer> pairWord = words.mapToPair(new PairFunction<String, String, Integer>() {
            public Tuple2<String, Integer> call(String word) throws Exception {
                return new Tuple2<String, Integer>(word, 1);
            }
        });

        JavaPairRDD<String, Integer> res = pairWord.reduceByKey(new Function2<Integer, Integer, Integer>() {
            public Integer call(Integer oldV, Integer v) throws Exception {
                return oldV + v;
            }
        });

        res.foreach(new VoidFunction<Tuple2<String, Integer>>() {
            public void call(Tuple2<String, Integer> value) throws Exception {
                System.out.println(value._1+"\t"+value._2);
            }
        });

    }
 
Example 9
Source File: ReduceByKey.java    From SparkDemo with MIT License
/**
 * Count the words in a text file using reduceByKey.
 *
 * @param sc the JavaSparkContext to use
 */
private static void reduceByKey(JavaSparkContext sc) {
	JavaRDD<String> lines = sc.textFile(Constant.LOCAL_FILE_PREX + "README.md");

	// Split each line on spaces and flatten the result into individual words
	JavaRDD<String> wordsRDD = lines.flatMap(new FlatMapFunction<String, String>() {

		private static final long serialVersionUID = 1L;

		public Iterator<String> call(String line) throws Exception {
			List<String> words = Arrays.asList(line.split(" "));
			return words.iterator();
		}
	});

	// Map each word to a (word, 1) tuple
	JavaPairRDD<String, Integer> wordsCount = wordsRDD.mapToPair(new PairFunction<String, String, Integer>() {

		private static final long serialVersionUID = 1L;

		public Tuple2<String, Integer> call(String word) throws Exception {
			return new Tuple2<String, Integer>(word, 1);
		}
	});

	// Reduce by key (the word), summing the counts for each word
	JavaPairRDD<String, Integer> resultRDD = wordsCount.reduceByKey(new Function2<Integer, Integer, Integer>() {

		private static final long serialVersionUID = 1L;

		public Integer call(Integer v1, Integer v2) throws Exception {
			return v1 + v2;
		}
	});

	resultRDD.foreach(new VoidFunction<Tuple2<String, Integer>>() {

		private static final long serialVersionUID = 1L;

		public void call(Tuple2<String, Integer> t) throws Exception {
			System.out.println(t._1 + "\t" + t._2());
		}
	});

	sc.close();
}
 
Example 10
Source File: Basic.java    From learning-spark-with-java with MIT License
public static void main(String[] args) {
  SparkSession spark = SparkSession
      .builder()
      .appName("Pairs-Basic")
      .master("local[4]")
      .getOrCreate();

  JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());

  List<Tuple2<String, Integer>> pairs =
      Arrays.asList(
          new Tuple2<>("1",9), new Tuple2<>("1",2), new Tuple2<>("1",1),
          new Tuple2<>("2",3), new Tuple2<>("2",4), new Tuple2<>("3",1),
          new Tuple2<>("3",5), new Tuple2<>("6",2), new Tuple2<>("6",1),
          new Tuple2<>("6",4), new Tuple2<>("8",1));

  // a randomly partitioned pair RDD
  JavaPairRDD<String, Integer> pairsRDD = sc.parallelizePairs(pairs, 4);

  System.out.println("*** the original pairs");
  pairsRDD.foreach(i -> System.out.println(i));

  //
  // Pairs can be collected as a Map, but this only works well if the
  // keys are unique. Here they aren't, so an arbitrary value is chosen for each key:
  //
  Map<String, Integer> pairsAsMap = pairsRDD.collectAsMap();
  System.out.println("*** the pretty useless map");
  System.out.println(pairsAsMap);

  // let's say we just want the pair with minimum value for each key
  // we can use one of the handy methods in PairRDDFunctions. To reduce we need
  // only supply a single function to combine all the values for each key -- the result
  // has to have the same type as the values
  JavaPairRDD<String, Integer> reducedRDD = pairsRDD.reduceByKey(Math::min);

  System.out.println("*** the reduced pairs");
  reducedRDD.foreach(i -> System.out.println(i));

  // the reduced pairs have unique keys so collecting to a map works a lot better
  Map<String, Integer> reducedAsMap = reducedRDD.collectAsMap();
  System.out.println("*** the reduced pairs as a map");
  System.out.println(reducedAsMap);

  // folding is a little more general: we get to specify the identity value:
  // say 0 for adding and 1 for multiplying
  JavaPairRDD<String, Integer> foldedRDD =
      pairsRDD.foldByKey(1, (x, y) -> x * y);

  System.out.println("*** the folded pairs");
  foldedRDD.foreach(i -> System.out.println(i));

  // Combining is more general: you can produce values of a different type, which is very powerful.
  // You need to provide three functions: the first converts an individual value to the new type, the second
  // incorporates an additional value into the result, and the third combines intermediate results, which is
  // used by execution to avoid excessive communication between partitions. The first function is applied to the
  // first value seen for each key in a partition and the second is used for each additional value of that key
  // in the partition.
  // Below is a pretty classical example of its use: compute a per-key average by first computing the sum and count
  // for each key and then dividing.
  JavaPairRDD<String, Tuple2<Integer, Integer>> combinedRDD =
      pairsRDD.combineByKey(
          value -> new Tuple2<>(value, 1),
          (sumAndCount, value) -> new Tuple2<>(sumAndCount._1() + value, sumAndCount._2() + 1),
          (sumAndCount1, sumAndCount2) ->
              new Tuple2<>(sumAndCount1._1() + sumAndCount2._1(), sumAndCount1._2() + sumAndCount2._2())
      );

  JavaPairRDD<String, Double> averageRDD =
      combinedRDD.mapValues(sumAndCount -> (double) sumAndCount._1() / sumAndCount._2());

  System.out.println("*** the average pairs");
  averageRDD.foreach(i -> System.out.println(i));

  // The dividing could be done just by calling map, but in Java this requires a lot of conversion between the
  // two kinds of RDD and ends up *VERY* cumbersome.
  JavaRDD<Tuple2<String, Tuple2<Integer, Integer>>> tupleCombinedRDD =
      JavaRDD.fromRDD(combinedRDD.rdd(), combinedRDD.classTag());
  JavaRDD<Tuple2<String, Double>> tupleDividedRDD = tupleCombinedRDD.map(keyAndsumAndCount ->
      new Tuple2<>(keyAndsumAndCount._1(), (double) keyAndsumAndCount._2()._1() / keyAndsumAndCount._2()._2()));
  JavaPairRDD<String, Double> averageRDDtheHardWay = JavaPairRDD.fromJavaRDD(tupleDividedRDD);

  // remember these won't necessarily come out in the same order, so they may not obviously be
  // the same as above
  System.out.println("*** the average pairs the hard way");
  averageRDDtheHardWay.foreach(i -> System.out.println(i));

  spark.stop();
}
 
Example 11
Source File: TestSpark.java    From kite with Apache License 2.0
@Test
@SuppressWarnings("deprecation")
public void testSparkJob() throws Exception {
  Dataset<Record> inputDataset = repo.create("ns", "in",
      new DatasetDescriptor.Builder()
        .property("kite.allow.csv", "true")
        .schema(TestMapReduce.STRING_SCHEMA)
        .format(format)
        .build(), Record.class);
  DatasetWriter<Record> writer = inputDataset.newWriter();
  writer.write(newStringRecord("apple"));
  writer.write(newStringRecord("banana"));
  writer.write(newStringRecord("banana"));
  writer.write(newStringRecord("carrot"));
  writer.write(newStringRecord("apple"));
  writer.write(newStringRecord("apple"));
  writer.close();


  Dataset<Record> outputDataset = repo.create("ns", "out",
      new DatasetDescriptor.Builder()
        .property("kite.allow.csv", "true")
        .schema(TestMapReduce.STATS_SCHEMA)
        .format(format)
        .build(), Record.class);

  Job job = Job.getInstance();
  DatasetKeyInputFormat.configure(job).readFrom(inputDataset);
  DatasetKeyOutputFormat.configure(job).writeTo(outputDataset);

  @SuppressWarnings("unchecked")
  JavaPairRDD<Record, Void> inputData = SparkTestHelper.getSparkContext()
      .newAPIHadoopRDD(job.getConfiguration(), DatasetKeyInputFormat.class,
          Record.class, Void.class);

  JavaPairRDD<String, Integer> mappedData = inputData.mapToPair(new ToJava());
  JavaPairRDD<String, Integer> sums = mappedData.reduceByKey(new Sum());
  JavaPairRDD<Record, Void> outputData = sums.mapToPair(new ToAvro());

  outputData.saveAsNewAPIHadoopDataset(job.getConfiguration());

  DatasetReader<Record> reader = outputDataset.newReader();
  Map<String, Integer> counts = new HashMap<String, Integer>();
  for (Record record : reader) {
    counts.put(record.get("name").toString(), (Integer) record.get("count"));
  }
  reader.close();

  Assert.assertEquals(3, counts.get("apple").intValue());
  Assert.assertEquals(2, counts.get("banana").intValue());
  Assert.assertEquals(1, counts.get("carrot").intValue());

}