Java Code Examples for org.apache.spark.api.java.JavaPairRDD#groupByKey()

The following examples show how to use org.apache.spark.api.java.JavaPairRDD#groupByKey(). Each example notes the project and source file it comes from, so you can check out the surrounding API usage in the original project.
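Before the project-specific examples, here is a minimal, self-contained sketch of the basic call, assuming a local Spark master; the class name, sample keys, and values are illustrative only.

import java.util.Arrays;
import java.util.Map;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;

import scala.Tuple2;

public class GroupByKeyExample {
  public static void main(String[] args) {
    SparkConf conf = new SparkConf().setAppName("groupByKeyExample").setMaster("local[*]");
    try (JavaSparkContext sc = new JavaSparkContext(conf)) {
      // A pair RDD with duplicate keys.
      JavaPairRDD<String, Integer> pairs = sc.parallelizePairs(Arrays.asList(
          new Tuple2<>("a", 1), new Tuple2<>("b", 2), new Tuple2<>("a", 3)));

      // groupByKey shuffles all values with the same key into one Iterable.
      JavaPairRDD<String, Iterable<Integer>> grouped = pairs.groupByKey();

      Map<String, Iterable<Integer>> result = grouped.collectAsMap();
      System.out.println(result); // e.g. {a=[1, 3], b=[2]}
    }
  }
}

Note that when the grouped values are only going to be folded into a single result per key, reduceByKey or aggregateByKey is usually preferable, since those combine values on the map side before the shuffle; groupByKey always moves every value across the network.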
Example 1
Source File: DataStep.java    From envelope with Apache License 2.0
private JavaRDD<Row> planMutationsByKey(Dataset<Row> arriving, List<String> keyFieldNames,
                                        Config plannerConfig, Config outputConfig) {
  // Key each arriving row by the configured key fields.
  JavaPairRDD<Row, Row> keyedArriving =
      arriving.javaRDD().keyBy(new ExtractKeyFunction(keyFieldNames, accumulators));

  // Group the arriving rows so that all rows for the same key are planned together.
  JavaPairRDD<Row, Iterable<Row>> arrivingByKey =
      keyedArriving.groupByKey(getPartitioner(keyedArriving));

  // Pair each key's arriving rows with the existing rows looked up from the output.
  JavaPairRDD<Row, Tuple2<Iterable<Row>, Iterable<Row>>> arrivingAndExistingByKey =
      arrivingByKey.mapPartitionsToPair(new JoinExistingForKeysFunction(outputConfig, keyFieldNames, accumulators));

  // Plan the mutations for each key from its arriving and existing rows.
  JavaRDD<Row> planned =
      arrivingAndExistingByKey.flatMap(new PlanForKeyFunction(plannerConfig, accumulators));

  return planned;
}
 
Example 2
Source File: GroupCombineFunctions.java    From beam with Apache License 2.0
/**
 * An implementation of {@link
 * org.apache.beam.runners.core.GroupByKeyViaGroupByKeyOnly.GroupByKeyOnly} for the Spark runner.
 */
public static <K, V> JavaRDD<KV<K, Iterable<WindowedValue<V>>>> groupByKeyOnly(
    JavaRDD<WindowedValue<KV<K, V>>> rdd,
    Coder<K> keyCoder,
    WindowedValueCoder<V> wvCoder,
    @Nullable Partitioner partitioner) {
  // we use coders to convert objects in the PCollection to byte arrays, so they
  // can be transferred over the network for the shuffle.
  JavaPairRDD<ByteArray, byte[]> pairRDD =
      rdd.map(new ReifyTimestampsAndWindowsFunction<>())
          .mapToPair(TranslationUtils.toPairFunction())
          .mapToPair(CoderHelpers.toByteFunction(keyCoder, wvCoder));

  // If no partitioner is passed, the default group by key operation is called
  JavaPairRDD<ByteArray, Iterable<byte[]>> groupedRDD =
      (partitioner != null) ? pairRDD.groupByKey(partitioner) : pairRDD.groupByKey();

  return groupedRDD
      .mapToPair(CoderHelpers.fromByteFunctionIterable(keyCoder, wvCoder))
      .map(new TranslationUtils.FromPairFunction<>());
}
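One detail worth noting in Example 2: the serialized keys are wrapped in the runner's ByteArray class rather than kept as raw byte[], because Java arrays compare by identity, so groupByKey would never match two independently encoded copies of the same key. Below is a minimal sketch of that idea with plain Spark types; BytesKey and the UTF-8 encoding are stand-ins for the wrapper and coders, not part of Beam or Spark.

import java.io.Serializable;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;

import scala.Tuple2;

public class BytesKeyGroupExample {

  // Wrapper that gives byte[] keys value semantics (equals/hashCode by content),
  // so the shuffle can match identical encoded keys.
  public static final class BytesKey implements Serializable {
    private final byte[] bytes;

    public BytesKey(byte[] bytes) { this.bytes = bytes; }

    @Override
    public boolean equals(Object o) {
      return o instanceof BytesKey && Arrays.equals(bytes, ((BytesKey) o).bytes);
    }

    @Override
    public int hashCode() {
      return Arrays.hashCode(bytes);
    }
  }

  public static void main(String[] args) {
    SparkConf conf = new SparkConf().setAppName("bytesKeyGroup").setMaster("local[*]");
    try (JavaSparkContext sc = new JavaSparkContext(conf)) {
      List<Tuple2<String, Integer>> data = Arrays.asList(
          new Tuple2<>("a", 1), new Tuple2<>("a", 2), new Tuple2<>("b", 3));

      // Encode each key to bytes (standing in for a real coder), then group.
      JavaPairRDD<BytesKey, Integer> encoded = sc.parallelizePairs(data)
          .mapToPair(kv -> new Tuple2<>(new BytesKey(kv._1().getBytes(StandardCharsets.UTF_8)), kv._2()));

      JavaPairRDD<BytesKey, Iterable<Integer>> grouped = encoded.groupByKey();
      System.out.println(grouped.count()); // 2 groups: "a" and "b"
    }
  }
}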
 
Example 3
Source File: RankConverter.java    From spork with Apache License 2.0
@Override
public RDD<Tuple> convert(List<RDD<Tuple>> predecessors, PORank poRank)
		throws IOException {
	SparkUtil.assertPredecessorSize(predecessors, poRank, 1);
	RDD<Tuple> rdd = predecessors.get(0);
	JavaPairRDD<Integer, Long> javaPairRdd = rdd.toJavaRDD()
			.mapToPair(new ToPairRdd());
	JavaPairRDD<Integer, Iterable<Long>> groupedByIndex = javaPairRdd
			.groupByKey();
	JavaPairRDD<Integer, Long> countsByIndex = groupedByIndex
			.mapToPair(new IndexCounters());
	JavaPairRDD<Integer, Long> sortedCountsByIndex = countsByIndex
			.sortByKey(true);
	Map<Integer, Long> counts = sortedCountsByIndex.collectAsMap();
	JavaRDD<Tuple> finalRdd = rdd.toJavaRDD()
			.map(new RankFunction(new HashMap<Integer, Long>(counts)));
	return finalRdd.rdd();
}
 
Example 4
Source File: HaplotypeCallerSpark.java    From gatk-protected with BSD 3-Clause "New" or "Revised" License
/**
 * Create an RDD of {@link Shard} from an RDD of {@link GATKRead}
 * @param shardBoundariesBroadcast  broadcast of an {@link OverlapDetector} loaded with the intervals that should be used for creating ReadShards
 * @param reads RDD of {@link GATKRead}
 * @return an RDD of reads grouped into potentially overlapping shards
 */
private static JavaRDD<Shard<GATKRead>> createReadShards(final Broadcast<OverlapDetector<ShardBoundary>> shardBoundariesBroadcast, final JavaRDD<GATKRead> reads) {
    final JavaPairRDD<ShardBoundary, GATKRead> paired = reads.flatMapToPair(read -> {
        final Collection<ShardBoundary> overlappingShards = shardBoundariesBroadcast.value().getOverlaps(read);
        return overlappingShards.stream().map(key -> new Tuple2<>(key, read)).iterator();
    });
    final JavaPairRDD<ShardBoundary, Iterable<GATKRead>> shardsWithReads = paired.groupByKey();
    return shardsWithReads.map(shard -> new SparkReadShard(shard._1(), shard._2()));
}
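Example 4 demonstrates a common pattern for overlapping buckets: flatMapToPair emits one (shard, read) pair per overlap, so a single read can be assigned to several shards, and groupByKey then collects each shard's reads. Here is a small self-contained sketch of the same pattern using plain integer positions and ranges in place of GATK's ShardBoundary and GATKRead types; all names and values are illustrative.

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

import scala.Tuple2;

public class OverlappingBucketsExample {
  public static void main(String[] args) {
    SparkConf conf = new SparkConf().setAppName("overlappingBuckets").setMaster("local[*]");
    try (JavaSparkContext sc = new JavaSparkContext(conf)) {
      // Positions standing in for reads; buckets are half-open ranges [0,10) and [5,15).
      JavaRDD<Integer> positions = sc.parallelize(Arrays.asList(3, 7, 12));
      List<int[]> buckets = Arrays.asList(new int[]{0, 10}, new int[]{5, 15});

      // Emit one (bucketIndex, position) pair per bucket the position falls into,
      // so position 7 is assigned to both buckets.
      JavaPairRDD<Integer, Integer> assigned = positions.flatMapToPair(pos -> {
        List<Tuple2<Integer, Integer>> out = new ArrayList<>();
        for (int i = 0; i < buckets.size(); i++) {
          int[] b = buckets.get(i);
          if (pos >= b[0] && pos < b[1]) {
            out.add(new Tuple2<>(i, pos));
          }
        }
        return out.iterator();
      });

      // Collect every position that landed in the same bucket.
      JavaPairRDD<Integer, Iterable<Integer>> grouped = assigned.groupByKey();
      grouped.collectAsMap().forEach((bucket, members) ->
          System.out.println("bucket " + bucket + " -> " + members)); // e.g. bucket 0 -> [3, 7]
    }
  }
}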
 
Example 5
Source File: ProcessedOffsetManager.java    From kafka-spark-consumer with Apache License 2.0
@SuppressWarnings("deprecation")
public static <T> void persistsPartition(JavaRDD<MessageAndMetadata<T>> rdd, Properties props) throws Exception {
      // Map each RDD partition to (partition, offset) pairs.
      JavaPairRDD<String,Long> partitionOffsetRdd = rdd.mapPartitionsToPair(new PartitionOffsetPair<>());
      // Group all offsets under their partition key, in a single output partition, before collecting to the driver.
      JavaPairRDD<String, Iterable<Long>> partitonOffset = partitionOffsetRdd.groupByKey(1);
      List<Tuple2<String, Iterable<Long>>> poList = partitonOffset.collect();
      doPersists(poList, props);
}
 
Example 6
Source File: PageOneStepConvertRateSpark.java    From BigDataPlatform with GNU General Public License v3.0
public static void main(String[] args) {
	// 1. Build the Spark context
	SparkConf conf = new SparkConf()
			.setAppName(Constants.SPARK_APP_NAME_PAGE);
	SparkUtils.setMaster(conf);
	
	JavaSparkContext sc = new JavaSparkContext(conf);
	SQLContext sqlContext = SparkUtils.getSQLContext(sc.sc());
	
	// 2. Generate mock data
	SparkUtils.mockData(sc, sqlContext);  
	
	// 3. Look up the task and get its parameters
	Long taskid = ParamUtils.getTaskIdFromArgs(args, Constants.SPARK_LOCAL_TASKID_PAGE);
	
	ITaskDAO taskDAO = DAOFactory.getTaskDAO();
	Task task = taskDAO.findById(taskid);
	if(task == null) {
		System.out.println(new Date() + ": cannot find this task with id [" + taskid + "].");  
		return;
	}
	
	JSONObject taskParam = JSONObject.parseObject(task.getTaskParam());
	
	// 4. Query the user action data within the specified date range
	JavaRDD<Row> actionRDD = SparkUtils.getActionRDDByDateRange(
			sqlContext, taskParam);
	
	// Map the user action data into the <sessionid, action> format.
	// The page-visit slices have to be generated from each session's own actions;
	// a page slice generated outside the context of a session is meaningless.
	// For example, suppose user A visited pages 3 and 5,
	// and user B visited pages 4 and 6,
	// and the caller specified a page-flow filter such as page 3 -> page 4 -> page 7.
	// Could we then chain page 3 -> page 4 across the two users and count it as one page slice?
	// Of course not.
	// That is why page slices must be generated at the granularity of a single user session.
	
	JavaPairRDD<String, Row> sessionid2actionRDD = getSessionid2actionRDD(actionRDD);
	sessionid2actionRDD = sessionid2actionRDD.cache(); // persist(StorageLevel.MEMORY_ONLY)
	
	// Run a groupByKey over the <sessionid, action> RDD,
	// because we need each session's full set of actions in order to generate its page slices.
	JavaPairRDD<String, Iterable<Row>> sessionid2actionsRDD = sessionid2actionRDD.groupByKey();
	
	// The core step: generate each session's single-hop page slices and match them against the target page flow.
	JavaPairRDD<String, Integer> pageSplitRDD = generateAndMatchPageSplit(
			sc, sessionid2actionsRDD, taskParam);
	Map<String, Long> pageSplitPvMap = pageSplitRDD.countByKey();

	// Suppose the caller specified the page flow 3,2,5,8,6.
	// The pageSplitPvMap we now hold covers the slices 3->2, 2->5, 5->8, 8->6.
	Long startPagePv = getStartPagePv(taskParam, sessionid2actionsRDD);
	
	// Compute the conversion rate of each page slice in the target page flow
	Map<String, Double> convertRateMap = computePageSplitConvertRate(
			taskParam, pageSplitPvMap, startPagePv);
	
	// Persist the page-slice conversion rates
	persistConvertRate(taskid, convertRateMap);  
}