Java Code Examples for org.apache.spark.api.java.JavaRDD#sortBy()

The following examples show how to use org.apache.spark.api.java.JavaRDD#sortBy(). Each snippet is taken from a real open-source project; the source file and license are noted above each example.
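As a warm-up, here is a minimal, self-contained sketch of the three-argument overload used throughout these examples, sortBy(keyFunction, ascending, numPartitions). The class name, data, and local master setting are illustrative only, not taken from any of the projects below.

import java.util.Arrays;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

public class SortByDemo {
  public static void main(String[] args) {
    SparkConf conf = new SparkConf().setAppName("sortBy-demo").setMaster("local[*]");
    try (JavaSparkContext sc = new JavaSparkContext(conf)) {
      JavaRDD<String> words = sc.parallelize(Arrays.asList("pear", "fig", "banana"));
      // sortBy takes a key-extractor function, an ascending flag, and a target partition count
      JavaRDD<String> byLength = words.sortBy(String::length, true, 2);
      System.out.println(byLength.collect()); // [fig, pear, banana]
    }
  }
}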
Example 1
Source File: RDDSortUtils.java    From systemds with Apache License 2.0
public static JavaPairRDD<MatrixIndexes, MatrixBlock> sortByVal( JavaPairRDD<MatrixIndexes, MatrixBlock> in, long rlen, int blen )
{
	//create value-index rdd from inputs
	JavaRDD<Double> dvals = in.values()
			.flatMap(new ExtractDoubleValuesFunction());

	//sort (creates sorted range per partition)
	long hdfsBlocksize = InfrastructureAnalyzer.getHDFSBlockSize();
	int numPartitions = (int)Math.ceil(((double)rlen*8)/hdfsBlocksize);
	JavaRDD<Double> sdvals = dvals
			.sortBy(new CreateDoubleKeyFunction(), true, numPartitions);
	
	//create binary block output
	JavaPairRDD<MatrixIndexes, MatrixBlock> ret = sdvals
			.zipWithIndex()
			.mapPartitionsToPair(new ConvertToBinaryBlockFunction(rlen, blen));
	ret = RDDAggregateUtils.mergeByKey(ret, false);
	
	return ret;
}
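The partition count above is sized from the data: rlen doubles at 8 bytes each, divided by the HDFS block size, rounded up. A standalone sketch of that heuristic (the 128 MB block size in the usage comment is an assumed default, not queried from a cluster):

// hypothetical standalone version of the partition-count heuristic above:
// one partition per HDFS block's worth of 8-byte double values
static int numPartitionsForDoubles(long rlen, long hdfsBlockSize) {
  return (int) Math.ceil(((double) rlen * 8) / hdfsBlockSize);
}
// e.g. numPartitionsForDoubles(100_000_000L, 128L * 1024 * 1024) == 6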
 
Example 2
Source File: RDDSortUtils.java    From systemds with Apache License 2.0
public static JavaPairRDD<MatrixIndexes, MatrixBlock> sortByVal( JavaPairRDD<MatrixIndexes, MatrixBlock> in, 
		JavaPairRDD<MatrixIndexes, MatrixBlock> in2, long rlen, int blen )
{
	//create value-index rdd from inputs
	JavaRDD<DoublePair> dvals = in.join(in2).values()
		.flatMap(new ExtractDoubleValuesFunction2());

	//sort (creates sorted range per partition)
	long hdfsBlocksize = InfrastructureAnalyzer.getHDFSBlockSize();
	int numPartitions = (int)Math.ceil(((double)rlen*8)/hdfsBlocksize);
	JavaRDD<DoublePair> sdvals = dvals
		.sortBy(new CreateDoubleKeyFunction2(), true, numPartitions);

	//create binary block output
	JavaPairRDD<MatrixIndexes, MatrixBlock> ret = sdvals
		.zipWithIndex()
		.mapPartitionsToPair(new ConvertToBinaryBlockFunction2(rlen, blen));
	ret = RDDAggregateUtils.mergeByKey(ret, false);		
	
	return ret;
}
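Example 2 differs from Example 1 only in its input: two block-aligned matrices are joined on their MatrixIndexes keys before the paired values are flattened and sorted. A one-line sketch of that join step, using the same types as above:

// in and in2 share MatrixIndexes keys, so join pairs up corresponding blocks
JavaPairRDD<MatrixIndexes, Tuple2<MatrixBlock, MatrixBlock>> joined = in.join(in2);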
 
Example 3
Source File: RDDSortUtils.java    From systemds with Apache License 2.0
public static JavaPairRDD<MatrixIndexes, MatrixBlock> sortByVals(
	JavaPairRDD<MatrixIndexes, MatrixBlock> in, long rlen, long clen, int blen )
{
	//create value-index rdd from inputs
	JavaRDD<MatrixBlock> dvals = in.values()
		.flatMap(new ExtractRowsFunction());
	
	//sort (creates sorted range per partition)
	int numPartitions = SparkUtils.getNumPreferredPartitions(
		new MatrixCharacteristics(rlen, clen, blen, blen), in);
	JavaRDD<MatrixBlock> sdvals = dvals
		.sortBy(new CreateDoubleKeysFunction(), true, numPartitions);
	
	//create binary block output
	JavaPairRDD<MatrixIndexes, MatrixBlock> ret = sdvals
		.zipWithIndex()
		.mapPartitionsToPair(new ConvertToBinaryBlockFunction5(rlen, blen));
	ret = RDDAggregateUtils.mergeByKey(ret, false);
	
	return ret;
}
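Examples 1 through 3 share one pattern: a global sortBy followed by zipWithIndex, which attaches to each element its 0-based rank across all partitions, from which the target block index can then be computed. A minimal illustration with hypothetical data (assuming an existing JavaSparkContext sc):

import java.util.Arrays;

import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;

JavaRDD<Double> vals = sc.parallelize(Arrays.asList(3.0, 1.0, 2.0));
// sortBy gives a total order; zipWithIndex then assigns global positions
JavaPairRDD<Double, Long> ranked = vals.sortBy(d -> d, true, 2).zipWithIndex();
System.out.println(ranked.collect()); // [(1.0,0), (2.0,1), (3.0,2)]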
 
Example 4
Source File: HoodieBloomIndex.java    From hudi with Apache License 2.0
/**
 * Find the <RowKey, filename> pairs. All workload is grouped at file level.
 * <p>
 * Join PairRDD(PartitionPath, RecordKey) and PairRDD(PartitionPath, File), then repartition such that each RDD
 * partition is a file; for each file, (1) load the bloom filter, (2) load the row keys, (3) tag each row key.
 * <p>
 * Make sure the parallelism is at least the group-by parallelism for tagging location.
 */
JavaPairRDD<HoodieKey, HoodieRecordLocation> findMatchingFilesForRecordKeys(
    final Map<String, List<BloomIndexFileInfo>> partitionToFileIndexInfo,
    JavaPairRDD<String, String> partitionRecordKeyPairRDD, int shuffleParallelism, HoodieTable hoodieTable,
    Map<String, Long> fileGroupToComparisons) {
  JavaRDD<Tuple2<String, HoodieKey>> fileComparisonsRDD =
      explodeRecordRDDWithFileComparisons(partitionToFileIndexInfo, partitionRecordKeyPairRDD);

  if (config.useBloomIndexBucketizedChecking()) {
    Partitioner partitioner = new BucketizedBloomCheckPartitioner(shuffleParallelism, fileGroupToComparisons,
        config.getBloomIndexKeysPerBucket());

    fileComparisonsRDD = fileComparisonsRDD.mapToPair(t -> new Tuple2<>(Pair.of(t._1, t._2.getRecordKey()), t))
        .repartitionAndSortWithinPartitions(partitioner).map(Tuple2::_2);
  } else {
    fileComparisonsRDD = fileComparisonsRDD.sortBy(Tuple2::_1, true, shuffleParallelism);
  }

  return fileComparisonsRDD.mapPartitionsWithIndex(new HoodieBloomIndexCheckFunction(hoodieTable, config), true)
      .flatMap(List::iterator).filter(lr -> lr.getMatchingRecordKeys().size() > 0)
      .flatMapToPair(lookupResult -> lookupResult.getMatchingRecordKeys().stream()
          .map(recordKey -> new Tuple2<>(new HoodieKey(recordKey, lookupResult.getPartitionPath()),
              new HoodieRecordLocation(lookupResult.getBaseInstantTime(), lookupResult.getFileId())))
          .collect(Collectors.toList()).iterator());
}
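When bucketized checking is off, the else branch falls back to a plain total sort of the comparison tuples by their first element. A hypothetical sketch of that pattern (assuming an existing JavaSparkContext sc; Tuple2::_1 extracts the String component, which sorts lexicographically):

import java.util.Arrays;

import org.apache.spark.api.java.JavaRDD;
import scala.Tuple2;

JavaRDD<Tuple2<String, String>> comparisons = sc.parallelize(Arrays.asList(
    new Tuple2<>("file-2", "key-a"), new Tuple2<>("file-1", "key-b")));
// sort all (fileId, key) tuples by file id so each partition scans contiguous files
JavaRDD<Tuple2<String, String>> sorted = comparisons.sortBy(Tuple2::_1, true, 2);

The bucketized branch instead pre-assigns tuples to buckets and sorts within partitions via repartitionAndSortWithinPartitions, avoiding a full global sort.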
 
Example 5
Source File: ALSUpdate.java    From oryx with Apache License 2.0
private static JavaPairRDD<String,Collection<String>> knownsRDD(JavaRDD<String[]> allData,
                                                                boolean knownItems) {
  JavaRDD<String[]> sorted = allData.sortBy(datum -> Long.valueOf(datum[3]), true, allData.partitions().size());

  JavaPairRDD<String,Tuple2<String,Boolean>> tuples = sorted.mapToPair(datum -> {
      String user = datum[0];
      String item = datum[1];
      Boolean delete = datum[2].isEmpty();
      return knownItems ?
          new Tuple2<>(user, new Tuple2<>(item, delete)) :
          new Tuple2<>(item, new Tuple2<>(user, delete));
    });

  // TODO likely need to figure out a way to avoid groupByKey but collectByKey
  // won't work here -- doesn't guarantee enough about ordering
  return tuples.groupByKey().mapValues(idDeletes -> {
      Collection<String> ids = new HashSet<>();
      for (Tuple2<String,Boolean> idDelete : idDeletes) {
        if (idDelete._2()) {
          ids.remove(idDelete._1());
        } else {
          ids.add(idDelete._1());
        }
      }
      return ids;
    });
}
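The key function passed to sortBy can return any comparable type; here it parses the timestamp field (column 3) into a Long so that rows are ordered by time before the add/delete bookkeeping. A minimal sketch with hypothetical rows (assuming an existing JavaSparkContext sc):

import java.util.Arrays;

import org.apache.spark.api.java.JavaRDD;

JavaRDD<String[]> rows = sc.parallelize(Arrays.asList(
    new String[] {"u1", "i1", "", "200"},
    new String[] {"u2", "i2", "", "100"}));
// sort by the numeric timestamp in column 3, keeping the input partition count
JavaRDD<String[]> byTime = rows.sortBy(r -> Long.valueOf(r[3]), true, rows.partitions().size());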
 
Example 6
Source File: BulkInsertHelper.java    From hudi with Apache License 2.0
public static <T extends HoodieRecordPayload<T>> HoodieWriteMetadata bulkInsert(
    JavaRDD<HoodieRecord<T>> inputRecords, String instantTime,
    HoodieTable<T> table, HoodieWriteConfig config,
    CommitActionExecutor<T> executor, boolean performDedupe,
    Option<UserDefinedBulkInsertPartitioner> bulkInsertPartitioner) {
  HoodieWriteMetadata result = new HoodieWriteMetadata();

  // De-dupe/merge if needed
  JavaRDD<HoodieRecord<T>> dedupedRecords = inputRecords;

  if (performDedupe) {
    dedupedRecords = WriteHelper.combineOnCondition(config.shouldCombineBeforeInsert(), inputRecords,
        config.getInsertShuffleParallelism(), ((HoodieTable<T>)table));
  }

  final JavaRDD<HoodieRecord<T>> repartitionedRecords;
  final int parallelism = config.getBulkInsertShuffleParallelism();
  if (bulkInsertPartitioner.isPresent()) {
    repartitionedRecords = bulkInsertPartitioner.get().repartitionRecords(dedupedRecords, parallelism);
  } else {
    // Now, sort the records and line them up nicely for loading.
    repartitionedRecords = dedupedRecords.sortBy(record -> {
      // Let's use "partitionPath + key" as the sort key. Spark will ensure
      // the records are split evenly across RDD partitions, such that small table partitions fit
      // into one RDD partition, while big ones spread evenly across multiple RDD partitions.
      return String.format("%s+%s", record.getPartitionPath(), record.getRecordKey());
    }, true, parallelism);
  }

  // generate new file ID prefixes for each output partition
  final List<String> fileIDPrefixes =
      IntStream.range(0, parallelism).mapToObj(i -> FSUtils.createNewFileIdPfx()).collect(Collectors.toList());

  table.getActiveTimeline().transitionRequestedToInflight(new HoodieInstant(State.REQUESTED,
      table.getMetaClient().getCommitActionType(), instantTime), Option.empty(),
      config.shouldAllowMultiWriteOnSameInstant());

  JavaRDD<WriteStatus> writeStatusRDD = repartitionedRecords
      .mapPartitionsWithIndex(new BulkInsertMapFunction<T>(instantTime, config, table, fileIDPrefixes), true)
      .flatMap(List::iterator);

  executor.updateIndexAndCommitIfNeeded(writeStatusRDD, result);
  return result;
}
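The sort key in the else branch is a composite string, partition path plus record key, so records come out clustered by table partition and ordered by key within it. A hypothetical fragment of the same idea, with (partitionPath, recordKey) pairs standing in for HoodieRecord (sc assumed):

import java.util.Arrays;

import org.apache.spark.api.java.JavaRDD;
import scala.Tuple2;

JavaRDD<Tuple2<String, String>> records = sc.parallelize(Arrays.asList(
    new Tuple2<>("2020/01/02", "id-9"), new Tuple2<>("2020/01/01", "id-1")));
// composite sort key: partition path first, then record key, as in the code above
JavaRDD<Tuple2<String, String>> sorted = records.sortBy(
    r -> String.format("%s+%s", r._1(), r._2()), true, 2);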