org.apache.spark.Partitioner Java Examples

The following examples show how to use org.apache.spark.Partitioner. They are drawn from open-source projects; the originating project and source file are noted above each example.
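Before the project examples, here is a minimal, hedged sketch of a custom Partitioner, to make the contract concrete: a Partitioner only has to report how many partitions exist and map each key to one of them. The class name ModuloPartitioner and its constructor argument are illustrative only and do not come from the projects below.

import org.apache.spark.Partitioner;

// Minimal illustrative Partitioner: buckets keys by a stable, non-negative hash.
public class ModuloPartitioner extends Partitioner {
  private final int numPartitions;

  public ModuloPartitioner(int numPartitions) {
    this.numPartitions = numPartitions;
  }

  @Override
  public int numPartitions() {
    return numPartitions;
  }

  @Override
  public int getPartition(Object key) {
    // Guard against null keys and negative hash codes.
    return Math.floorMod(key == null ? 0 : key.hashCode(), numPartitions);
  }
}

Such a partitioner is passed to the same hooks the examples below use, for instance pairRDD.partitionBy(new ModuloPartitioner(8)), pairRDD.groupByKey(partitioner), or pairRDD.repartitionAndSortWithinPartitions(partitioner).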
Example #1
Source File: MatrixIndexingSPInstruction.java    From systemds with Apache License 2.0
/**
 * Wraps the input RDD into a PartitionPruningRDD, which acts as a filter
 * of required partitions. The distinct set of required partitions is determined
 * via the partitioner of the input RDD.
 * 
 * @param in input matrix as {@code JavaPairRDD<MatrixIndexes,MatrixBlock>}
 * @param filter partition filter
 * @return matrix as {@code JavaPairRDD<MatrixIndexes,MatrixBlock>}
 */
private static JavaPairRDD<MatrixIndexes,MatrixBlock> createPartitionPruningRDD( 
		JavaPairRDD<MatrixIndexes,MatrixBlock> in, List<MatrixIndexes> filter )
{
	//build hashset of required partition ids
	HashSet<Integer> flags = new HashSet<>();
	Partitioner partitioner = in.rdd().partitioner().get();
	for( MatrixIndexes key : filter )
		flags.add(partitioner.getPartition(key));

	//create partition pruning rdd
	Function1<Object,Object> f = new PartitionPruningFunction(flags);
	PartitionPruningRDD<Tuple2<MatrixIndexes, MatrixBlock>> ppRDD = 
			PartitionPruningRDD.create(in.rdd(), f);

	//wrap output into java pair rdd
	return new JavaPairRDD<>(ppRDD, 
			ClassManifestFactory.fromClass(MatrixIndexes.class), 
			ClassManifestFactory.fromClass(MatrixBlock.class));
}
 
Example #2
Source File: GroupNonMergingWindowsFunctions.java    From beam with Apache License 2.0
/**
 * Creates a composite key of K and W and groups all values for that composite key with Spark's
 * repartitionAndSortWithinPartitions. The stream, sorted by composite key, is transformed into a
 * key paired with an iterator over all values for that key (via {@link GroupByKeyIterator}).
 *
 * <p>repartitionAndSortWithinPartitions is used because, unlike GroupByKey, it does not collect
 * all values into memory at once but streams them through an iterator, which minimizes memory
 * pressure.
 */
static <K, V, W extends BoundedWindow>
    JavaRDD<WindowedValue<KV<K, Iterable<V>>>> groupByKeyAndWindow(
        JavaRDD<WindowedValue<KV<K, V>>> rdd,
        Coder<K> keyCoder,
        Coder<V> valueCoder,
        WindowingStrategy<?, W> windowingStrategy,
        Partitioner partitioner) {
  final Coder<W> windowCoder = windowingStrategy.getWindowFn().windowCoder();
  FullWindowedValueCoder<KV<K, V>> windowedKvCoder =
      WindowedValue.FullWindowedValueCoder.of(KvCoder.of(keyCoder, valueCoder), windowCoder);
  JavaPairRDD<ByteArray, byte[]> windowInKey =
      bringWindowToKey(
          rdd, keyCoder, windowCoder, wv -> CoderHelpers.toByteArray(wv, windowedKvCoder));
  return windowInKey
      .repartitionAndSortWithinPartitions(getPartitioner(partitioner, rdd))
      .mapPartitions(
          it -> new GroupByKeyIterator<>(it, keyCoder, windowingStrategy, windowedKvCoder))
      .filter(Objects::nonNull); // filter last null element from GroupByKeyIterator
}
 
Example #3
Source File: GroupCombineFunctions.java    From beam with Apache License 2.0
/**
 * An implementation of {@link
 * org.apache.beam.runners.core.GroupByKeyViaGroupByKeyOnly.GroupByKeyOnly} for the Spark runner.
 */
public static <K, V> JavaRDD<KV<K, Iterable<WindowedValue<V>>>> groupByKeyOnly(
    JavaRDD<WindowedValue<KV<K, V>>> rdd,
    Coder<K> keyCoder,
    WindowedValueCoder<V> wvCoder,
    @Nullable Partitioner partitioner) {
  // we use coders to convert objects in the PCollection to byte arrays, so they
  // can be transferred over the network for the shuffle.
  JavaPairRDD<ByteArray, byte[]> pairRDD =
      rdd.map(new ReifyTimestampsAndWindowsFunction<>())
          .mapToPair(TranslationUtils.toPairFunction())
          .mapToPair(CoderHelpers.toByteFunction(keyCoder, wvCoder));

  // If no partitioner is passed, the default group by key operation is called
  JavaPairRDD<ByteArray, Iterable<byte[]>> groupedRDD =
      (partitioner != null) ? pairRDD.groupByKey(partitioner) : pairRDD.groupByKey();

  return groupedRDD
      .mapToPair(CoderHelpers.fromByteFunctionIterable(keyCoder, wvCoder))
      .map(new TranslationUtils.FromPairFunction<>());
}
 
Example #4
Source File: BaseCommitActionExecutor.java    From hudi with Apache License 2.0
@SuppressWarnings("unchecked")
protected Iterator<List<WriteStatus>> handleUpsertPartition(String instantTime, Integer partition, Iterator recordItr,
    Partitioner partitioner) {
  UpsertPartitioner upsertPartitioner = (UpsertPartitioner) partitioner;
  BucketInfo binfo = upsertPartitioner.getBucketInfo(partition);
  BucketType btype = binfo.bucketType;
  try {
    if (btype.equals(BucketType.INSERT)) {
      return handleInsert(binfo.fileIdPrefix, recordItr);
    } else if (btype.equals(BucketType.UPDATE)) {
      return handleUpdate(binfo.partitionPath, binfo.fileIdPrefix, recordItr);
    } else {
      throw new HoodieUpsertException("Unknown bucketType " + btype + " for partition :" + partition);
    }
  } catch (Throwable t) {
    String msg = "Error upserting bucketType " + btype + " for partition :" + partition;
    LOG.error(msg, t);
    throw new HoodieUpsertException(msg, t);
  }
}
 
Example #5
Source File: HoodieBloomIndex.java    From hudi with Apache License 2.0
/**
 * Finds the <RowKey, filename> pairs. All workload is grouped at the file level.
 * <p>
 * Joins PairRDD(PartitionPath, RecordKey) and PairRDD(PartitionPath, File), then repartitions such
 * that each RDD partition is a file; for each file we (1) load the bloom filter, (2) load the
 * rowKeys, and (3) tag each rowKey.
 * <p>
 * Makes sure the parallelism is at least the group-by parallelism for tagging location.
 */
JavaPairRDD<HoodieKey, HoodieRecordLocation> findMatchingFilesForRecordKeys(
    final Map<String, List<BloomIndexFileInfo>> partitionToFileIndexInfo,
    JavaPairRDD<String, String> partitionRecordKeyPairRDD, int shuffleParallelism, HoodieTable hoodieTable,
    Map<String, Long> fileGroupToComparisons) {
  JavaRDD<Tuple2<String, HoodieKey>> fileComparisonsRDD =
      explodeRecordRDDWithFileComparisons(partitionToFileIndexInfo, partitionRecordKeyPairRDD);

  if (config.useBloomIndexBucketizedChecking()) {
    Partitioner partitioner = new BucketizedBloomCheckPartitioner(shuffleParallelism, fileGroupToComparisons,
        config.getBloomIndexKeysPerBucket());

    fileComparisonsRDD = fileComparisonsRDD.mapToPair(t -> new Tuple2<>(Pair.of(t._1, t._2.getRecordKey()), t))
        .repartitionAndSortWithinPartitions(partitioner).map(Tuple2::_2);
  } else {
    fileComparisonsRDD = fileComparisonsRDD.sortBy(Tuple2::_1, true, shuffleParallelism);
  }

  return fileComparisonsRDD.mapPartitionsWithIndex(new HoodieBloomIndexCheckFunction(hoodieTable, config), true)
      .flatMap(List::iterator).filter(lr -> lr.getMatchingRecordKeys().size() > 0)
      .flatMapToPair(lookupResult -> lookupResult.getMatchingRecordKeys().stream()
          .map(recordKey -> new Tuple2<>(new HoodieKey(recordKey, lookupResult.getPartitionPath()),
              new HoodieRecordLocation(lookupResult.getBaseInstantTime(), lookupResult.getFileId())))
          .collect(Collectors.toList()).iterator());
}
 
Example #6
Source File: TransformTranslator.java    From beam with Apache License 2.0
private static <K, V, OutputT> JavaPairRDD<TupleTag<?>, WindowedValue<?>> statefulParDoTransform(
    KvCoder<K, V> kvCoder,
    Coder<? extends BoundedWindow> windowCoder,
    JavaRDD<WindowedValue<KV<K, V>>> kvInRDD,
    Partitioner partitioner,
    MultiDoFnFunction<KV<K, V>, OutputT> doFnFunction,
    boolean requiresSortedInput) {
  Coder<K> keyCoder = kvCoder.getKeyCoder();

  final WindowedValue.WindowedValueCoder<V> wvCoder =
      WindowedValue.FullWindowedValueCoder.of(kvCoder.getValueCoder(), windowCoder);

  if (!requiresSortedInput) {
    return GroupCombineFunctions.groupByKeyOnly(kvInRDD, keyCoder, wvCoder, partitioner)
        .map(
            input -> {
              final K key = input.getKey();
              Iterable<WindowedValue<V>> value = input.getValue();
              return FluentIterable.from(value)
                  .transform(
                      windowedValue ->
                          windowedValue.withValue(KV.of(key, windowedValue.getValue())))
                  .iterator();
            })
        .flatMapToPair(doFnFunction);
  }

  JavaPairRDD<ByteArray, byte[]> pairRDD =
      kvInRDD
          .map(new ReifyTimestampsAndWindowsFunction<>())
          .mapToPair(TranslationUtils.toPairFunction())
          .mapToPair(
              CoderHelpers.toByteFunctionWithTs(keyCoder, wvCoder, in -> in._2().getTimestamp()));

  JavaPairRDD<ByteArray, byte[]> sorted =
      pairRDD.repartitionAndSortWithinPartitions(keyPrefixPartitionerFrom(partitioner));

  return sorted.mapPartitionsToPair(wrapDoFnFromSortedRDD(doFnFunction, keyCoder, wvCoder));
}
 
Example #7
Source File: CommitActionExecutor.java    From hudi with Apache License 2.0
@Override
public Partitioner getUpsertPartitioner(WorkloadProfile profile) {
  if (profile == null) {
    throw new HoodieUpsertException("Need workload profile to construct the upsert partitioner.");
  }
  return new UpsertPartitioner(profile, jsc, table, config);
}
 
Example #8
Source File: BaseCommitActionExecutor.java    From hudi with Apache License 2.0
public HoodieWriteMetadata execute(JavaRDD<HoodieRecord<T>> inputRecordsRDD) {
  HoodieWriteMetadata result = new HoodieWriteMetadata();
  // Cache the tagged records, so we don't end up computing them twice
  // TODO: Consistent contract in HoodieWriteClient regarding preppedRecord storage level handling
  if (inputRecordsRDD.getStorageLevel() == StorageLevel.NONE()) {
    inputRecordsRDD.persist(StorageLevel.MEMORY_AND_DISK_SER());
  } else {
    LOG.info("RDD PreppedRecords was persisted at: " + inputRecordsRDD.getStorageLevel());
  }

  WorkloadProfile profile = null;
  if (isWorkloadProfileNeeded()) {
    profile = new WorkloadProfile(inputRecordsRDD);
    LOG.info("Workload profile :" + profile);
    saveWorkloadProfileMetadataToInflight(profile, instantTime);
  }

  // partition using the insert partitioner
  final Partitioner partitioner = getPartitioner(profile);
  JavaRDD<HoodieRecord<T>> partitionedRecords = partition(inputRecordsRDD, partitioner);
  JavaRDD<WriteStatus> writeStatusRDD = partitionedRecords.mapPartitionsWithIndex((partition, recordItr) -> {
    if (WriteOperationType.isChangingRecords(operationType)) {
      return handleUpsertPartition(instantTime, partition, recordItr, partitioner);
    } else {
      return handleInsertPartition(instantTime, partition, recordItr, partitioner);
    }
  }, true).flatMap(List::iterator);

  updateIndexAndCommitIfNeeded(writeStatusRDD, result);
  return result;
}
 
Example #9
Source File: SparkJavaPairRDD.java    From incubator-nemo with Apache License 2.0
@Override
public <U> SparkJavaPairRDD<K, U> aggregateByKey(final U zeroValue,
                                                 final Partitioner partitioner,
                                                 final Function2<U, V, U> seqFunc,
                                                 final Function2<U, U, U> combFunc) {
  throw new UnsupportedOperationException(NOT_YET_SUPPORTED);
}
 
Example #10
Source File: SparkJavaPairRDD.java    From incubator-nemo with Apache License 2.0
@Override
public <C> SparkJavaPairRDD<K, C> combineByKey(final Function<V, C> createCombiner,
                                               final Function2<C, V, C> mergeValue,
                                               final Function2<C, C, C> mergeCombiners,
                                               final Partitioner partitioner) {
  throw new UnsupportedOperationException(NOT_YET_SUPPORTED);
}
 
Example #11
Source File: DeltaCommitActionExecutor.java    From hudi with Apache License 2.0
@Override
public Partitioner getUpsertPartitioner(WorkloadProfile profile) {
  if (profile == null) {
    throw new HoodieUpsertException("Need workload profile to construct the upsert partitioner.");
  }
  mergeOnReadUpsertPartitioner = new UpsertDeltaCommitPartitioner(profile, jsc, table, config);
  return mergeOnReadUpsertPartitioner;
}
 
Example #12
Source File: DataStep.java    From envelope with Apache License 2.0
private Partitioner getPartitioner(JavaPairRDD<Row, Row> keyedArriving) {
  Config partitionerConfig;
  
  if (hasPartitioner()) {
    partitionerConfig = config.getConfig(PARTITIONER_TYPE);
  }
  else {
    partitionerConfig = ConfigFactory.empty().withValue(
        PartitionerFactory.TYPE_CONFIG_NAME, ConfigValueFactory.fromAnyRef("range"));
  }
  
  return PartitionerFactory.create(partitionerConfig, keyedArriving);
}
 
Example #13
Source File: TestPartitionerFactory.java    From envelope with Apache License 2.0
@Test
public void testHash() {
  Map<String, Object> configMap = Maps.newHashMap();
  configMap.put("type", "hash");
  
  JavaPairRDD<Row, Row> base = getDummyRDD(10);
  Config config = ConfigFactory.parseMap(configMap);
  Partitioner p = PartitionerFactory.create(config, base);
  
  assertTrue(p instanceof HashPartitioner);
  assertEquals(p.numPartitions(), 10);
}
 
Example #14
Source File: TestPartitionerFactory.java    From envelope with Apache License 2.0
@Test
public void testRange() {
  Map<String, Object> configMap = Maps.newHashMap();
  configMap.put("type", "range");
  
  JavaPairRDD<Row, Row> base = getDummyRDD(10);
  Config config = ConfigFactory.parseMap(configMap);
  Partitioner p = PartitionerFactory.create(config, base);
  
  assertTrue(p instanceof RangePartitioner);
  assertEquals(p.numPartitions(), 10);
}
 
Example #15
Source File: BaseCommitActionExecutor.java    From hudi with Apache License 2.0
private Partitioner getPartitioner(WorkloadProfile profile) {
  if (WriteOperationType.isChangingRecords(operationType)) {
    return getUpsertPartitioner(profile);
  } else {
    return getInsertPartitioner(profile);
  }
}
 
Example #16
Source File: TransformTranslator.java    From beam with Apache License 2.0
private static Partitioner keyPrefixPartitionerFrom(Partitioner partitioner) {
  return new Partitioner() {
    @Override
    public int numPartitions() {
      return partitioner.numPartitions();
    }

    @Override
    public int getPartition(Object o) {
      ByteArray b = (ByteArray) o;
      // Strip the trailing 8 bytes (the timestamp that toByteFunctionWithTs appends for sorting)
      // so the wrapped partitioner partitions on the serialized key prefix only.
      return partitioner.getPartition(
          new ByteArray(Arrays.copyOfRange(b.getValue(), 0, b.getValue().length - 8)));
    }
  };
}
 
Example #17
Source File: TransformTranslator.java    From beam with Apache License 2.0
@Nullable
private static Partitioner getPartitioner(EvaluationContext context) {
  Long bundleSize =
      context.getSerializableOptions().get().as(SparkPipelineOptions.class).getBundleSize();
  return (bundleSize > 0)
      ? null
      : new HashPartitioner(context.getSparkContext().defaultParallelism());
}
 
Example #18
Source File: SparkJavaPairRDD.java    From incubator-nemo with Apache License 2.0
@Override
public <C> SparkJavaPairRDD<K, C> combineByKey(final Function<V, C> createCombiner,
                                               final Function2<C, V, C> mergeValue,
                                               final Function2<C, C, C> mergeCombiners,
                                               final Partitioner partitioner,
                                               final boolean mapSideCombine,
                                               final Serializer serializer) {
  throw new UnsupportedOperationException(NOT_YET_SUPPORTED);
}
 
Example #19
Source File: SparkBatchPortablePipelineTranslator.java    From beam with Apache License 2.0
private static <K, V> void translateGroupByKey(
    PTransformNode transformNode, RunnerApi.Pipeline pipeline, SparkTranslationContext context) {

  RunnerApi.Components components = pipeline.getComponents();
  String inputId = getInputId(transformNode);
  Dataset inputDataset = context.popDataset(inputId);
  JavaRDD<WindowedValue<KV<K, V>>> inputRdd = ((BoundedDataset<KV<K, V>>) inputDataset).getRDD();
  WindowedValueCoder<KV<K, V>> inputCoder = getWindowedValueCoder(inputId, components);
  KvCoder<K, V> inputKvCoder = (KvCoder<K, V>) inputCoder.getValueCoder();
  Coder<K> inputKeyCoder = inputKvCoder.getKeyCoder();
  Coder<V> inputValueCoder = inputKvCoder.getValueCoder();
  WindowingStrategy windowingStrategy = getWindowingStrategy(inputId, components);
  WindowFn<Object, BoundedWindow> windowFn = windowingStrategy.getWindowFn();
  WindowedValue.WindowedValueCoder<V> wvCoder =
      WindowedValue.FullWindowedValueCoder.of(inputValueCoder, windowFn.windowCoder());

  JavaRDD<WindowedValue<KV<K, Iterable<V>>>> groupedByKeyAndWindow;
  Partitioner partitioner = getPartitioner(context);
  if (GroupNonMergingWindowsFunctions.isEligibleForGroupByWindow(windowingStrategy)) {
    // we can have a memory sensitive translation for non-merging windows
    groupedByKeyAndWindow =
        GroupNonMergingWindowsFunctions.groupByKeyAndWindow(
            inputRdd, inputKeyCoder, inputValueCoder, windowingStrategy, partitioner);
  } else {
    JavaRDD<KV<K, Iterable<WindowedValue<V>>>> groupedByKeyOnly =
        GroupCombineFunctions.groupByKeyOnly(inputRdd, inputKeyCoder, wvCoder, partitioner);
    // for batch, GroupAlsoByWindow uses an in-memory StateInternals.
    groupedByKeyAndWindow =
        groupedByKeyOnly.flatMap(
            new SparkGroupAlsoByWindowViaOutputBufferFn<>(
                windowingStrategy,
                new TranslationUtils.InMemoryStateInternalsFactory<>(),
                SystemReduceFn.buffering(inputValueCoder),
                context.serializablePipelineOptions));
  }
  context.pushDataset(getOutputId(transformNode), new BoundedDataset<>(groupedByKeyAndWindow));
}
 
Example #20
Source File: SparkBatchPortablePipelineTranslator.java    From beam with Apache License 2.0
@Nullable
private static Partitioner getPartitioner(SparkTranslationContext context) {
  Long bundleSize =
      context.serializablePipelineOptions.get().as(SparkPipelineOptions.class).getBundleSize();
  return (bundleSize > 0)
      ? null
      : new HashPartitioner(context.getSparkContext().defaultParallelism());
}
 
Example #21
Source File: ParamservUtils.java    From systemds with Apache License 2.0
@SuppressWarnings("unchecked")
public static JavaPairRDD<Integer, Tuple2<MatrixBlock, MatrixBlock>> doPartitionOnSpark(SparkExecutionContext sec, MatrixObject features, MatrixObject labels, Statement.PSScheme scheme, int workerNum) {
	Timing tSetup = DMLScript.STATISTICS ? new Timing(true) : null;
	// Get input RDD
	JavaPairRDD<MatrixIndexes, MatrixBlock> featuresRDD = (JavaPairRDD<MatrixIndexes, MatrixBlock>)
		sec.getRDDHandleForMatrixObject(features, FileFormat.BINARY);
	JavaPairRDD<MatrixIndexes, MatrixBlock> labelsRDD = (JavaPairRDD<MatrixIndexes, MatrixBlock>)
		sec.getRDDHandleForMatrixObject(labels, FileFormat.BINARY);

	DataPartitionerSparkMapper mapper = new DataPartitionerSparkMapper(scheme, workerNum, sec, (int) features.getNumRows());
	JavaPairRDD<Integer, Tuple2<MatrixBlock, MatrixBlock>> result = ParamservUtils
		.assembleTrainingData(featuresRDD, labelsRDD) // Combine features and labels into a pair (rowBlockID => (features, labels))
		.flatMapToPair(mapper) // Do the data partitioning on spark (workerID => (rowBlockID, (single row features, single row labels)))
		// Aggregate the partitioned matrix according to rowID for each worker
		// i.e. (workerID => ordered list[(rowBlockID, (single row features, single row labels))])
		.aggregateByKey(new LinkedList<Tuple2<Long, Tuple2<MatrixBlock, MatrixBlock>>>(), new Partitioner() {
			private static final long serialVersionUID = -7937781374718031224L;
			@Override
			public int getPartition(Object workerID) {
				return (int) workerID;
			}
			@Override
			public int numPartitions() {
				return workerNum;
			}
		}, (list, input) -> {
			list.add(input);
			return list;
		}, (l1, l2) -> {
			l1.addAll(l2);
			l1.sort((o1, o2) -> o1._1.compareTo(o2._1));
			return l1;
		})
		.mapToPair(new DataPartitionerSparkAggregator(features.getNumColumns(), labels.getNumColumns()));

	if (DMLScript.STATISTICS)
		Statistics.accPSSetupTime((long) tSetup.stop());
	return result;
}
 
Example #22
Source File: ParamservUtils.java    From systemds with Apache License 2.0
@SuppressWarnings("unchecked")
public static JavaPairRDD<Integer, Tuple2<MatrixBlock, MatrixBlock>> doPartitionOnSpark(SparkExecutionContext sec, MatrixObject features, MatrixObject labels, Statement.PSScheme scheme, int workerNum) {
	Timing tSetup = DMLScript.STATISTICS ? new Timing(true) : null;
	// Get input RDD
	JavaPairRDD<MatrixIndexes, MatrixBlock> featuresRDD = (JavaPairRDD<MatrixIndexes, MatrixBlock>)
		sec.getRDDHandleForMatrixObject(features, InputInfo.BinaryBlockInputInfo);
	JavaPairRDD<MatrixIndexes, MatrixBlock> labelsRDD = (JavaPairRDD<MatrixIndexes, MatrixBlock>)
		sec.getRDDHandleForMatrixObject(labels, InputInfo.BinaryBlockInputInfo);

	DataPartitionerSparkMapper mapper = new DataPartitionerSparkMapper(scheme, workerNum, sec, (int) features.getNumRows());
	JavaPairRDD<Integer, Tuple2<MatrixBlock, MatrixBlock>> result = ParamservUtils
		.assembleTrainingData(featuresRDD, labelsRDD) // Combine features and labels into a pair (rowBlockID => (features, labels))
		.flatMapToPair(mapper) // Do the data partitioning on spark (workerID => (rowBlockID, (single row features, single row labels)))
		// Aggregate the partitioned matrix according to rowID for each worker
		// i.e. (workerID => ordered list[(rowBlockID, (single row features, single row labels))])
		.aggregateByKey(new LinkedList<Tuple2<Long, Tuple2<MatrixBlock, MatrixBlock>>>(), new Partitioner() {
			private static final long serialVersionUID = -7937781374718031224L;
			@Override
			public int getPartition(Object workerID) {
				return (int) workerID;
			}
			@Override
			public int numPartitions() {
				return workerNum;
			}
		}, (list, input) -> {
			list.add(input);
			return list;
		}, (l1, l2) -> {
			l1.addAll(l2);
			l1.sort((o1, o2) -> o1._1.compareTo(o2._1));
			return l1;
		})
		.mapToPair(new DataPartitionerSparkAggregator(features.getNumColumns(), labels.getNumColumns()));

	if (DMLScript.STATISTICS)
		Statistics.accPSSetupTime((long) tSetup.stop());
	return result;
}
 
Example #23
Source File: SparkJavaPairRDD.java    From incubator-nemo with Apache License 2.0
@Override
public <W1, W2, W3> SparkJavaPairRDD<K, Tuple4<Iterable<V>, Iterable<W1>, Iterable<W2>, Iterable<W3>>>
cogroup(final org.apache.spark.api.java.JavaPairRDD<K, W1> other1,
        final org.apache.spark.api.java.JavaPairRDD<K, W2> other2,
        final org.apache.spark.api.java.JavaPairRDD<K, W3> other3,
        final Partitioner partitioner) {
  throw new UnsupportedOperationException(NOT_YET_SUPPORTED);
}
 
Example #24
Source File: SparkJavaPairRDD.java    From incubator-nemo with Apache License 2.0
@Override
public <W1, W2> SparkJavaPairRDD<K, Tuple3<Iterable<V>, Iterable<W1>, Iterable<W2>>>
cogroup(final org.apache.spark.api.java.JavaPairRDD<K, W1> other1,
        final org.apache.spark.api.java.JavaPairRDD<K, W2> other2,
        final Partitioner partitioner) {
  throw new UnsupportedOperationException(NOT_YET_SUPPORTED);
}
 
Example #25
Source File: SparkJavaPairRDD.java    From incubator-nemo with Apache License 2.0
@Override
public <W> SparkJavaPairRDD<K, Tuple2<Optional<V>, Optional<W>>>
fullOuterJoin(final org.apache.spark.api.java.JavaPairRDD<K, W> other,
              final Partitioner partitioner) {
  throw new UnsupportedOperationException(NOT_YET_SUPPORTED);
}
 
Example #26
Source File: SparkJavaPairRDD.java    From incubator-nemo with Apache License 2.0
@Override
public SparkJavaPairRDD<K, V> foldByKey(final V zeroValue,
                                        final Partitioner partitioner,
                                        final Function2<V, V, V> func) {
  throw new UnsupportedOperationException(NOT_YET_SUPPORTED);
}
 
Example #27
Source File: SparkJavaPairRDD.java    From incubator-nemo with Apache License 2.0
@Override
public SparkJavaPairRDD<K, Iterable<V>> groupByKey(final Partitioner partitioner) {
  throw new UnsupportedOperationException(NOT_YET_SUPPORTED);
}
 
Example #28
Source File: SourceRDD.java    From beam with Apache License 2.0
@Override
public Option<Partitioner> partitioner() {
  // setting the partitioner helps to "keep" the same partitioner in the following
  // mapWithState read for Read.Unbounded, preventing a post-mapWithState shuffle.
  return scala.Some.apply(partitioner);
}
 
Example #29
Source File: SparkJavaPairRDD.java    From incubator-nemo with Apache License 2.0
@Override
public SparkJavaPairRDD<K, V> subtract(final org.apache.spark.api.java.JavaPairRDD<K, V> other,
                                       final Partitioner p) {
  throw new UnsupportedOperationException(NOT_YET_SUPPORTED);
}