Java Code Examples for org.apache.spark.Partitioner

The following examples show how to use org.apache.spark.Partitioner. These examples are extracted from open source projects.
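Before the project-specific examples, here is a minimal, hypothetical sketch of what a custom org.apache.spark.Partitioner looks like in Java: it extends the abstract class and implements numPartitions() and getPartition(Object key). The class name ModuloPartitioner and its field are purely illustrative and are not taken from any of the projects below.

import org.apache.spark.Partitioner;

// Illustrative sketch only: routes each key to a partition by its hash code, modulo the partition count.
public class ModuloPartitioner extends Partitioner {
  private final int numParts;

  public ModuloPartitioner(int numParts) {
    this.numParts = numParts;
  }

  @Override
  public int numPartitions() {
    return numParts;
  }

  @Override
  public int getPartition(Object key) {
    // Normalize negative hash codes so the result is always a valid partition index.
    int mod = (key == null ? 0 : key.hashCode()) % numParts;
    return mod < 0 ? mod + numParts : mod;
  }
}

Such a partitioner is typically passed to pair-RDD operations like groupByKey, reduceByKey, aggregateByKey, or repartitionAndSortWithinPartitions, as the examples below do with their own Partitioner implementations; production partitioners usually also override equals() and hashCode() so Spark can recognize co-partitioned RDDs and avoid unnecessary shuffles.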
Example 1
Source Project: systemds   Source File: MatrixIndexingSPInstruction.java    License: Apache License 2.0
/**
 * Wraps the input RDD into a PartitionPruningRDD, which acts as a filter
 * of required partitions. The distinct set of required partitions is determined
 * via the partitioner of the input RDD.
 * 
 * @param in input matrix as {@code JavaPairRDD<MatrixIndexes,MatrixBlock>}
 * @param filter partition filter
 * @return matrix as {@code JavaPairRDD<MatrixIndexes,MatrixBlock>}
 */
private static JavaPairRDD<MatrixIndexes,MatrixBlock> createPartitionPruningRDD( 
		JavaPairRDD<MatrixIndexes,MatrixBlock> in, List<MatrixIndexes> filter )
{
	//build hashset of required partition ids
	HashSet<Integer> flags = new HashSet<>();
	Partitioner partitioner = in.rdd().partitioner().get();
	for( MatrixIndexes key : filter )
		flags.add(partitioner.getPartition(key));

	//create partition pruning rdd
	Function1<Object,Object> f = new PartitionPruningFunction(flags);
	PartitionPruningRDD<Tuple2<MatrixIndexes, MatrixBlock>> ppRDD = 
			PartitionPruningRDD.create(in.rdd(), f);

	//wrap output into java pair rdd
	return new JavaPairRDD<>(ppRDD, 
			ClassManifestFactory.fromClass(MatrixIndexes.class), 
			ClassManifestFactory.fromClass(MatrixBlock.class));
}
 
Example 2
Source Project: hudi   Source File: HoodieBloomIndex.java    License: Apache License 2.0
/**
 * Find out <RowKey, filename> pairs. All workload is grouped at the file level.
 * <p>
 * Join PairRDD(PartitionPath, RecordKey) and PairRDD(PartitionPath, File) & then repartition such that each RDD
 * partition is a file, then for each file, we do (1) load bloom filter, (2) load rowKeys, (3) Tag rowKey
 * <p>
 * Make sure the parallelism is at least the group-by parallelism for tagging location
 */
JavaPairRDD<HoodieKey, HoodieRecordLocation> findMatchingFilesForRecordKeys(
    final Map<String, List<BloomIndexFileInfo>> partitionToFileIndexInfo,
    JavaPairRDD<String, String> partitionRecordKeyPairRDD, int shuffleParallelism, HoodieTable hoodieTable,
    Map<String, Long> fileGroupToComparisons) {
  JavaRDD<Tuple2<String, HoodieKey>> fileComparisonsRDD =
      explodeRecordRDDWithFileComparisons(partitionToFileIndexInfo, partitionRecordKeyPairRDD);

  if (config.useBloomIndexBucketizedChecking()) {
    Partitioner partitioner = new BucketizedBloomCheckPartitioner(shuffleParallelism, fileGroupToComparisons,
        config.getBloomIndexKeysPerBucket());

    fileComparisonsRDD = fileComparisonsRDD.mapToPair(t -> new Tuple2<>(Pair.of(t._1, t._2.getRecordKey()), t))
        .repartitionAndSortWithinPartitions(partitioner).map(Tuple2::_2);
  } else {
    fileComparisonsRDD = fileComparisonsRDD.sortBy(Tuple2::_1, true, shuffleParallelism);
  }

  return fileComparisonsRDD.mapPartitionsWithIndex(new HoodieBloomIndexCheckFunction(hoodieTable, config), true)
      .flatMap(List::iterator).filter(lr -> lr.getMatchingRecordKeys().size() > 0)
      .flatMapToPair(lookupResult -> lookupResult.getMatchingRecordKeys().stream()
          .map(recordKey -> new Tuple2<>(new HoodieKey(recordKey, lookupResult.getPartitionPath()),
              new HoodieRecordLocation(lookupResult.getBaseInstantTime(), lookupResult.getFileId())))
          .collect(Collectors.toList()).iterator());
}
 
Example 3
Source Project: hudi   Source File: BaseCommitActionExecutor.java    License: Apache License 2.0
@SuppressWarnings("unchecked")
protected Iterator<List<WriteStatus>> handleUpsertPartition(String instantTime, Integer partition, Iterator recordItr,
    Partitioner partitioner) {
  UpsertPartitioner upsertPartitioner = (UpsertPartitioner) partitioner;
  BucketInfo binfo = upsertPartitioner.getBucketInfo(partition);
  BucketType btype = binfo.bucketType;
  try {
    if (btype.equals(BucketType.INSERT)) {
      return handleInsert(binfo.fileIdPrefix, recordItr);
    } else if (btype.equals(BucketType.UPDATE)) {
      return handleUpdate(binfo.partitionPath, binfo.fileIdPrefix, recordItr);
    } else {
      throw new HoodieUpsertException("Unknown bucketType " + btype + " for partition :" + partition);
    }
  } catch (Throwable t) {
    String msg = "Error upserting bucketType " + btype + " for partition :" + partition;
    LOG.error(msg, t);
    throw new HoodieUpsertException(msg, t);
  }
}
 
Example 4
Source Project: beam   Source File: GroupCombineFunctions.java    License: Apache License 2.0
/**
 * An implementation of {@link
 * org.apache.beam.runners.core.GroupByKeyViaGroupByKeyOnly.GroupByKeyOnly} for the Spark runner.
 */
public static <K, V> JavaRDD<KV<K, Iterable<WindowedValue<V>>>> groupByKeyOnly(
    JavaRDD<WindowedValue<KV<K, V>>> rdd,
    Coder<K> keyCoder,
    WindowedValueCoder<V> wvCoder,
    @Nullable Partitioner partitioner) {
  // we use coders to convert objects in the PCollection to byte arrays, so they
  // can be transferred over the network for the shuffle.
  JavaPairRDD<ByteArray, byte[]> pairRDD =
      rdd.map(new ReifyTimestampsAndWindowsFunction<>())
          .mapToPair(TranslationUtils.toPairFunction())
          .mapToPair(CoderHelpers.toByteFunction(keyCoder, wvCoder));

  // If no partitioner is passed, the default group by key operation is called
  JavaPairRDD<ByteArray, Iterable<byte[]>> groupedRDD =
      (partitioner != null) ? pairRDD.groupByKey(partitioner) : pairRDD.groupByKey();

  return groupedRDD
      .mapToPair(CoderHelpers.fromByteFunctionIterable(keyCoder, wvCoder))
      .map(new TranslationUtils.FromPairFunction<>());
}
 
Example 5
Source Project: beam   Source File: GroupNonMergingWindowsFunctions.java    License: Apache License 2.0
/**
 * Creates a composite key of K and W and groups all values for that composite key with Spark's
 * repartitionAndSortWithinPartitions. The stream sorted by the composite key is transformed into a key
 * with an iterator of all values for that key (via {@link GroupByKeyIterator}).
 *
 * <p>repartitionAndSortWithinPartitions is used because, unlike GroupByKey, values are not collected
 * into memory at once but are streamed through an iterator, which minimizes memory pressure.
 */
static <K, V, W extends BoundedWindow>
    JavaRDD<WindowedValue<KV<K, Iterable<V>>>> groupByKeyAndWindow(
        JavaRDD<WindowedValue<KV<K, V>>> rdd,
        Coder<K> keyCoder,
        Coder<V> valueCoder,
        WindowingStrategy<?, W> windowingStrategy,
        Partitioner partitioner) {
  final Coder<W> windowCoder = windowingStrategy.getWindowFn().windowCoder();
  FullWindowedValueCoder<KV<K, V>> windowedKvCoder =
      WindowedValue.FullWindowedValueCoder.of(KvCoder.of(keyCoder, valueCoder), windowCoder);
  JavaPairRDD<ByteArray, byte[]> windowInKey =
      bringWindowToKey(
          rdd, keyCoder, windowCoder, wv -> CoderHelpers.toByteArray(wv, windowedKvCoder));
  return windowInKey
      .repartitionAndSortWithinPartitions(getPartitioner(partitioner, rdd))
      .mapPartitions(
          it -> new GroupByKeyIterator<>(it, keyCoder, windowingStrategy, windowedKvCoder))
      .filter(Objects::nonNull); // filter last null element from GroupByKeyIterator
}
 
Example 6
Source Project: systemds   Source File: ParamservUtils.java    License: Apache License 2.0
@SuppressWarnings("unchecked")
public static JavaPairRDD<Integer, Tuple2<MatrixBlock, MatrixBlock>> doPartitionOnSpark(SparkExecutionContext sec, MatrixObject features, MatrixObject labels, Statement.PSScheme scheme, int workerNum) {
	Timing tSetup = DMLScript.STATISTICS ? new Timing(true) : null;
	// Get input RDD
	JavaPairRDD<MatrixIndexes, MatrixBlock> featuresRDD = (JavaPairRDD<MatrixIndexes, MatrixBlock>)
		sec.getRDDHandleForMatrixObject(features, InputInfo.BinaryBlockInputInfo);
	JavaPairRDD<MatrixIndexes, MatrixBlock> labelsRDD = (JavaPairRDD<MatrixIndexes, MatrixBlock>)
		sec.getRDDHandleForMatrixObject(labels, InputInfo.BinaryBlockInputInfo);

	DataPartitionerSparkMapper mapper = new DataPartitionerSparkMapper(scheme, workerNum, sec, (int) features.getNumRows());
	JavaPairRDD<Integer, Tuple2<MatrixBlock, MatrixBlock>> result = ParamservUtils
		.assembleTrainingData(featuresRDD, labelsRDD) // Combine features and labels into a pair (rowBlockID => (features, labels))
		.flatMapToPair(mapper) // Do the data partitioning on Spark (workerID => (rowBlockID, (single-row features, single-row labels)))
		// Aggregate the partitioned matrix according to rowID for each worker,
		// i.e. (workerID => ordered list[(rowBlockID, (single-row features, single-row labels))])
		.aggregateByKey(new LinkedList<Tuple2<Long, Tuple2<MatrixBlock, MatrixBlock>>>(), new Partitioner() {
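			// The aggregation key is the worker ID, so worker i maps directly to partition i (one partition per worker)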
			private static final long serialVersionUID = -7937781374718031224L;
			@Override
			public int getPartition(Object workerID) {
				return (int) workerID;
			}
			@Override
			public int numPartitions() {
				return workerNum;
			}
		}, (list, input) -> {
			list.add(input);
			return list;
		}, (l1, l2) -> {
			l1.addAll(l2);
			l1.sort((o1, o2) -> o1._1.compareTo(o2._1));
			return l1;
		})
		.mapToPair(new DataPartitionerSparkAggregator(features.getNumColumns(), labels.getNumColumns()));

	if (DMLScript.STATISTICS)
		Statistics.accPSSetupTime((long) tSetup.stop());
	return result;
}
 
Example 7
Source Project: incubator-nemo   Source File: SparkJavaPairRDD.java    License: Apache License 2.0
@Override
public <C> SparkJavaPairRDD<K, C> combineByKey(final Function<V, C> createCombiner,
                                               final Function2<C, V, C> mergeValue,
                                               final Function2<C, C, C> mergeCombiners,
                                               final Partitioner partitioner,
                                               final boolean mapSideCombine,
                                               final Serializer serializer) {
  throw new UnsupportedOperationException(NOT_YET_SUPPORTED);
}
 
Example 8
Source Project: incubator-nemo   Source File: SparkJavaPairRDD.java    License: Apache License 2.0
@Override
public <C> SparkJavaPairRDD<K, C> combineByKey(final Function<V, C> createCombiner,
                                               final Function2<C, V, C> mergeValue,
                                               final Function2<C, C, C> mergeCombiners,
                                               final Partitioner partitioner) {
  throw new UnsupportedOperationException(NOT_YET_SUPPORTED);
}
 
Example 9
Source Project: incubator-nemo   Source File: SparkJavaPairRDD.java    License: Apache License 2.0
@Override
public <U> SparkJavaPairRDD<K, U> aggregateByKey(final U zeroValue,
                                                 final Partitioner partitioner,
                                                 final Function2<U, V, U> seqFunc,
                                                 final Function2<U, U, U> combFunc) {
  throw new UnsupportedOperationException(NOT_YET_SUPPORTED);
}
 
Example 10
Source Project: incubator-nemo   Source File: SparkJavaPairRDD.java    License: Apache License 2.0
@Override
public <W1, W2> SparkJavaPairRDD<K, Tuple3<Iterable<V>, Iterable<W1>, Iterable<W2>>>
cogroup(final org.apache.spark.api.java.JavaPairRDD<K, W1> other1,
        final org.apache.spark.api.java.JavaPairRDD<K, W2> other2,
        final Partitioner partitioner) {
  throw new UnsupportedOperationException(NOT_YET_SUPPORTED);
}
 
Example 11
Source Project: incubator-nemo   Source File: SparkJavaPairRDD.java    License: Apache License 2.0
@Override
public <W1, W2, W3> SparkJavaPairRDD<K, Tuple4<Iterable<V>, Iterable<W1>, Iterable<W2>, Iterable<W3>>>
cogroup(final org.apache.spark.api.java.JavaPairRDD<K, W1> other1,
        final org.apache.spark.api.java.JavaPairRDD<K, W2> other2,
        final org.apache.spark.api.java.JavaPairRDD<K, W3> other3,
        final Partitioner partitioner) {
  throw new UnsupportedOperationException(NOT_YET_SUPPORTED);
}
 
Example 12
Source Project: hudi   Source File: CommitActionExecutor.java    License: Apache License 2.0
@Override
public Partitioner getUpsertPartitioner(WorkloadProfile profile) {
  if (profile == null) {
    throw new HoodieUpsertException("Need workload profile to construct the upsert partitioner.");
  }
  return new UpsertPartitioner(profile, jsc, table, config);
}
 
Example 13
Source Project: hudi   Source File: BaseCommitActionExecutor.java    License: Apache License 2.0
public HoodieWriteMetadata execute(JavaRDD<HoodieRecord<T>> inputRecordsRDD) {
  HoodieWriteMetadata result = new HoodieWriteMetadata();
  // Cache the tagged records, so we don't end up computing both
  // TODO: Consistent contract in HoodieWriteClient regarding preppedRecord storage level handling
  if (inputRecordsRDD.getStorageLevel() == StorageLevel.NONE()) {
    inputRecordsRDD.persist(StorageLevel.MEMORY_AND_DISK_SER());
  } else {
    LOG.info("RDD PreppedRecords was persisted at: " + inputRecordsRDD.getStorageLevel());
  }

  WorkloadProfile profile = null;
  if (isWorkloadProfileNeeded()) {
    profile = new WorkloadProfile(inputRecordsRDD);
    LOG.info("Workload profile :" + profile);
    saveWorkloadProfileMetadataToInflight(profile, instantTime);
  }

  // partition using the insert partitioner
  final Partitioner partitioner = getPartitioner(profile);
  JavaRDD<HoodieRecord<T>> partitionedRecords = partition(inputRecordsRDD, partitioner);
  JavaRDD<WriteStatus> writeStatusRDD = partitionedRecords.mapPartitionsWithIndex((partition, recordItr) -> {
    if (WriteOperationType.isChangingRecords(operationType)) {
      return handleUpsertPartition(instantTime, partition, recordItr, partitioner);
    } else {
      return handleInsertPartition(instantTime, partition, recordItr, partitioner);
    }
  }, true).flatMap(List::iterator);

  updateIndexAndCommitIfNeeded(writeStatusRDD, result);
  return result;
}
 
Example 14
Source Project: hudi   Source File: BaseCommitActionExecutor.java    License: Apache License 2.0
private Partitioner getPartitioner(WorkloadProfile profile) {
  if (WriteOperationType.isChangingRecords(operationType)) {
    return getUpsertPartitioner(profile);
  } else {
    return getInsertPartitioner(profile);
  }
}
 
Example 15
Source Project: hudi   Source File: DeltaCommitActionExecutor.java    License: Apache License 2.0
@Override
public Partitioner getUpsertPartitioner(WorkloadProfile profile) {
  if (profile == null) {
    throw new HoodieUpsertException("Need workload profile to construct the upsert partitioner.");
  }
  mergeOnReadUpsertPartitioner = new UpsertDeltaCommitPartitioner(profile, jsc, table, config);
  return mergeOnReadUpsertPartitioner;
}
 
Example 16
Source Project: envelope   Source File: DataStep.java    License: Apache License 2.0
private Partitioner getPartitioner(JavaPairRDD<Row, Row> keyedArriving) {
  Config partitionerConfig;
  
  if (hasPartitioner()) {
    partitionerConfig = config.getConfig(PARTITIONER_TYPE);
  }
  else {
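    // No partitioner type configured: fall back to a range partitioner by default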
    partitionerConfig = ConfigFactory.empty().withValue(
        PartitionerFactory.TYPE_CONFIG_NAME, ConfigValueFactory.fromAnyRef("range"));
  }
  
  return PartitionerFactory.create(partitionerConfig, keyedArriving);
}
 
Example 17
Source Project: envelope   Source File: TestPartitionerFactory.java    License: Apache License 2.0
@Test
public void testHash() {
  Map<String, Object> configMap = Maps.newHashMap();
  configMap.put("type", "hash");
  
  JavaPairRDD<Row, Row> base = getDummyRDD(10);
  Config config = ConfigFactory.parseMap(configMap);
  Partitioner p = PartitionerFactory.create(config, base);
  
  assertTrue(p instanceof HashPartitioner);
  assertEquals(p.numPartitions(), 10);
}
 
Example 18
Source Project: envelope   Source File: TestPartitionerFactory.java    License: Apache License 2.0
@Test
public void testRange() {
  Map<String, Object> configMap = Maps.newHashMap();
  configMap.put("type", "range");
  
  JavaPairRDD<Row, Row> base = getDummyRDD(10);
  Config config = ConfigFactory.parseMap(configMap);
  Partitioner p = PartitionerFactory.create(config, base);
  
  assertTrue(p instanceof RangePartitioner);
  assertEquals(p.numPartitions(), 10);
}
 
Example 19
Source Project: beam   Source File: TransformTranslator.java    License: Apache License 2.0
private static <K, V, OutputT> JavaPairRDD<TupleTag<?>, WindowedValue<?>> statefulParDoTransform(
    KvCoder<K, V> kvCoder,
    Coder<? extends BoundedWindow> windowCoder,
    JavaRDD<WindowedValue<KV<K, V>>> kvInRDD,
    Partitioner partitioner,
    MultiDoFnFunction<KV<K, V>, OutputT> doFnFunction,
    boolean requiresSortedInput) {
  Coder<K> keyCoder = kvCoder.getKeyCoder();

  final WindowedValue.WindowedValueCoder<V> wvCoder =
      WindowedValue.FullWindowedValueCoder.of(kvCoder.getValueCoder(), windowCoder);

  if (!requiresSortedInput) {
    return GroupCombineFunctions.groupByKeyOnly(kvInRDD, keyCoder, wvCoder, partitioner)
        .map(
            input -> {
              final K key = input.getKey();
              Iterable<WindowedValue<V>> value = input.getValue();
              return FluentIterable.from(value)
                  .transform(
                      windowedValue ->
                          windowedValue.withValue(KV.of(key, windowedValue.getValue())))
                  .iterator();
            })
        .flatMapToPair(doFnFunction);
  }

  JavaPairRDD<ByteArray, byte[]> pairRDD =
      kvInRDD
          .map(new ReifyTimestampsAndWindowsFunction<>())
          .mapToPair(TranslationUtils.toPairFunction())
          .mapToPair(
              CoderHelpers.toByteFunctionWithTs(keyCoder, wvCoder, in -> in._2().getTimestamp()));

  JavaPairRDD<ByteArray, byte[]> sorted =
      pairRDD.repartitionAndSortWithinPartitions(keyPrefixPartitionerFrom(partitioner));

  return sorted.mapPartitionsToPair(wrapDoFnFromSortedRDD(doFnFunction, keyCoder, wvCoder));
}
 
Example 20
Source Project: beam   Source File: TransformTranslator.java    License: Apache License 2.0
private static Partitioner keyPrefixPartitionerFrom(Partitioner partitioner) {
  return new Partitioner() {
    @Override
    public int numPartitions() {
      return partitioner.numPartitions();
    }

    @Override
    public int getPartition(Object o) {
      ByteArray b = (ByteArray) o;
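      // Compute the partition from the key prefix only, dropping the trailing 8 bytes
      // (the encoded timestamp appended to the key bytes upstream), so all values for a
      // key land in the same partition regardless of their timestamps.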
      return partitioner.getPartition(
          new ByteArray(Arrays.copyOfRange(b.getValue(), 0, b.getValue().length - 8)));
    }
  };
}
 
Example 21
Source Project: beam   Source File: TransformTranslator.java    License: Apache License 2.0
@Nullable
private static Partitioner getPartitioner(EvaluationContext context) {
  Long bundleSize =
      context.getSerializableOptions().get().as(SparkPipelineOptions.class).getBundleSize();
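  // A configured bundle size means no explicit partitioner: returning null lets the
  // downstream group-by-key fall back to Spark's default partitioning.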
  return (bundleSize > 0)
      ? null
      : new HashPartitioner(context.getSparkContext().defaultParallelism());
}
 
Example 22
Source Project: beam   Source File: SparkBatchPortablePipelineTranslator.java    License: Apache License 2.0
private static <K, V> void translateGroupByKey(
    PTransformNode transformNode, RunnerApi.Pipeline pipeline, SparkTranslationContext context) {

  RunnerApi.Components components = pipeline.getComponents();
  String inputId = getInputId(transformNode);
  Dataset inputDataset = context.popDataset(inputId);
  JavaRDD<WindowedValue<KV<K, V>>> inputRdd = ((BoundedDataset<KV<K, V>>) inputDataset).getRDD();
  WindowedValueCoder<KV<K, V>> inputCoder = getWindowedValueCoder(inputId, components);
  KvCoder<K, V> inputKvCoder = (KvCoder<K, V>) inputCoder.getValueCoder();
  Coder<K> inputKeyCoder = inputKvCoder.getKeyCoder();
  Coder<V> inputValueCoder = inputKvCoder.getValueCoder();
  WindowingStrategy windowingStrategy = getWindowingStrategy(inputId, components);
  WindowFn<Object, BoundedWindow> windowFn = windowingStrategy.getWindowFn();
  WindowedValue.WindowedValueCoder<V> wvCoder =
      WindowedValue.FullWindowedValueCoder.of(inputValueCoder, windowFn.windowCoder());

  JavaRDD<WindowedValue<KV<K, Iterable<V>>>> groupedByKeyAndWindow;
  Partitioner partitioner = getPartitioner(context);
  if (GroupNonMergingWindowsFunctions.isEligibleForGroupByWindow(windowingStrategy)) {
    // we can have a memory sensitive translation for non-merging windows
    groupedByKeyAndWindow =
        GroupNonMergingWindowsFunctions.groupByKeyAndWindow(
            inputRdd, inputKeyCoder, inputValueCoder, windowingStrategy, partitioner);
  } else {
    JavaRDD<KV<K, Iterable<WindowedValue<V>>>> groupedByKeyOnly =
        GroupCombineFunctions.groupByKeyOnly(inputRdd, inputKeyCoder, wvCoder, partitioner);
    // for batch, GroupAlsoByWindow uses an in-memory StateInternals.
    groupedByKeyAndWindow =
        groupedByKeyOnly.flatMap(
            new SparkGroupAlsoByWindowViaOutputBufferFn<>(
                windowingStrategy,
                new TranslationUtils.InMemoryStateInternalsFactory<>(),
                SystemReduceFn.buffering(inputValueCoder),
                context.serializablePipelineOptions));
  }
  context.pushDataset(getOutputId(transformNode), new BoundedDataset<>(groupedByKeyAndWindow));
}
 
Example 23
Source Project: beam   Source File: SparkBatchPortablePipelineTranslator.java    License: Apache License 2.0
@Nullable
private static Partitioner getPartitioner(SparkTranslationContext context) {
  Long bundleSize =
      context.serializablePipelineOptions.get().as(SparkPipelineOptions.class).getBundleSize();
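  // A configured bundle size means no explicit partitioner: returning null lets the
  // downstream group-by-key fall back to Spark's default partitioning.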
  return (bundleSize > 0)
      ? null
      : new HashPartitioner(context.getSparkContext().defaultParallelism());
}
 
Example 24
Source Project: systemds   Source File: ParamservUtils.java    License: Apache License 2.0
@SuppressWarnings("unchecked")
public static JavaPairRDD<Integer, Tuple2<MatrixBlock, MatrixBlock>> doPartitionOnSpark(SparkExecutionContext sec, MatrixObject features, MatrixObject labels, Statement.PSScheme scheme, int workerNum) {
	Timing tSetup = DMLScript.STATISTICS ? new Timing(true) : null;
	// Get input RDD
	JavaPairRDD<MatrixIndexes, MatrixBlock> featuresRDD = (JavaPairRDD<MatrixIndexes, MatrixBlock>)
		sec.getRDDHandleForMatrixObject(features, FileFormat.BINARY);
	JavaPairRDD<MatrixIndexes, MatrixBlock> labelsRDD = (JavaPairRDD<MatrixIndexes, MatrixBlock>)
		sec.getRDDHandleForMatrixObject(labels, FileFormat.BINARY);

	DataPartitionerSparkMapper mapper = new DataPartitionerSparkMapper(scheme, workerNum, sec, (int) features.getNumRows());
	JavaPairRDD<Integer, Tuple2<MatrixBlock, MatrixBlock>> result = ParamservUtils
		.assembleTrainingData(featuresRDD, labelsRDD) // Combine features and labels into a pair (rowBlockID => (features, labels))
		.flatMapToPair(mapper) // Do the data partitioning on Spark (workerID => (rowBlockID, (single-row features, single-row labels)))
		// Aggregate the partitioned matrix according to rowID for each worker,
		// i.e. (workerID => ordered list[(rowBlockID, (single-row features, single-row labels))])
		.aggregateByKey(new LinkedList<Tuple2<Long, Tuple2<MatrixBlock, MatrixBlock>>>(), new Partitioner() {
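			// The aggregation key is the worker ID, so worker i maps directly to partition i (one partition per worker)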
			private static final long serialVersionUID = -7937781374718031224L;
			@Override
			public int getPartition(Object workerID) {
				return (int) workerID;
			}
			@Override
			public int numPartitions() {
				return workerNum;
			}
		}, (list, input) -> {
			list.add(input);
			return list;
		}, (l1, l2) -> {
			l1.addAll(l2);
			l1.sort((o1, o2) -> o1._1.compareTo(o2._1));
			return l1;
		})
		.mapToPair(new DataPartitionerSparkAggregator(features.getNumColumns(), labels.getNumColumns()));

	if (DMLScript.STATISTICS)
		Statistics.accPSSetupTime((long) tSetup.stop());
	return result;
}
 
Example 25
Source Project: incubator-nemo   Source File: SparkJavaRDD.java    License: Apache License 2.0
@Override
public org.apache.spark.api.java.Optional<Partitioner> partitioner() {
  throw new UnsupportedOperationException(NOT_YET_SUPPORTED);
}
 
Example 26
Source Project: incubator-nemo   Source File: SparkJavaPairRDD.java    License: Apache License 2.0
@Override
public SparkJavaPairRDD<K, V> reduceByKey(final Partitioner partitioner,
                                          final Function2<V, V, V> func) {
  throw new UnsupportedOperationException(NOT_YET_SUPPORTED);
}
 
Example 27
Source Project: incubator-nemo   Source File: SparkJavaPairRDD.java    License: Apache License 2.0
@Override
public SparkJavaPairRDD<K, V> foldByKey(final V zeroValue,
                                        final Partitioner partitioner,
                                        final Function2<V, V, V> func) {
  throw new UnsupportedOperationException(NOT_YET_SUPPORTED);
}
 
Example 28
Source Project: incubator-nemo   Source File: SparkJavaPairRDD.java    License: Apache License 2.0
@Override
public SparkJavaPairRDD<K, Iterable<V>> groupByKey(final Partitioner partitioner) {
  throw new UnsupportedOperationException(NOT_YET_SUPPORTED);
}
 
Example 29
Source Project: incubator-nemo   Source File: SparkJavaPairRDD.java    License: Apache License 2.0
@Override
public SparkJavaPairRDD<K, V> subtract(final org.apache.spark.api.java.JavaPairRDD<K, V> other,
                                       final Partitioner p) {
  throw new UnsupportedOperationException(NOT_YET_SUPPORTED);
}