Java Code Examples for org.apache.beam.sdk.coders.KvCoder#getKeyCoder()

The following examples show how to use org.apache.beam.sdk.coders.KvCoder#getKeyCoder(). You can vote up the examples you like or vote down the ones you don't, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: Combine.java    From beam with Apache License 2.0 6 votes vote down vote up
/**
 * Extracts a {@code KvCoder<K, InputT>} from the coder of the grouped input.
 *
 * <p>The input coder must be a {@code KvCoder} whose value coder is an {@code IterableCoder};
 * the result pairs the key coder with the iterable's element coder.
 *
 * @param inputCoder coder of the grouped {@code KV<K, Iterable<InputT>>} elements
 * @return a {@code KvCoder} over the key and the iterable's element type
 * @throws IllegalStateException if the input coder does not have the required shape
 */
private KvCoder<K, InputT> getKvCoder(
    Coder<? extends KV<K, ? extends Iterable<InputT>>> inputCoder) {
  if (!(inputCoder instanceof KvCoder)) {
    throw new IllegalStateException("Combine.GroupedValues requires its input to use KvCoder");
  }
  @SuppressWarnings({"unchecked", "rawtypes"})
  KvCoder<K, ? extends Iterable<InputT>> castKvCoder = (KvCoder) inputCoder;
  Coder<? extends Iterable<InputT>> valuesCoder = castKvCoder.getValueCoder();
  if (!(valuesCoder instanceof IterableCoder)) {
    throw new IllegalStateException(
        "Combine.GroupedValues requires its input values to use " + "IterableCoder");
  }
  @SuppressWarnings("unchecked")
  IterableCoder<InputT> iterableCoder = (IterableCoder<InputT>) valuesCoder;
  return KvCoder.of(castKvCoder.getKeyCoder(), iterableCoder.getElemCoder());
}
 
Example 2
Source File: GroupByKeyViaGroupByKeyOnly.java    From beam with Apache License 2.0 6 votes vote down vote up
/**
 * Unwraps the coder layers of the grouped-by-key-only input ({@code KvCoder} of key to
 * {@code IterableCoder} of {@code WindowedValueCoder}) and produces a primitive output whose
 * coder drops the per-value window metadata.
 */
@Override
public PCollection<KV<K, Iterable<V>>> expand(
    PCollection<KV<K, Iterable<WindowedValue<V>>>> input) {
  // Peel apart the input coder: KvCoder -> IterableCoder -> WindowedValueCoder.
  @SuppressWarnings("unchecked")
  KvCoder<K, Iterable<WindowedValue<V>>> inputKvCoder =
      (KvCoder<K, Iterable<WindowedValue<V>>>) input.getCoder();
  Coder<K> keyCoder = inputKvCoder.getKeyCoder();
  IterableCoder<WindowedValue<V>> windowedValuesCoder =
      (IterableCoder<WindowedValue<V>>) inputKvCoder.getValueCoder();
  WindowedValueCoder<V> elementWindowedCoder =
      (WindowedValueCoder<V>) windowedValuesCoder.getElemCoder();

  // Rebuild the KV coder with the bare value coder (window metadata stripped).
  Coder<V> bareValueCoder = elementWindowedCoder.getValueCoder();
  Coder<KV<K, Iterable<V>>> outputKvCoder =
      KvCoder.of(keyCoder, IterableCoder.of(bareValueCoder));

  return PCollection.createPrimitiveOutputInternal(
      input.getPipeline(), windowingStrategy, input.isBounded(), outputKvCoder);
}
 
Example 3
Source File: PartitioningShuffleReader.java    From beam with Apache License 2.0 6 votes vote down vote up
/**
 * Given a {@code WindowedValueCoder<KV<K, V>>}, splits it into a coder for K and a {@code
 * WindowedValueCoder<V>} with the same kind of windows.
 *
 * @throws Exception if the coder is not a {@code WindowedValueCoder} over a {@code KvCoder}
 */
private void initCoder(Coder<WindowedValue<KV<K, V>>> coder) throws Exception {
  if (!(coder instanceof WindowedValueCoder)) {
    throw new Exception("unexpected kind of coder for WindowedValue: " + coder);
  }
  WindowedValueCoder<KV<K, V>> windowedPairCoder = ((WindowedValueCoder<KV<K, V>>) coder);
  Coder<KV<K, V>> pairCoder = windowedPairCoder.getValueCoder();
  if (!(pairCoder instanceof KvCoder)) {
    throw new Exception(
        "unexpected kind of coder for elements read from "
            + "a key-partitioning shuffle: "
            + pairCoder);
  }
  @SuppressWarnings("unchecked")
  KvCoder<K, V> pairKvCoder = (KvCoder<K, V>) pairCoder;
  // Record the key coder, and re-wrap the value coder with the same windowing.
  this.keyCoder = pairKvCoder.getKeyCoder();
  windowedValueCoder = windowedPairCoder.withValueCoder(pairKvCoder.getValueCoder());
}
 
Example 4
Source File: KeyedPCollectionTuple.java    From beam with Apache License 2.0 5 votes vote down vote up
private static <K, V> Coder<K> getKeyCoder(PCollection<KV<K, V>> pc) {
  // TODO: This should already have run coder inference for output, but may not have been consumed
  // as input yet (and won't be fully specified); This is fine

  // Assumes that the PCollection uses a KvCoder.
  Coder<?> entryCoder = pc.getCoder();
  if (!(entryCoder instanceof KvCoder<?, ?>)) {
    throw new IllegalArgumentException("PCollection does not use a KvCoder");
  }
  @SuppressWarnings("unchecked")
  KvCoder<K, V> coder = (KvCoder<K, V>) entryCoder;
  return coder.getKeyCoder();
}
 
Example 5
Source File: GroupByKeyTranslatorBatch.java    From beam with Apache License 2.0 5 votes vote down vote up
/**
 * Translates a Beam {@code GroupByKey} into Twister2 batch TSet operations: elements are
 * serialized to (key bytes, windowed-value bytes) tuples, gathered by key, decoded, and then
 * grouped by window via an output-buffering reduce function.
 */
@Override
public void translateNode(GroupByKey<K, V> transform, Twister2BatchTranslationContext context) {
  PCollection<KV<K, V>> input = context.getInput(transform);
  BatchTSetImpl<WindowedValue<KV<K, V>>> inputTTSet = context.getInputDataSet(input);
  // NOTE(review): unconditional cast — presumably GBK inputs always carry a KvCoder; confirm.
  final KvCoder<K, V> coder = (KvCoder<K, V>) input.getCoder();
  Coder<K> inputKeyCoder = coder.getKeyCoder();
  WindowingStrategy windowingStrategy = input.getWindowingStrategy();
  WindowFn<KV<K, V>, BoundedWindow> windowFn =
      (WindowFn<KV<K, V>, BoundedWindow>) windowingStrategy.getWindowFn();
  // Values travel through the shuffle with their window metadata attached.
  final WindowedValue.WindowedValueCoder<V> wvCoder =
      WindowedValue.FullWindowedValueCoder.of(coder.getValueCoder(), windowFn.windowCoder());
  // Encode each element into a byte-array tuple for Twister2's keyed shuffle.
  KeyedTSet<byte[], byte[]> keyedTSet =
      inputTTSet.mapToTuple(new MapToTupleFunction<K, V>(inputKeyCoder, wvCoder));

  // todo add support for a partition function to be specified, this would use
  // todo keyedPartition function instead of KeyedGather
  ComputeTSet<KV<K, Iterable<WindowedValue<V>>>, Iterator<Tuple<byte[], Iterator<byte[]>>>>
      groupedbyKeyTset =
          keyedTSet.keyedGather().map(new ByteToWindowFunction(inputKeyCoder, wvCoder));

  // --- now group also by window.
  SystemReduceFnBuffering reduceFnBuffering = new SystemReduceFnBuffering(coder.getValueCoder());
  ComputeTSet<WindowedValue<KV<K, Iterable<V>>>, Iterable<KV<K, Iterator<WindowedValue<V>>>>>
      outputTset =
          groupedbyKeyTset
              .direct()
              .<WindowedValue<KV<K, Iterable<V>>>>flatmap(
                  new GroupByWindowFunction(
                      windowingStrategy, reduceFnBuffering, context.getOptions()));
  PCollection output = context.getOutput(transform);
  context.setOutputDataSet(output, outputTset);
}
 
Example 6
Source File: PCollectionViewTranslatorBatch.java    From beam with Apache License 2.0 5 votes vote down vote up
/**
 * Translates {@code View.CreatePCollectionView} for the Twister2 batch runner: every windowed
 * element is serialized, replicated to all workers via allGather, decoded, and registered as
 * the side-input data set for the view's tag.
 *
 * <p>NOTE(review): the input coder is cast unconditionally to a raw {@code KvCoder};
 * presumably view materialization guarantees KV-shaped input here — confirm upstream.
 */
@Override
public void translateNode(
    View.CreatePCollectionView<ElemT, ViewT> transform, Twister2BatchTranslationContext context) {
  BatchTSet<WindowedValue<ElemT>> inputDataSet =
      context.getInputDataSet(context.getInput(transform));
  @SuppressWarnings("unchecked")
  AppliedPTransform<
          PCollection<ElemT>,
          PCollection<ElemT>,
          PTransform<PCollection<ElemT>, PCollection<ElemT>>>
      application =
          (AppliedPTransform<
                  PCollection<ElemT>,
                  PCollection<ElemT>,
                  PTransform<PCollection<ElemT>, PCollection<ElemT>>>)
              context.getCurrentTransform();
  org.apache.beam.sdk.values.PCollectionView<ViewT> input;
  PCollection<ElemT> inputPCol = context.getInput(transform);
  final KvCoder coder = (KvCoder) inputPCol.getCoder();
  Coder inputKeyCoder = coder.getKeyCoder();
  WindowingStrategy windowingStrategy = inputPCol.getWindowingStrategy();
  WindowFn windowFn = windowingStrategy.getWindowFn();
  final WindowedValue.WindowedValueCoder wvCoder =
      WindowedValue.FullWindowedValueCoder.of(coder.getValueCoder(), windowFn.windowCoder());
  // Serialize to byte tuples, broadcast to every worker, then decode back.
  BatchTSet<WindowedValue<ElemT>> inputGathered =
      inputDataSet
          .direct()
          .map(new MapToTupleFunction<>(inputKeyCoder, wvCoder))
          .allGather()
          .map(new ByteToWindowFunctionPrimitive(inputKeyCoder, wvCoder));
  try {
    input = CreatePCollectionViewTranslation.getView(application);
  } catch (IOException e) {
    throw new RuntimeException(e);
  }
  context.setSideInputDataSet(input.getTagInternal().getId(), inputGathered);
}
 
Example 7
Source File: GroupByKeyTranslatorBatch.java    From beam with Apache License 2.0 5 votes vote down vote up
/**
 * Translates a GroupByKey for the Spark structured-streaming batch runner: groups by key with
 * a Beam-coder-backed encoder, then groups by window using an output-buffering
 * GroupAlsoByWindow function backed by in-memory state.
 */
@Override
public void translateTransform(
    PTransform<PCollection<KV<K, V>>, PCollection<KV<K, Iterable<V>>>> transform,
    TranslationContext context) {

  @SuppressWarnings("unchecked")
  final PCollection<KV<K, V>> inputPCollection = (PCollection<KV<K, V>>) context.getInput();
  Dataset<WindowedValue<KV<K, V>>> input = context.getDataset(inputPCollection);
  WindowingStrategy<?, ?> windowingStrategy = inputPCollection.getWindowingStrategy();
  // NOTE(review): unconditional cast — presumably GBK inputs always carry a KvCoder; confirm.
  KvCoder<K, V> kvCoder = (KvCoder<K, V>) inputPCollection.getCoder();
  Coder<V> valueCoder = kvCoder.getValueCoder();

  // group by key only
  Coder<K> keyCoder = kvCoder.getKeyCoder();
  KeyValueGroupedDataset<K, WindowedValue<KV<K, V>>> groupByKeyOnly =
      input.groupByKey(KVHelpers.extractKey(), EncoderHelpers.fromBeamCoder(keyCoder));

  // group also by windows
  WindowedValue.FullWindowedValueCoder<KV<K, Iterable<V>>> outputCoder =
      WindowedValue.FullWindowedValueCoder.of(
          KvCoder.of(keyCoder, IterableCoder.of(valueCoder)),
          windowingStrategy.getWindowFn().windowCoder());
  Dataset<WindowedValue<KV<K, Iterable<V>>>> output =
      groupByKeyOnly.flatMapGroups(
          new GroupAlsoByWindowViaOutputBufferFn<>(
              windowingStrategy,
              new InMemoryStateInternalsFactory<>(),
              SystemReduceFn.buffering(valueCoder),
              context.getSerializableOptions()),
          EncoderHelpers.fromBeamCoder(outputCoder));

  context.putDataset(context.getOutput(), output);
}
 
Example 8
Source File: TransformTranslator.java    From beam with Apache License 2.0 5 votes vote down vote up
/**
 * Builds the Spark RDD pipeline for a stateful ParDo. Elements are first grouped by key —
 * and, when the DoFn requires it, additionally sorted within partitions by timestamp — before
 * the DoFn runs.
 *
 * @param kvCoder coder of the input KV elements
 * @param windowCoder coder for the input's window type
 * @param kvInRDD windowed input elements
 * @param partitioner partitioner used for the key shuffle
 * @param doFnFunction the DoFn wrapper producing tagged outputs
 * @param requiresSortedInput whether values must be sorted by timestamp per key
 */
private static <K, V, OutputT> JavaPairRDD<TupleTag<?>, WindowedValue<?>> statefulParDoTransform(
    KvCoder<K, V> kvCoder,
    Coder<? extends BoundedWindow> windowCoder,
    JavaRDD<WindowedValue<KV<K, V>>> kvInRDD,
    Partitioner partitioner,
    MultiDoFnFunction<KV<K, V>, OutputT> doFnFunction,
    boolean requiresSortedInput) {
  Coder<K> keyCoder = kvCoder.getKeyCoder();

  final WindowedValue.WindowedValueCoder<V> wvCoder =
      WindowedValue.FullWindowedValueCoder.of(kvCoder.getValueCoder(), windowCoder);

  if (!requiresSortedInput) {
    // Unsorted path: group by key only, then re-pair each value with its key for the DoFn.
    return GroupCombineFunctions.groupByKeyOnly(kvInRDD, keyCoder, wvCoder, partitioner)
        .map(
            input -> {
              final K key = input.getKey();
              Iterable<WindowedValue<V>> value = input.getValue();
              return FluentIterable.from(value)
                  .transform(
                      windowedValue ->
                          windowedValue.withValue(KV.of(key, windowedValue.getValue())))
                  .iterator();
            })
        .flatMapToPair(doFnFunction);
  }

  // Sorted path: encode to bytes keyed by (key, timestamp) so Spark can sort within partitions.
  JavaPairRDD<ByteArray, byte[]> pairRDD =
      kvInRDD
          .map(new ReifyTimestampsAndWindowsFunction<>())
          .mapToPair(TranslationUtils.toPairFunction())
          .mapToPair(
              CoderHelpers.toByteFunctionWithTs(keyCoder, wvCoder, in -> in._2().getTimestamp()));

  JavaPairRDD<ByteArray, byte[]> sorted =
      pairRDD.repartitionAndSortWithinPartitions(keyPrefixPartitionerFrom(partitioner));

  return sorted.mapPartitionsToPair(wrapDoFnFromSortedRDD(doFnFunction, keyCoder, wvCoder));
}
 
Example 9
Source File: SparkBatchPortablePipelineTranslator.java    From beam with Apache License 2.0 5 votes vote down vote up
/**
 * Translates a portable GroupByKey node into Spark RDD operations, choosing a memory-friendly
 * group-by-key-and-window path for non-merging windows and falling back to
 * group-by-key-only + GroupAlsoByWindow otherwise.
 */
private static <K, V> void translateGroupByKey(
    PTransformNode transformNode, RunnerApi.Pipeline pipeline, SparkTranslationContext context) {

  RunnerApi.Components components = pipeline.getComponents();
  String inputId = getInputId(transformNode);
  Dataset inputDataset = context.popDataset(inputId);
  JavaRDD<WindowedValue<KV<K, V>>> inputRdd = ((BoundedDataset<KV<K, V>>) inputDataset).getRDD();
  WindowedValueCoder<KV<K, V>> inputCoder = getWindowedValueCoder(inputId, components);
  // NOTE(review): unconditional cast — presumably GBK inputs always carry a KvCoder; confirm.
  KvCoder<K, V> inputKvCoder = (KvCoder<K, V>) inputCoder.getValueCoder();
  Coder<K> inputKeyCoder = inputKvCoder.getKeyCoder();
  Coder<V> inputValueCoder = inputKvCoder.getValueCoder();
  WindowingStrategy windowingStrategy = getWindowingStrategy(inputId, components);
  WindowFn<Object, BoundedWindow> windowFn = windowingStrategy.getWindowFn();
  WindowedValue.WindowedValueCoder<V> wvCoder =
      WindowedValue.FullWindowedValueCoder.of(inputValueCoder, windowFn.windowCoder());

  JavaRDD<WindowedValue<KV<K, Iterable<V>>>> groupedByKeyAndWindow;
  Partitioner partitioner = getPartitioner(context);
  if (GroupNonMergingWindowsFunctions.isEligibleForGroupByWindow(windowingStrategy)) {
    // we can have a memory sensitive translation for non-merging windows
    groupedByKeyAndWindow =
        GroupNonMergingWindowsFunctions.groupByKeyAndWindow(
            inputRdd, inputKeyCoder, inputValueCoder, windowingStrategy, partitioner);
  } else {
    JavaRDD<KV<K, Iterable<WindowedValue<V>>>> groupedByKeyOnly =
        GroupCombineFunctions.groupByKeyOnly(inputRdd, inputKeyCoder, wvCoder, partitioner);
    // for batch, GroupAlsoByWindow uses an in-memory StateInternals.
    groupedByKeyAndWindow =
        groupedByKeyOnly.flatMap(
            new SparkGroupAlsoByWindowViaOutputBufferFn<>(
                windowingStrategy,
                new TranslationUtils.InMemoryStateInternalsFactory<>(),
                SystemReduceFn.buffering(inputValueCoder),
                context.serializablePipelineOptions));
  }
  context.pushDataset(getOutputId(transformNode), new BoundedDataset<>(groupedByKeyAndWindow));
}
 
Example 10
Source File: GroupingShuffleReader.java    From beam with Apache License 2.0 5 votes vote down vote up
/**
 * Splits a {@code WindowedValueCoder<KV<K, Iterable<V>>>} into its component coders: the key
 * coder plus either the iterable's element coder, or — when values are sorted — the
 * secondary-key and value coders of the KV elements inside the iterable.
 *
 * @param coder the full coder of elements read from the shuffle
 * @param valuesAreSorted whether values arrive as {@code KV<secondaryKey, value>} pairs
 * @throws Exception if any layer of the coder has an unexpected kind
 */
private void initCoder(Coder<WindowedValue<KV<K, Iterable<V>>>> coder, boolean valuesAreSorted)
    throws Exception {
  if (!(coder instanceof WindowedValueCoder)) {
    throw new Exception("unexpected kind of coder for WindowedValue: " + coder);
  }
  Coder<KV<K, Iterable<V>>> elemCoder =
      ((WindowedValueCoder<KV<K, Iterable<V>>>) coder).getValueCoder();
  if (!(elemCoder instanceof KvCoder)) {
    throw new Exception(
        "unexpected kind of coder for elements read from "
            + "a key-grouping shuffle: "
            + elemCoder);
  }

  @SuppressWarnings("unchecked")
  KvCoder<K, Iterable<V>> kvCoder = (KvCoder<K, Iterable<V>>) elemCoder;
  this.keyCoder = kvCoder.getKeyCoder();
  Coder<Iterable<V>> kvValueCoder = kvCoder.getValueCoder();
  if (!(kvValueCoder instanceof IterableCoder)) {
    throw new Exception(
        "unexpected kind of coder for values of KVs read from " + "a key-grouping shuffle");
  }
  IterableCoder<V> iterCoder = (IterableCoder<V>) kvValueCoder;
  if (valuesAreSorted) {
    // Sorted shuffles deliver values as KV<secondaryKey, value>; split that KvCoder further.
    checkState(
        iterCoder.getElemCoder() instanceof KvCoder,
        "unexpected kind of coder for elements read from a "
            + "key-grouping value sorting shuffle: %s",
        iterCoder.getElemCoder());
    @SuppressWarnings("rawtypes")
    KvCoder<?, ?> valueKvCoder = (KvCoder) iterCoder.getElemCoder();
    this.secondaryKeyCoder = valueKvCoder.getKeyCoder();
    this.valueCoder = valueKvCoder.getValueCoder();
  } else {
    this.valueCoder = iterCoder.getElemCoder();
  }
}
 
Example 11
Source File: BatchStatefulParDoOverrides.java    From beam with Apache License 2.0 5 votes vote down vote up
/**
 * Reifies each element's window/timestamp metadata, pairs it with its timestamp, then groups
 * by key with values sorted by timestamp. The intervening GBKO resets the windowing strategy,
 * so the input's original strategy is restored on the output.
 */
@Override
public PCollection<KV<K, Iterable<KV<Instant, WindowedValue<KV<K, V>>>>>> expand(
    PCollection<KV<K, V>> input) {

  WindowingStrategy<?, ?> inputWindowingStrategy = input.getWindowingStrategy();

  // A KvCoder is required since this goes through GBK. Further, WindowedValueCoder
  // is not registered by default, so we explicitly set the relevant coders.
  checkState(
      input.getCoder() instanceof KvCoder,
      "Input to a %s using state requires a %s, but the coder was %s",
      ParDo.class.getSimpleName(),
      KvCoder.class.getSimpleName(),
      input.getCoder());
  KvCoder<K, V> kvCoder = (KvCoder<K, V>) input.getCoder();
  Coder<K> keyCoder = kvCoder.getKeyCoder();
  Coder<? extends BoundedWindow> windowCoder =
      inputWindowingStrategy.getWindowFn().windowCoder();

  return input
      // Stash the original timestamps, etc, for when it is fed to the user's DoFn
      .apply("ReifyWindows", ParDo.of(new ReifyWindowedValueFn<>()))
      .setCoder(
          KvCoder.of(
              keyCoder,
              KvCoder.of(InstantCoder.of(), WindowedValue.getFullCoder(kvCoder, windowCoder))))

      // Group by key and sort by timestamp, dropping windows as they are reified
      .apply("PartitionKeys", new GroupByKeyAndSortValuesOnly<>())

      // The GBKO sets the windowing strategy to the global default
      .setWindowingStrategyInternal(inputWindowingStrategy);
}
 
Example 12
Source File: ParDoMultiOverrideFactory.java    From beam with Apache License 2.0 4 votes vote down vote up
/**
 * Rewrites the input into {@code KeyedWorkItem}s via a full GroupByKey, preserving each
 * element's original window/timestamp metadata inside the value and firing the GBK as fast
 * as possible so the stateful DoFn downstream effectively sees a stream of elements.
 */
@VisibleForTesting
PCollection<KeyedWorkItem<K, KV<K, InputT>>> groupToKeyedWorkItem(
    PCollection<KV<K, InputT>> input) {

  WindowingStrategy<?, ?> inputWindowingStrategy = input.getWindowingStrategy();

  // A KvCoder is required since this goes through GBK. Further, WindowedValueCoder
  // is not registered by default, so we explicitly set the relevant coders.
  checkState(
      input.getCoder() instanceof KvCoder,
      "Input to a %s using state requires a %s, but the coder was %s",
      ParDo.class.getSimpleName(),
      KvCoder.class.getSimpleName(),
      input.getCoder());

  KvCoder<K, InputT> kvCoder = (KvCoder<K, InputT>) input.getCoder();
  Coder<K> keyCoder = kvCoder.getKeyCoder();
  Coder<? extends BoundedWindow> windowCoder =
      inputWindowingStrategy.getWindowFn().windowCoder();

  return input
      // Stash the original timestamps, etc, for when it is fed to the user's DoFn
      .apply("Reify timestamps", ParDo.of(new ReifyWindowedValueFn<>()))
      .setCoder(KvCoder.of(keyCoder, WindowedValue.getFullCoder(kvCoder, windowCoder)))

      // We are going to GBK to gather keys and windows but otherwise do not want
      // to alter the flow of data. This entails:
      //  - trigger as fast as possible
      //  - maintain the full timestamps of elements
      //  - ensure this GBK holds to the minimum of those timestamps (via TimestampCombiner)
      //  - discard past panes as it is "just a stream" of elements
      .apply(
          Window.<KV<K, WindowedValue<KV<K, InputT>>>>configure()
              .triggering(Repeatedly.forever(AfterPane.elementCountAtLeast(1)))
              .discardingFiredPanes()
              .withAllowedLateness(inputWindowingStrategy.getAllowedLateness())
              .withTimestampCombiner(TimestampCombiner.EARLIEST))

      // A full GBK to group by key _and_ window
      .apply("Group by key", GroupByKey.create())

      // Adapt to KeyedWorkItem; that is how this runner delivers timers
      .apply("To KeyedWorkItem", ParDo.of(new ToKeyedWorkItem<>()))
      .setCoder(KeyedWorkItemCoder.of(keyCoder, kvCoder, windowCoder))

      // Because of the intervening GBK, we may have abused the windowing strategy
      // of the input, which should be transferred to the output in a straightforward manner
      // according to what ParDo already does.
      .setWindowingStrategyInternal(inputWindowingStrategy);
}
 
Example 13
Source File: FnApiStateAccessor.java    From beam with Apache License 2.0 4 votes vote down vote up
/**
 * Fetches the value of side input {@code view} in the window mapped from {@code window},
 * constructing the appropriate state-API accessor for the view's materialization and caching
 * the materialized value per state key.
 *
 * @return the value produced by the view's {@code ViewFn}, possibly from cache
 * @throws IllegalStateException for unknown side inputs, encoding failures, or unsupported
 *     materializations
 */
@Override
@Nullable
public <T> T get(PCollectionView<T> view, BoundedWindow window) {
  TupleTag<?> tag = view.getTagInternal();

  SideInputSpec sideInputSpec = sideInputSpecMap.get(tag);
  checkArgument(sideInputSpec != null, "Attempting to access unknown side input %s.", view);

  // Map the main-input window to the side-input window and encode it for the state key.
  ByteString.Output encodedWindowOut = ByteString.newOutput();
  try {
    sideInputSpec
        .getWindowCoder()
        .encode(sideInputSpec.getWindowMappingFn().getSideInputWindow(window), encodedWindowOut);
  } catch (IOException e) {
    throw new IllegalStateException(e);
  }
  ByteString encodedWindow = encodedWindowOut.toByteString();
  StateKey.Builder cacheKeyBuilder = StateKey.newBuilder();
  Object sideInputAccessor;

  switch (sideInputSpec.getAccessPattern()) {
    case Materializations.ITERABLE_MATERIALIZATION_URN:
      // Iterable materialization: accessor streams raw elements for the window.
      cacheKeyBuilder
          .getIterableSideInputBuilder()
          .setTransformId(ptransformId)
          .setSideInputId(tag.getId())
          .setWindow(encodedWindow);
      sideInputAccessor =
          new IterableSideInput<>(
              beamFnStateClient,
              processBundleInstructionId.get(),
              ptransformId,
              tag.getId(),
              encodedWindow,
              sideInputSpec.getCoder());
      break;

    case Materializations.MULTIMAP_MATERIALIZATION_URN:
      // Multimap materialization requires a KvCoder so keys and values can be split.
      checkState(
          sideInputSpec.getCoder() instanceof KvCoder,
          "Expected %s but received %s.",
          KvCoder.class,
          sideInputSpec.getCoder().getClass());
      KvCoder<?, ?> kvCoder = (KvCoder) sideInputSpec.getCoder();
      cacheKeyBuilder
          .getMultimapSideInputBuilder()
          .setTransformId(ptransformId)
          .setSideInputId(tag.getId())
          .setWindow(encodedWindow);
      sideInputAccessor =
          new MultimapSideInput<>(
              beamFnStateClient,
              processBundleInstructionId.get(),
              ptransformId,
              tag.getId(),
              encodedWindow,
              kvCoder.getKeyCoder(),
              kvCoder.getValueCoder());
      break;

    default:
      throw new IllegalStateException(
          String.format(
              "This SDK is only capable of dealing with %s materializations "
                  + "but was asked to handle %s for PCollectionView with tag %s.",
              ImmutableList.of(
                  Materializations.ITERABLE_MATERIALIZATION_URN,
                  Materializations.MULTIMAP_MATERIALIZATION_URN),
              sideInputSpec.getAccessPattern(),
              tag));
  }

  // Cache the materialized view value keyed by (transform, side input, window).
  return (T)
      stateKeyObjectCache.computeIfAbsent(
          cacheKeyBuilder.build(), key -> sideInputSpec.getViewFn().apply(sideInputAccessor));
}
 
Example 14
Source File: DataflowSideInputHandlerFactory.java    From beam with Apache License 2.0 4 votes vote down vote up
/**
 * Creates a multimap side-input handler for the given transform and side input, after
 * validating that the side input is known, materialized as a multimap, and coded with a
 * {@code KvCoder}.
 */
@Override
public <K, V, W extends BoundedWindow> MultimapSideInputHandler<K, V, W> forMultimapSideInput(
    String pTransformId, String sideInputId, KvCoder<K, V> elementCoder, Coder<W> windowCoder) {
  checkArgument(
      pTransformId != null && pTransformId.length() > 0, "Expect a valid PTransform ID.");

  SideInputReader sideInputReader = ptransformIdToSideInputReader.get(pTransformId);
  checkState(sideInputReader != null, String.format("Unknown PTransform '%s'", pTransformId));

  // Resolve the PCollectionView registered for this (transform, side input) pair.
  RunnerApi.ExecutableStagePayload.SideInputId lookupKey =
      RunnerApi.ExecutableStagePayload.SideInputId.newBuilder()
          .setTransformId(pTransformId)
          .setLocalName(sideInputId)
          .build();
  PCollectionView<Materializations.MultimapView<Object, Object>> view =
      (PCollectionView<Materializations.MultimapView<Object, Object>>)
          sideInputIdToPCollectionViewMap.get(lookupKey);
  checkState(
      view != null,
      String.format("Unknown side input '%s' on PTransform '%s'", sideInputId, pTransformId));

  checkState(
      Materializations.MULTIMAP_MATERIALIZATION_URN.equals(
          view.getViewFn().getMaterialization().getUrn()),
      String.format(
          "Unknown materialization for side input '%s' on PTransform '%s' with urn '%s'",
          sideInputId, pTransformId, view.getViewFn().getMaterialization().getUrn()));

  checkState(
      view.getCoderInternal() instanceof KvCoder,
      String.format(
          "Materialization of side input '%s' on PTransform '%s' expects %s but received %s.",
          sideInputId,
          pTransformId,
          KvCoder.class.getSimpleName(),
          view.getCoderInternal().getClass().getSimpleName()));

  return new DataflowMultimapSideInputHandler<>(
      sideInputReader,
      view,
      elementCoder.getKeyCoder(),
      elementCoder.getValueCoder(),
      windowCoder);
}
 
Example 15
Source File: ShuffleSink.java    From beam with Apache License 2.0 4 votes vote down vote up
/**
 * Configures this sink's sharding/grouping/sorting flags and component coders from the
 * element coder, according to the shuffle kind.
 *
 * @throws Exception if the coder shape does not match what the shuffle kind requires
 */
private void initCoder(Coder<WindowedValue<T>> coder) throws Exception {
  // Each shuffle kind fixes which of sharding, grouping and sorting are in effect.
  switch (shuffleKind) {
    case UNGROUPED:
      this.shardByKey = false;
      this.groupValues = false;
      this.sortValues = false;
      break;
    case PARTITION_KEYS:
      this.shardByKey = true;
      this.groupValues = false;
      this.sortValues = false;
      break;
    case GROUP_KEYS:
      this.shardByKey = true;
      this.groupValues = true;
      this.sortValues = false;
      break;
    case GROUP_KEYS_AND_SORT_VALUES:
      this.shardByKey = true;
      this.groupValues = true;
      this.sortValues = true;
      break;
    default:
      throw new AssertionError("unexpected shuffle kind");
  }

  this.windowedElemCoder = (WindowedValueCoder<T>) coder;
  this.elemCoder = windowedElemCoder.getValueCoder();
  if (shardByKey) {
    // Key-sharded shuffles require KV elements so the key coder can be split out.
    if (!(elemCoder instanceof KvCoder)) {
      throw new Exception(
          String.format(
              "Unexpected kind of coder for elements written to a key-grouping shuffle %s.",
              elemCoder));
    }
    KvCoder<?, ?> kvCoder = (KvCoder<?, ?>) elemCoder;
    this.keyCoder = kvCoder.getKeyCoder();
    this.valueCoder = kvCoder.getValueCoder();
    if (sortValues) {
      // TODO: Decide the representation of sort-keyed values.
      // For now, we'll just use KVs.
      if (!(valueCoder instanceof KvCoder)) {
        throw new Exception(
            String.format(
                "Unexpected kind of coder for values written to a value-sorting shuffle %s.",
                valueCoder));
      }
      KvCoder<?, ?> kvValueCoder = (KvCoder<?, ?>) valueCoder;
      this.sortKeyCoder = kvValueCoder.getKeyCoder();
      this.sortValueCoder = kvValueCoder.getValueCoder();
    } else {
      this.sortKeyCoder = null;
      this.sortValueCoder = null;
    }
    if (groupValues) {
      this.windowedValueCoder = null;
    } else {
      this.windowedValueCoder = this.windowedElemCoder.withValueCoder(this.valueCoder);
    }
  } else {
    // Ungrouped shuffle: no key/value decomposition is performed.
    this.keyCoder = null;
    this.valueCoder = null;
    this.sortKeyCoder = null;
    this.sortValueCoder = null;
    this.windowedValueCoder = null;
  }
}
 
Example 16
Source File: FlinkStreamingPortablePipelineTranslator.java    From beam with Apache License 2.0 4 votes vote down vote up
/**
 * Adds a GroupByKey to the Flink streaming job: wraps each element as a keyed work item,
 * keys the stream by the encoded key, and applies a {@code WindowDoFnOperator} that buffers
 * values per key and window.
 *
 * @param inputDataStream the windowed KV input stream
 * @param windowingStrategy windowing strategy of the input
 * @param windowedInputCoder full coder of the windowed input elements
 * @param operatorName name to give the resulting Flink operator
 * @param context translation context providing pipeline options
 * @return the grouped output stream
 */
private <K, V> SingleOutputStreamOperator<WindowedValue<KV<K, Iterable<V>>>> addGBK(
    DataStream<WindowedValue<KV<K, V>>> inputDataStream,
    WindowingStrategy<?, ?> windowingStrategy,
    WindowedValueCoder<KV<K, V>> windowedInputCoder,
    String operatorName,
    StreamingTranslationContext context) {
  KvCoder<K, V> inputElementCoder = (KvCoder<K, V>) windowedInputCoder.getValueCoder();

  SingletonKeyedWorkItemCoder<K, V> workItemCoder =
      SingletonKeyedWorkItemCoder.of(
          inputElementCoder.getKeyCoder(),
          inputElementCoder.getValueCoder(),
          windowingStrategy.getWindowFn().windowCoder());

  WindowedValue.FullWindowedValueCoder<SingletonKeyedWorkItem<K, V>> windowedWorkItemCoder =
      WindowedValue.getFullCoder(workItemCoder, windowingStrategy.getWindowFn().windowCoder());

  CoderTypeInformation<WindowedValue<SingletonKeyedWorkItem<K, V>>> workItemTypeInfo =
      new CoderTypeInformation<>(windowedWorkItemCoder);

  // Wrap each element into a single-element keyed work item.
  DataStream<WindowedValue<SingletonKeyedWorkItem<K, V>>> workItemStream =
      inputDataStream
          .flatMap(
              new FlinkStreamingTransformTranslators.ToKeyedWorkItem<>(
                  context.getPipelineOptions()))
          .returns(workItemTypeInfo)
          .name("ToKeyedWorkItem");

  WorkItemKeySelector<K, V> keySelector =
      new WorkItemKeySelector<>(inputElementCoder.getKeyCoder());

  KeyedStream<WindowedValue<SingletonKeyedWorkItem<K, V>>, ByteBuffer> keyedWorkItemStream =
      workItemStream.keyBy(keySelector);

  // Buffer values per key/window until the window fires.
  SystemReduceFn<K, V, Iterable<V>, Iterable<V>, BoundedWindow> reduceFn =
      SystemReduceFn.buffering(inputElementCoder.getValueCoder());

  Coder<Iterable<V>> accumulatorCoder = IterableCoder.of(inputElementCoder.getValueCoder());

  Coder<WindowedValue<KV<K, Iterable<V>>>> outputCoder =
      WindowedValue.getFullCoder(
          KvCoder.of(inputElementCoder.getKeyCoder(), accumulatorCoder),
          windowingStrategy.getWindowFn().windowCoder());

  TypeInformation<WindowedValue<KV<K, Iterable<V>>>> outputTypeInfo =
      new CoderTypeInformation<>(outputCoder);

  TupleTag<KV<K, Iterable<V>>> mainTag = new TupleTag<>("main output");

  WindowDoFnOperator<K, V, Iterable<V>> doFnOperator =
      new WindowDoFnOperator<>(
          reduceFn,
          operatorName,
          (Coder) windowedWorkItemCoder,
          mainTag,
          Collections.emptyList(),
          new DoFnOperator.MultiOutputOutputManagerFactory(mainTag, outputCoder),
          windowingStrategy,
          new HashMap<>(), /* side-input mapping */
          Collections.emptyList(), /* side inputs */
          context.getPipelineOptions(),
          inputElementCoder.getKeyCoder(),
          (KeySelector) keySelector /* key selector */);

  SingleOutputStreamOperator<WindowedValue<KV<K, Iterable<V>>>> outputDataStream =
      keyedWorkItemStream.transform(
          operatorName, outputTypeInfo, (OneInputStreamOperator) doFnOperator);

  return outputDataStream;
}
 
Example 17
Source File: FlinkStreamingTransformTranslators.java    From beam with Apache License 2.0 4 votes vote down vote up
/**
 * Translates a streaming GroupByKey into a Flink keyed stream feeding a
 * {@code WindowDoFnOperator} that buffers values per key and window.
 *
 * <p>Fix: the {@code WorkItemKeySelector} is now constructed once with its type parameters
 * and reused both for {@code keyBy} and for the operator, instead of building a second
 * identical instance for the {@code keyBy} call.
 */
@Override
public void translateNode(
    PTransform<PCollection<KV<K, InputT>>, PCollection<KV<K, Iterable<InputT>>>> transform,
    FlinkStreamingTranslationContext context) {

  PCollection<KV<K, InputT>> input = context.getInput(transform);

  @SuppressWarnings("unchecked")
  WindowingStrategy<?, BoundedWindow> windowingStrategy =
      (WindowingStrategy<?, BoundedWindow>) input.getWindowingStrategy();

  // GBK input is required to use a KvCoder.
  KvCoder<K, InputT> inputKvCoder = (KvCoder<K, InputT>) input.getCoder();

  SingletonKeyedWorkItemCoder<K, InputT> workItemCoder =
      SingletonKeyedWorkItemCoder.of(
          inputKvCoder.getKeyCoder(),
          inputKvCoder.getValueCoder(),
          input.getWindowingStrategy().getWindowFn().windowCoder());

  DataStream<WindowedValue<KV<K, InputT>>> inputDataStream = context.getInputDataStream(input);

  WindowedValue.FullWindowedValueCoder<SingletonKeyedWorkItem<K, InputT>>
      windowedWorkItemCoder =
          WindowedValue.getFullCoder(
              workItemCoder, input.getWindowingStrategy().getWindowFn().windowCoder());

  CoderTypeInformation<WindowedValue<SingletonKeyedWorkItem<K, InputT>>> workItemTypeInfo =
      new CoderTypeInformation<>(windowedWorkItemCoder);

  // Wrap each element into a single-element keyed work item.
  DataStream<WindowedValue<SingletonKeyedWorkItem<K, InputT>>> workItemStream =
      inputDataStream
          .flatMap(new ToKeyedWorkItem<>(context.getPipelineOptions()))
          .returns(workItemTypeInfo)
          .name("ToKeyedWorkItem");

  // Build the key selector once; reuse it for keyBy and for the operator below.
  WorkItemKeySelector<K, InputT> keySelector =
      new WorkItemKeySelector<>(inputKvCoder.getKeyCoder());

  KeyedStream<WindowedValue<SingletonKeyedWorkItem<K, InputT>>, ByteBuffer>
      keyedWorkItemStream = workItemStream.keyBy(keySelector);

  // Buffer values per key/window until the window fires.
  SystemReduceFn<K, InputT, Iterable<InputT>, Iterable<InputT>, BoundedWindow> reduceFn =
      SystemReduceFn.buffering(inputKvCoder.getValueCoder());

  Coder<WindowedValue<KV<K, Iterable<InputT>>>> outputCoder =
      context.getWindowedInputCoder(context.getOutput(transform));
  TypeInformation<WindowedValue<KV<K, Iterable<InputT>>>> outputTypeInfo =
      context.getTypeInfo(context.getOutput(transform));

  TupleTag<KV<K, Iterable<InputT>>> mainTag = new TupleTag<>("main output");

  String fullName = getCurrentTransformName(context);
  WindowDoFnOperator<K, InputT, Iterable<InputT>> doFnOperator =
      new WindowDoFnOperator<>(
          reduceFn,
          fullName,
          (Coder) windowedWorkItemCoder,
          mainTag,
          Collections.emptyList(),
          new DoFnOperator.MultiOutputOutputManagerFactory<>(mainTag, outputCoder),
          windowingStrategy,
          new HashMap<>(), /* side-input mapping */
          Collections.emptyList(), /* side inputs */
          context.getPipelineOptions(),
          inputKvCoder.getKeyCoder(),
          keySelector);

  // our operator expects WindowedValue<KeyedWorkItem> while our input stream
  // is WindowedValue<SingletonKeyedWorkItem>, which is fine but Java doesn't like it ...
  @SuppressWarnings("unchecked")
  SingleOutputStreamOperator<WindowedValue<KV<K, Iterable<InputT>>>> outDataStream =
      keyedWorkItemStream
          .transform(fullName, outputTypeInfo, (OneInputStreamOperator) doFnOperator)
          .uid(fullName);

  context.setOutputDataStream(context.getOutput(transform), outDataStream);
}
 
Example 18
Source File: TransformTranslator.java    From beam with Apache License 2.0 4 votes vote down vote up
/**
 * Builds a {@link TransformEvaluator} that translates a Beam {@link GroupByKey} into Spark RDD
 * operations: a group-by-key step followed by a group-by-window step.
 */
private static <K, V, W extends BoundedWindow> TransformEvaluator<GroupByKey<K, V>> groupByKey() {
  return new TransformEvaluator<GroupByKey<K, V>>() {
    @Override
    public void evaluate(GroupByKey<K, V> transform, EvaluationContext context) {
      @SuppressWarnings("unchecked")
      JavaRDD<WindowedValue<KV<K, V>>> windowedInput =
          ((BoundedDataset<KV<K, V>>) context.borrowDataset(transform)).getRDD();
      final KvCoder<K, V> inputKvCoder = (KvCoder<K, V>) context.getInput(transform).getCoder();
      @SuppressWarnings("unchecked")
      final WindowingStrategy<?, W> windowingStrategy =
          (WindowingStrategy<?, W>) context.getInput(transform).getWindowingStrategy();
      @SuppressWarnings("unchecked")
      final WindowFn<Object, W> windowFn = (WindowFn<Object, W>) windowingStrategy.getWindowFn();

      // --- coders for the key and for full windowed values.
      final Coder<K> keyCoder = inputKvCoder.getKeyCoder();
      final Coder<V> valueCoder = inputKvCoder.getValueCoder();
      final WindowedValue.WindowedValueCoder<V> windowedValueCoder =
          WindowedValue.FullWindowedValueCoder.of(valueCoder, windowFn.windowCoder());

      final Partitioner partitioner = getPartitioner(context);
      JavaRDD<WindowedValue<KV<K, Iterable<V>>>> grouped;
      if (!GroupNonMergingWindowsFunctions.isEligibleForGroupByWindow(windowingStrategy)) {
        // General path: first group by key only ...
        JavaRDD<KV<K, Iterable<WindowedValue<V>>>> byKeyOnly =
            GroupCombineFunctions.groupByKeyOnly(
                windowedInput, keyCoder, windowedValueCoder, partitioner);

        // ... then also group by window. In batch mode GroupAlsoByWindow uses an
        // in-memory StateInternals.
        grouped =
            byKeyOnly.flatMap(
                new SparkGroupAlsoByWindowViaOutputBufferFn<>(
                    windowingStrategy,
                    new TranslationUtils.InMemoryStateInternalsFactory<>(),
                    SystemReduceFn.buffering(valueCoder),
                    context.getSerializableOptions()));
      } else {
        // Non-merging windows admit a memory-sensitive group-by-key-and-window translation.
        grouped =
            GroupNonMergingWindowsFunctions.groupByKeyAndWindow(
                windowedInput, keyCoder, valueCoder, windowingStrategy, partitioner);
      }
      context.putDataset(transform, new BoundedDataset<>(grouped));
    }

    @Override
    public String toNativeString() {
      return "groupByKey()";
    }
  };
}
 
Example 19
Source File: CombinePerKeyTranslatorBatch.java — from the Apache Beam project (Apache License 2.0).
/**
 * Translates a {@link Combine.PerKey} into a Spark structured-streaming batch aggregation:
 * groups the input dataset by (encoded) key, combines the values per key with an
 * {@link AggregatorCombiner}, then flattens the combined windowed values back into
 * windowed {@code KV<K, OutputT>} elements and registers the result dataset.
 */
@Override
public void translateTransform(
    PTransform<PCollection<KV<K, InputT>>, PCollection<KV<K, OutputT>>> transform,
    TranslationContext context) {

  Combine.PerKey combineTransform = (Combine.PerKey) transform;
  @SuppressWarnings("unchecked")
  final PCollection<KV<K, InputT>> input = (PCollection<KV<K, InputT>>) context.getInput();
  @SuppressWarnings("unchecked")
  final PCollection<KV<K, OutputT>> output = (PCollection<KV<K, OutputT>>) context.getOutput();
  @SuppressWarnings("unchecked")
  final Combine.CombineFn<InputT, AccumT, OutputT> combineFn =
      (Combine.CombineFn<InputT, AccumT, OutputT>) combineTransform.getFn();
  WindowingStrategy<?, ?> windowingStrategy = input.getWindowingStrategy();

  Dataset<WindowedValue<KV<K, InputT>>> inputDataset = context.getDataset(input);

  // Coders: key coder drives the groupByKey encoder; output value coder feeds the combiner.
  KvCoder<K, InputT> inputCoder = (KvCoder<K, InputT>) input.getCoder();
  Coder<K> keyCoder = inputCoder.getKeyCoder();
  KvCoder<K, OutputT> outputKVCoder = (KvCoder<K, OutputT>) output.getCoder();
  Coder<OutputT> outputCoder = outputKVCoder.getValueCoder();

  KeyValueGroupedDataset<K, WindowedValue<KV<K, InputT>>> groupedDataset =
      inputDataset.groupByKey(KVHelpers.extractKey(), EncoderHelpers.fromBeamCoder(keyCoder));

  // Resolve the accumulator coder up front; translation cannot proceed without it,
  // so a lookup failure is surfaced as an unchecked exception (blank final avoids a
  // misleading null initialization).
  final Coder<AccumT> accumulatorCoder;
  try {
    accumulatorCoder =
        combineFn.getAccumulatorCoder(
            input.getPipeline().getCoderRegistry(), inputCoder.getValueCoder());
  } catch (CannotProvideCoderException e) {
    throw new RuntimeException(e);
  }

  Dataset<Tuple2<K, Iterable<WindowedValue<OutputT>>>> combinedDataset =
      groupedDataset.agg(
          new AggregatorCombiner<K, InputT, AccumT, OutputT, BoundedWindow>(
                  combineFn, windowingStrategy, accumulatorCoder, outputCoder)
              .toColumn());

  // Expand each per-key iterable into separate elements and put the key back into the elements,
  // preserving each value's original timestamp, windows and pane.
  WindowedValue.WindowedValueCoder<KV<K, OutputT>> wvCoder =
      WindowedValue.FullWindowedValueCoder.of(
          outputKVCoder, input.getWindowingStrategy().getWindowFn().windowCoder());
  Dataset<WindowedValue<KV<K, OutputT>>> outputDataset =
      combinedDataset.flatMap(
          (FlatMapFunction<
                  Tuple2<K, Iterable<WindowedValue<OutputT>>>, WindowedValue<KV<K, OutputT>>>)
              tuple2 -> {
                K key = tuple2._1();
                Iterable<WindowedValue<OutputT>> windowedValues = tuple2._2();
                List<WindowedValue<KV<K, OutputT>>> result = new ArrayList<>();
                for (WindowedValue<OutputT> windowedValue : windowedValues) {
                  KV<K, OutputT> kv = KV.of(key, windowedValue.getValue());
                  result.add(
                      WindowedValue.of(
                          kv,
                          windowedValue.getTimestamp(),
                          windowedValue.getWindows(),
                          windowedValue.getPane()));
                }
                return result.iterator();
              },
          EncoderHelpers.fromBeamCoder(wvCoder));
  context.putDataset(output, outputDataset);
}
 
Example 20
Source File: CombineRunners.java — from the Apache Beam project (Apache License 2.0).
/**
 * Creates and wires up a {@link PrecombineRunner} for the given pre-combine PTransform.
 *
 * <p>Rehydrates the coders/windowing strategies needed from the proto components, extracts the
 * key and accumulator coders, deserializes the {@link CombineFn} from the transform's payload,
 * and registers start/process/finish handlers with the supplied registries.
 *
 * @throws IOException if a coder cannot be rehydrated or the combine payload cannot be parsed
 */
@Override
public PrecombineRunner<KeyT, InputT, AccumT> createRunnerForPTransform(
    PipelineOptions pipelineOptions,
    BeamFnDataClient beamFnDataClient,
    BeamFnStateClient beamFnStateClient,
    BeamFnTimerClient beamFnTimerClient,
    String pTransformId,
    PTransform pTransform,
    Supplier<String> processBundleInstructionId,
    Map<String, PCollection> pCollections,
    Map<String, RunnerApi.Coder> coders,
    Map<String, RunnerApi.WindowingStrategy> windowingStrategies,
    PCollectionConsumerRegistry pCollectionConsumerRegistry,
    PTransformFunctionRegistry startFunctionRegistry,
    PTransformFunctionRegistry finishFunctionRegistry,
    Consumer<ThrowingRunnable> tearDownFunctions,
    Consumer<ProgressRequestCallback> addProgressRequestCallback,
    BundleSplitListener splitListener,
    BundleFinalizer bundleFinalizer)
    throws IOException {
  // Get objects needed to create the runner.
  RehydratedComponents rehydratedComponents =
      RehydratedComponents.forComponents(
          RunnerApi.Components.newBuilder()
              .putAllCoders(coders)
              .putAllWindowingStrategies(windowingStrategies)
              .build());
  String mainInputTag = Iterables.getOnlyElement(pTransform.getInputsMap().keySet());
  RunnerApi.PCollection mainInput = pCollections.get(pTransform.getInputsOrThrow(mainInputTag));

  // Input coder may sometimes be WindowedValueCoder depending on runner, instead of the
  // expected KvCoder.
  Coder<?> uncastInputCoder = rehydratedComponents.getCoder(mainInput.getCoderId());
  KvCoder<KeyT, InputT> inputCoder;
  if (uncastInputCoder instanceof WindowedValueCoder) {
    inputCoder =
        (KvCoder<KeyT, InputT>)
            ((WindowedValueCoder<KV<KeyT, InputT>>) uncastInputCoder).getValueCoder();
  } else {
    // Reuse the coder already rehydrated above instead of looking it up a second time.
    inputCoder = (KvCoder<KeyT, InputT>) uncastInputCoder;
  }
  Coder<KeyT> keyCoder = inputCoder.getKeyCoder();

  CombinePayload combinePayload = CombinePayload.parseFrom(pTransform.getSpec().getPayload());
  CombineFn<InputT, AccumT, ?> combineFn =
      (CombineFn)
          SerializableUtils.deserializeFromByteArray(
              combinePayload.getCombineFn().getPayload().toByteArray(), "CombineFn");
  Coder<AccumT> accumCoder =
      (Coder<AccumT>) rehydratedComponents.getCoder(combinePayload.getAccumulatorCoderId());

  // The downstream consumer receives (key, accumulator) pairs produced by the precombine.
  FnDataReceiver<WindowedValue<KV<KeyT, AccumT>>> consumer =
      (FnDataReceiver)
          pCollectionConsumerRegistry.getMultiplexingConsumer(
              Iterables.getOnlyElement(pTransform.getOutputsMap().values()));

  PrecombineRunner<KeyT, InputT, AccumT> runner =
      new PrecombineRunner<>(pipelineOptions, combineFn, consumer, keyCoder, accumCoder);

  // Register the appropriate handlers.
  startFunctionRegistry.register(pTransformId, runner::startBundle);
  pCollectionConsumerRegistry.register(
      Iterables.getOnlyElement(pTransform.getInputsMap().values()),
      pTransformId,
      (FnDataReceiver)
          (FnDataReceiver<WindowedValue<KV<KeyT, InputT>>>) runner::processElement);
  finishFunctionRegistry.register(pTransformId, runner::finishBundle);

  return runner;
}