Java Code Examples for org.apache.beam.sdk.transforms.Combine#PerKey

The following examples show how to use org.apache.beam.sdk.transforms.Combine#PerKey. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You can also check out the related API usage in the sidebar.
Example 1
Source File: CombineRunnersTest.java    From beam with Apache License 2.0 6 votes vote down vote up
/**
 * Builds a one-element pipeline containing a single per-key combine registered under {@code
 * TEST_COMBINE_ID}, then stores the translated pipeline proto, the ids of the input and output
 * PCollections, and the combine's PTransform proto for use by the tests.
 */
@Before
public void createPipeline() throws Exception {
  // The transform under test: a per-key combine driven by the test CombineFn.
  Combine.PerKey<String, String, Integer> perKeyCombine = Combine.perKey(new TestCombineFn());

  // Wire input -> combine -> output, pinning the coder on both PCollections explicitly.
  Pipeline pipeline = Pipeline.create();
  PCollection<KV<String, String>> input = pipeline.apply(Create.of(KV.of("unused", "0")));
  input.setCoder(KvCoder.of(StringUtf8Coder.of(), StringUtf8Coder.of()));
  PCollection<KV<String, Integer>> output = input.apply(TEST_COMBINE_ID, perKeyCombine);
  output.setCoder(KvCoder.of(StringUtf8Coder.of(), BigEndianIntegerCoder.of()));

  // Translate to FnApi protos and remember the pieces the runner needs.
  SdkComponents components = SdkComponents.create(pipeline.getOptions());
  pProto = PipelineTranslation.toProto(pipeline, components);
  inputPCollectionId = components.registerPCollection(input);
  outputPCollectionId = components.registerPCollection(output);
  pTransform = pProto.getComponents().getTransformsOrThrow(TEST_COMBINE_ID);
}
 
Example 2
Source File: CombineTranslation.java    From beam with Apache License 2.0 6 votes vote down vote up
/**
 * Builds the {@link FunctionSpec} for a per-key combine.
 *
 * @param transform the applied per-key combine to translate
 * @param components the SDK components passed through to the combine payload construction
 * @return a spec carrying the transform's URN and serialized combine payload, or {@code null}
 *     when the combine has side inputs
 * @throws IOException if the accumulator coder or the payload cannot be produced
 */
@Override
public FunctionSpec translate(
    AppliedPTransform<?, ?, Combine.PerKey<?, ?, ?>> transform, SdkComponents components)
    throws IOException {
  Combine.PerKey<?, ?, ?> perKey = transform.getTransform();

  // Combines with side inputs are translated as generic composites, which have a blank
  // FunctionSpec.
  if (!perKey.getSideInputs().isEmpty()) {
    return null;
  }

  GlobalCombineFn<?, ?, ?> combineFn = perKey.getFn();
  Coder<?> accumulatorCoder = extractAccumulatorCoder(combineFn, (AppliedPTransform) transform);
  return FunctionSpec.newBuilder()
      .setUrn(getUrn(perKey))
      .setPayload(combinePayload(combineFn, accumulatorCoder, components).toByteString())
      .build();
}
 
Example 3
Source File: CombineTranslation.java    From beam with Apache License 2.0 6 votes vote down vote up
/**
 * Asks the combine function for its accumulator coder, using the pipeline's coder registry and
 * the value coder of the transform's single main input.
 *
 * @param combineFn the combine function whose accumulator coder is wanted
 * @param transform the applied per-key combine providing the pipeline and the main input
 * @return the accumulator coder reported by {@code combineFn}
 * @throws IOException wrapping the {@link CannotProvideCoderException} if no coder can be
 *     provided
 */
private static <K, InputT, AccumT> Coder<AccumT> extractAccumulatorCoder(
    GlobalCombineFn<InputT, AccumT, ?> combineFn,
    AppliedPTransform<PCollection<KV<K, InputT>>, ?, Combine.PerKey<K, InputT, ?>> transform)
    throws IOException {
  // Exactly one non-additional input is expected; getOnlyElement enforces that.
  @SuppressWarnings("unchecked")
  PCollection<KV<K, InputT>> mainInput =
      (PCollection<KV<K, InputT>>)
          Iterables.getOnlyElement(TransformInputs.nonAdditionalInputs(transform));

  try {
    return combineFn.getAccumulatorCoder(
        transform.getPipeline().getCoderRegistry(),
        ((KvCoder<K, InputT>) mainInput.getCoder()).getValueCoder());
  } catch (CannotProvideCoderException cause) {
    throw new IOException("Could not obtain a Coder for the accumulator", cause);
  }
}
 
Example 4
Source File: GroupByKeyTranslator.java    From beam with Apache License 2.0 6 votes vote down vote up
/**
 * Picks the {@link SystemReduceFn} matching the given transform: a buffering reduce function
 * for {@link GroupByKey}, or a combining one for {@link Combine.PerKey}.
 *
 * @param transform the transform being translated
 * @param pipeline the pipeline supplying the coder registry for the combining case
 * @param kvInputCoder the coder of the keyed input
 * @return the reduce function for {@code transform}
 * @throws RuntimeException if {@code transform} is neither of the two supported types
 */
@SuppressWarnings("unchecked")
private static <K, InputT, OutputT>
    SystemReduceFn<K, InputT, ?, OutputT, BoundedWindow> getSystemReduceFn(
        PTransform<PCollection<KV<K, InputT>>, PCollection<KV<K, OutputT>>> transform,
        Pipeline pipeline,
        KvCoder<K, InputT> kvInputCoder) {
  // Plain grouping: just buffer the values.
  if (transform instanceof GroupByKey) {
    return (SystemReduceFn<K, InputT, ?, OutputT, BoundedWindow>)
        SystemReduceFn.buffering(kvInputCoder.getValueCoder());
  }

  // Per-key combine: wrap the user's combine function with the input coder.
  if (transform instanceof Combine.PerKey) {
    final CombineFnBase.GlobalCombineFn<? super InputT, ?, OutputT> fn =
        ((Combine.PerKey) transform).getFn();
    return SystemReduceFn.combining(
        kvInputCoder.getKeyCoder(),
        AppliedCombineFn.withInputCoder(fn, pipeline.getCoderRegistry(), kvInputCoder));
  }

  throw new RuntimeException("Transform " + transform + " cannot be translated as GroupByKey.");
}
 
Example 5
Source File: CombineTranslation.java    From beam with Apache License 2.0 4 votes vote down vote up
/**
 * Returns the URN identifying per-key combine transforms.
 *
 * @param transform the per-key combine; not consulted, the same constant is always returned
 * @return {@code COMBINE_PER_KEY_TRANSFORM_URN}
 */
@Override
public String getUrn(Combine.PerKey<?, ?, ?> transform) {
  return COMBINE_PER_KEY_TRANSFORM_URN;
}
 
Example 6
Source File: CombinePerKeyTranslatorBatch.java    From beam with Apache License 2.0 4 votes vote down vote up
/**
 * Translates a {@link Combine.PerKey} into dataset operations: the input dataset is grouped by
 * key, each group is aggregated with an {@code AggregatorCombiner} built from the transform's
 * combine function, and the aggregated values are flattened back into windowed {@code KV}
 * elements that are registered as the output dataset.
 *
 * @param transform the transform to translate; cast to {@link Combine.PerKey}
 * @param context the translation context supplying the input/output PCollections and datasets
 * @throws RuntimeException wrapping a {@link CannotProvideCoderException} if the combine
 *     function cannot provide an accumulator coder
 */
@Override
public void translateTransform(
    PTransform<PCollection<KV<K, InputT>>, PCollection<KV<K, OutputT>>> transform,
    TranslationContext context) {

  // Resolve the transform, its endpoints and the user's combine function.
  Combine.PerKey perKey = (Combine.PerKey) transform;
  @SuppressWarnings("unchecked")
  final PCollection<KV<K, InputT>> input = (PCollection<KV<K, InputT>>) context.getInput();
  @SuppressWarnings("unchecked")
  final PCollection<KV<K, OutputT>> output = (PCollection<KV<K, OutputT>>) context.getOutput();
  @SuppressWarnings("unchecked")
  final Combine.CombineFn<InputT, AccumT, OutputT> combineFn =
      (Combine.CombineFn<InputT, AccumT, OutputT>) perKey.getFn();
  final WindowingStrategy<?, ?> windowingStrategy = input.getWindowingStrategy();

  final Dataset<WindowedValue<KV<K, InputT>>> inputDataset = context.getDataset(input);

  // Coders for the key, the input values and the output values.
  final KvCoder<K, InputT> inputKvCoder = (KvCoder<K, InputT>) input.getCoder();
  final Coder<K> keyCoder = inputKvCoder.getKeyCoder();
  final KvCoder<K, OutputT> outputKvCoder = (KvCoder<K, OutputT>) output.getCoder();
  final Coder<OutputT> outputValueCoder = outputKvCoder.getValueCoder();

  final KeyValueGroupedDataset<K, WindowedValue<KV<K, InputT>>> groupedByKey =
      inputDataset.groupByKey(KVHelpers.extractKey(), EncoderHelpers.fromBeamCoder(keyCoder));

  final Coder<AccumT> accumulatorCoder;
  try {
    accumulatorCoder =
        combineFn.getAccumulatorCoder(
            input.getPipeline().getCoderRegistry(), inputKvCoder.getValueCoder());
  } catch (CannotProvideCoderException e) {
    throw new RuntimeException(e);
  }

  final Dataset<Tuple2<K, Iterable<WindowedValue<OutputT>>>> combinedPerKey =
      groupedByKey.agg(
          new AggregatorCombiner<K, InputT, AccumT, OutputT, BoundedWindow>(
                  combineFn, windowingStrategy, accumulatorCoder, outputValueCoder)
              .toColumn());

  // Each aggregated row holds a key plus its windowed results; emit one windowed KV per
  // result, re-attaching the key and keeping timestamp, windows and pane.
  final WindowedValue.WindowedValueCoder<KV<K, OutputT>> windowedKvCoder =
      WindowedValue.FullWindowedValueCoder.of(
          outputKvCoder, windowingStrategy.getWindowFn().windowCoder());
  final Dataset<WindowedValue<KV<K, OutputT>>> outputDataset =
      combinedPerKey.flatMap(
          (FlatMapFunction<
                  Tuple2<K, Iterable<WindowedValue<OutputT>>>, WindowedValue<KV<K, OutputT>>>)
              keyAndValues -> {
                K key = keyAndValues._1();
                List<WindowedValue<KV<K, OutputT>>> expanded = new ArrayList<>();
                for (WindowedValue<OutputT> combined : keyAndValues._2()) {
                  expanded.add(
                      WindowedValue.of(
                          KV.of(key, combined.getValue()),
                          combined.getTimestamp(),
                          combined.getWindows(),
                          combined.getPane()));
                }
                return expanded.iterator();
              },
          EncoderHelpers.fromBeamCoder(windowedKvCoder));
  context.putDataset(output, outputDataset);
}
 
Example 7
Source File: TransformTranslator.java    From beam with Apache License 2.0 4 votes vote down vote up
/**
 * Creates the evaluator for {@link Combine.PerKey}. The evaluator wraps the transform's combine
 * function in a keyed {@code SparkCombineFn}, accumulates the input RDD per key, extracts the
 * outputs, and registers the resulting RDD of windowed {@code KV}s as a bounded dataset.
 *
 * @return an evaluator whose native description is {@code "combineByKey(..., new <fn>(), ...)"}
 */
private static <K, InputT, AccumT, OutputT>
    TransformEvaluator<Combine.PerKey<K, InputT, OutputT>> combinePerKey() {
  return new TransformEvaluator<Combine.PerKey<K, InputT, OutputT>>() {
    @Override
    public void evaluate(
        Combine.PerKey<K, InputT, OutputT> transform, EvaluationContext context) {
      final PCollection<KV<K, InputT>> input = context.getInput(transform);

      // Everything below is gathered up front so it can be handed to the Spark functions.
      final KvCoder<K, InputT> kvCoder =
          (KvCoder<K, InputT>) context.getInput(transform).getCoder();
      @SuppressWarnings("unchecked")
      final CombineWithContext.CombineFnWithContext<InputT, AccumT, OutputT> fnWithContext =
          (CombineWithContext.CombineFnWithContext<InputT, AccumT, OutputT>)
              CombineFnUtil.toFnWithContext(transform.getFn());
      final WindowingStrategy<?, ?> windowingStrategy = input.getWindowingStrategy();
      final Map<TupleTag<?>, KV<WindowingStrategy<?, ?>, SideInputBroadcast<?>>> sideInputs =
          TranslationUtils.getSideInputs(transform.getSideInputs(), context);
      final SparkCombineFn<KV<K, InputT>, InputT, AccumT, OutputT> sparkCombineFn =
          SparkCombineFn.keyed(
              fnWithContext, context.getSerializableOptions(), sideInputs, windowingStrategy);

      final Coder<AccumT> accumulatorCoder;
      try {
        accumulatorCoder =
            fnWithContext.getAccumulatorCoder(
                context.getPipeline().getCoderRegistry(), kvCoder.getValueCoder());
      } catch (CannotProvideCoderException e) {
        throw new IllegalStateException("Could not determine coder for accumulator", e);
      }

      @SuppressWarnings("unchecked")
      JavaRDD<WindowedValue<KV<K, InputT>>> inputRdd =
          ((BoundedDataset<KV<K, InputT>>) context.borrowDataset(transform)).getRDD();

      // Step 1: fold the input into one windowed accumulator per key.
      JavaPairRDD<K, SparkCombineFn.WindowedAccumulator<KV<K, InputT>, InputT, AccumT, ?>>
          accumulatedByKey =
              GroupCombineFunctions.combinePerKey(
                  inputRdd,
                  sparkCombineFn,
                  kvCoder.getKeyCoder(),
                  kvCoder.getValueCoder(),
                  accumulatorCoder,
                  windowingStrategy);

      // Step 2: turn accumulators into outputs, then reshape the pairs into windowed KVs.
      JavaPairRDD<K, WindowedValue<OutputT>> outputsByKey =
          SparkCompat.extractOutput(accumulatedByKey, sparkCombineFn);
      JavaRDD<WindowedValue<KV<K, OutputT>>> outputRdd =
          outputsByKey
              .map(new TranslationUtils.FromPairFunction())
              .map(new TranslationUtils.ToKVByWindowInValueFunction<>());

      context.putDataset(transform, new BoundedDataset<>(outputRdd));
    }

    @Override
    public String toNativeString() {
      return "combineByKey(..., new <fn>(), ...)";
    }
  };
}
 
Example 8
Source File: HllCount.java    From beam with Apache License 2.0 2 votes vote down vote up
/**
 * Returns a {@link Combine.PerKey} {@code PTransform} that takes an input {@code
 * PCollection<KV<K, InputT>>} and returns a {@code PCollection<KV<K, byte[]>>} which consists
 * of the per-key HLL++ sketch computed from the values matching each key in the input {@code
 * PCollection}.
 *
 * @param <K> the type of the keys in the input and output {@code PCollection}s
 * @return the result of {@code Combine.perKey(initFn)}
 */
public <K> Combine.PerKey<K, InputT, byte[]> perKey() {
  return Combine.perKey(initFn);
}
 
Example 9
Source File: HllCount.java    From beam with Apache License 2.0 2 votes vote down vote up
/**
 * Returns a {@link Combine.PerKey} {@code PTransform} that takes an input {@code
 * PCollection<KV<K, byte[]>>} of (key, HLL++ sketch) pairs and returns a {@code
 * PCollection<KV<K, byte[]>>} of (key, new sketch merged from the input sketches under the
 * key).
 *
 * <p>If sketches of different {@code precision}s are merged, the merged sketch will get the
 * minimum precision encountered among all the input sketches.
 *
 * <p>Only sketches of the same type can be merged together. If incompatible sketches are
 * provided, a runtime error will occur.
 *
 * @param <K> the type of the keys in the input and output {@code PCollection}s
 * @return the result of {@code Combine.perKey(HllCountMergePartialFn.create())}
 */
public static <K> Combine.PerKey<K, byte[], byte[]> perKey() {
  return Combine.perKey(HllCountMergePartialFn.create());
}