Java Code Examples for org.apache.beam.sdk.transforms.Combine#Globally

The following examples show how to use org.apache.beam.sdk.transforms.Combine#Globally. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: CombineTranslation.java    From beam with Apache License 2.0 6 votes vote down vote up
/**
 * Builds the {@link FunctionSpec} for a {@link Combine.Globally} application.
 *
 * @param transform the applied global combine to translate
 * @param components the components used when building the combine payload
 * @return a spec carrying the transform's URN and serialized combine payload, or {@code null}
 *     when the combine declares side inputs
 * @throws IOException if the combine payload cannot be produced
 */
@Override
public FunctionSpec translate(
    AppliedPTransform<?, ?, Combine.Globally<?, ?>> transform, SdkComponents components)
    throws IOException {
  Combine.Globally<?, ?> combine = transform.getTransform();
  if (!combine.getSideInputs().isEmpty()) {
    // Combines with side inputs are translated as generic composites, which have a blank
    // FunctionSpec.
    return null;
  }
  return FunctionSpec.newBuilder()
      .setUrn(getUrn(combine))
      .setPayload(
          payloadForCombineGlobally((AppliedPTransform) transform, components).toByteString())
      .build();
}
 
Example 2
Source File: CombineTranslation.java    From beam with Apache License 2.0 5 votes vote down vote up
/**
 * Asks {@code combineFn} for its accumulator {@link Coder}, based on the coder of the
 * transform's single non-additional input.
 *
 * @param combineFn the combine function whose accumulator coder is wanted
 * @param transform the applied combine; its only non-additional input is taken as the main input
 * @return the accumulator coder reported by {@code combineFn}
 * @throws IOException if {@code combineFn} cannot provide an accumulator coder
 */
private static <InputT, AccumT> Coder<AccumT> extractAccumulatorCoder(
    GlobalCombineFn<InputT, AccumT, ?> combineFn,
    AppliedPTransform<PCollection<InputT>, ?, Combine.Globally<InputT, ?>> transform)
    throws IOException {
  // Locating the main input cannot raise CannotProvideCoderException, so it sits outside the try.
  @SuppressWarnings("unchecked")
  PCollection<InputT> onlyInput =
      (PCollection<InputT>)
          Iterables.getOnlyElement(TransformInputs.nonAdditionalInputs(transform));
  try {
    return combineFn.getAccumulatorCoder(
        transform.getPipeline().getCoderRegistry(), onlyInput.getCoder());
  } catch (CannotProvideCoderException cause) {
    // Surface coder-inference failures as the checked exception this method declares.
    throw new IOException("Could not obtain a Coder for the accumulator", cause);
  }
}
 
Example 3
Source File: CombineTranslation.java    From beam with Apache License 2.0 5 votes vote down vote up
/**
 * Produces a {@link RunnerApi.CombinePayload} from a {@link Combine.Globally}.
 *
 * <p>The payload is assembled from the transform's combine function and the accumulator coder
 * derived for it. No side-input check is performed directly in this method.
 *
 * @param transform the applied global combine to describe
 * @param components the components passed on to the payload builder
 * @return the payload built for the transform's combine function
 * @throws IOException if the accumulator coder cannot be obtained or the payload cannot be built
 */
@VisibleForTesting
static <InputT, OutputT> CombinePayload payloadForCombineGlobally(
    final AppliedPTransform<
            PCollection<InputT>, PCollection<OutputT>, Combine.Globally<InputT, OutputT>>
        transform,
    final SdkComponents components)
    throws IOException {
  Combine.Globally<InputT, OutputT> globalCombine = transform.getTransform();
  GlobalCombineFn<?, ?, ?> fn = globalCombine.getFn();
  // The raw cast lets the wildcard-typed fn be paired with the applied transform.
  Coder<?> accumCoder = extractAccumulatorCoder(fn, (AppliedPTransform) transform);
  return combinePayload(fn, accumCoder, components);
}
 
Example 4
Source File: CombineTranslationTest.java    From beam with Apache License 2.0 5 votes vote down vote up
/**
 * Applies {@code Combine.globally(combineFn)} to a small input, translates the resulting
 * transform to a {@code CombinePayload}, and checks that both the accumulator coder and the
 * serialized combine function survive the translation.
 */
@Test
public void testToProto() throws Exception {
  PCollection<Integer> input = pipeline.apply(Create.of(1, 2, 3));
  input.apply(Combine.globally(combineFn));

  // Walk the pipeline and capture the one Combine.Globally composite that was applied.
  final AtomicReference<AppliedPTransform<?, ?, Combine.Globally<?, ?>>> captured =
      new AtomicReference<>();
  pipeline.traverseTopologically(
      new PipelineVisitor.Defaults() {
        @Override
        public void leaveCompositeTransform(Node node) {
          if (!(node.getTransform() instanceof Combine.Globally)) {
            return;
          }
          // Exactly one global combine is expected in this pipeline.
          checkState(captured.get() == null);
          captured.set((AppliedPTransform) node.toAppliedPTransform(getPipeline()));
        }
      });
  checkState(captured.get() != null);
  AppliedPTransform<?, ?, Combine.Globally<?, ?>> applied = captured.get();
  assertEquals(combineFn, applied.getTransform().getFn());

  SdkComponents sdkComponents = SdkComponents.create();
  sdkComponents.registerEnvironment(Environments.createDockerEnvironment("java"));
  CombinePayload payload =
      CombineTranslation.CombineGloballyPayloadTranslator.payloadForCombineGlobally(
          (AppliedPTransform) applied, sdkComponents);
  RunnerApi.Components protoComponents = sdkComponents.toComponents();

  // The coder recovered from the proto must match what the fn itself reports.
  assertEquals(
      combineFn.getAccumulatorCoder(pipeline.getCoderRegistry(), input.getCoder()),
      getAccumulatorCoder(payload, RehydratedComponents.forComponents(protoComponents)));
  // The serialized fn must deserialize back to an equal object.
  assertEquals(
      combineFn,
      SerializableUtils.deserializeFromByteArray(
          payload.getCombineFn().getPayload().toByteArray(), "CombineFn"));
}
 
Example 5
Source File: CombineTranslationTest.java    From beam with Apache License 2.0 5 votes vote down vote up
/**
 * Same round-trip check as the plain proto test, but using a {@code TestCombineFnWithContext}
 * applied with {@code withoutDefaults()} and no side inputs: the translated payload must yield
 * an equal accumulator coder and an equal combine function.
 */
@Test
public void testToProtoWithoutSideInputs() throws Exception {
  PCollection<Integer> input = pipeline.apply(Create.of(1, 2, 3));
  CombineFnWithContext<Integer, int[], Integer> combineFn = new TestCombineFnWithContext();
  input.apply(Combine.globally(combineFn).withoutDefaults());

  // Find the single Combine.Globally composite in the pipeline graph.
  final AtomicReference<AppliedPTransform<?, ?, Combine.Globally<?, ?>>> holder =
      new AtomicReference<>();
  pipeline.traverseTopologically(
      new PipelineVisitor.Defaults() {
        @Override
        public void leaveCompositeTransform(Node node) {
          if (!(node.getTransform() instanceof Combine.Globally)) {
            return;
          }
          // A second match would mean the pipeline is not what this test assumes.
          checkState(holder.get() == null);
          holder.set((AppliedPTransform) node.toAppliedPTransform(getPipeline()));
        }
      });
  checkState(holder.get() != null);
  AppliedPTransform<?, ?, Combine.Globally<?, ?>> appliedCombine = holder.get();
  assertEquals(combineFn, appliedCombine.getTransform().getFn());

  SdkComponents sdkComponents = SdkComponents.create();
  sdkComponents.registerEnvironment(Environments.createDockerEnvironment("java"));
  CombinePayload translated =
      CombineTranslation.CombineGloballyPayloadTranslator.payloadForCombineGlobally(
          (AppliedPTransform) appliedCombine, sdkComponents);
  RunnerApi.Components rehydrateFrom = sdkComponents.toComponents();

  // Accumulator coder stored in the proto vs. the one the fn derives directly.
  assertEquals(
      combineFn.getAccumulatorCoder(pipeline.getCoderRegistry(), input.getCoder()),
      getAccumulatorCoder(translated, RehydratedComponents.forComponents(rehydrateFrom)));
  // Serialized fn bytes must decode to an object equal to the original fn.
  assertEquals(
      combineFn,
      SerializableUtils.deserializeFromByteArray(
          translated.getCombineFn().getPayload().toByteArray(), "CombineFn"));
}
 
Example 6
Source File: CombineTranslationTest.java    From beam with Apache License 2.0 5 votes vote down vote up
/**
 * Builds a {@code Combine.globally(...)} that declares a side input, then attempts to translate
 * it with {@code payloadForCombineGlobally}, expecting an {@link IllegalArgumentException}.
 *
 * <p>NOTE(review): {@code exception} is presumably a JUnit {@code ExpectedException} rule
 * declared on the test class; confirm. The expectation is armed before any setup runs, so an
 * {@code IllegalArgumentException} thrown by any later statement would satisfy it; confirm which
 * call is intended to throw.
 */
@Test
public void testToProtoWithSideInputsFails() throws Exception {
  exception.expect(IllegalArgumentException.class);

  PCollection<Integer> input = pipeline.apply(Create.of(1, 2, 3));
  // Single-element view used only so the combine has a side input attached.
  final PCollectionView<Iterable<String>> sideInputs =
      pipeline.apply(Create.of("foo")).apply(View.asIterable());

  // Combine fn whose output extraction reads the side input view.
  CombineFnWithContext<Integer, int[], Integer> combineFn =
      new TestCombineFnWithContext() {
        @Override
        public Integer extractOutput(int[] accumulator, Context c) {
          // NOTE(review): the value read here is never used; only the access itself matters.
          Iterable<String> sideInput = c.sideInput(sideInputs);
          return accumulator[0];
        }
      };

  input.apply(Combine.globally(combineFn).withSideInputs(sideInputs).withoutDefaults());
  // Capture the applied Combine.Globally composite while traversing the pipeline.
  final AtomicReference<AppliedPTransform<?, ?, Combine.Globally<?, ?>>> combine =
      new AtomicReference<>();
  pipeline.traverseTopologically(
      new PipelineVisitor.Defaults() {
        @Override
        public void leaveCompositeTransform(Node node) {
          if (node.getTransform() instanceof Combine.Globally) {
            // Fails if more than one global combine is encountered.
            checkState(combine.get() == null);
            combine.set((AppliedPTransform) node.toAppliedPTransform(getPipeline()));
          }
        }
      });

  SdkComponents sdkComponents = SdkComponents.create();
  sdkComponents.registerEnvironment(Environments.createDockerEnvironment("java"));
  // NOTE(review): `payload` is never read; the call is made only for the exception it should
  // raise. Unlike the sibling tests, there is no check that `combine` was actually populated.
  CombinePayload payload =
      CombineTranslation.CombineGloballyPayloadTranslator.payloadForCombineGlobally(
          (AppliedPTransform) combine.get(), sdkComponents);
}
 
Example 7
Source File: CombineTranslation.java    From beam with Apache License 2.0 4 votes vote down vote up
/**
 * Returns the URN used for global combine transforms.
 *
 * @param transform the transform being translated; not inspected by this implementation
 * @return {@code COMBINE_GLOBALLY_TRANSFORM_URN}, regardless of the transform passed in
 */
@Override
public String getUrn(Combine.Globally<?, ?> transform) {
  return COMBINE_GLOBALLY_TRANSFORM_URN;
}
 
Example 8
Source File: TransformTranslator.java    From beam with Apache License 2.0 4 votes vote down vote up
/**
 * Creates the evaluator that runs a {@link Combine.Globally} transform on a bounded Spark
 * dataset.
 *
 * <p>The evaluator combines the input RDD into a windowed accumulator, extracts the output
 * values, and registers the resulting RDD as the transform's output dataset. When the
 * accumulator is empty it emits either the combine function's default value (in the global
 * window) or an empty RDD, depending on {@code transform.isInsertDefault()}.
 *
 * @param <InputT> element type of the input collection
 * @param <AccumT> accumulator type of the combine function
 * @param <OutputT> element type of the output collection
 * @return a new evaluator for global combines
 */
private static <InputT, AccumT, OutputT>
    TransformEvaluator<Combine.Globally<InputT, OutputT>> combineGlobally() {
  return new TransformEvaluator<Combine.Globally<InputT, OutputT>>() {

    /**
     * Evaluates the combine and stores the result via {@code context.putDataset}.
     *
     * @throws IllegalStateException if the accumulator coder cannot be determined
     */
    @Override
    public void evaluate(Combine.Globally<InputT, OutputT> transform, EvaluationContext context) {
      final PCollection<InputT> input = context.getInput(transform);
      // NOTE(review): getInput(transform) is looked up a second time here; presumably
      // input.getCoder() would be equivalent. Confirm getInput has no side effects.
      final Coder<InputT> iCoder = context.getInput(transform).getCoder();
      final Coder<OutputT> oCoder = context.getOutput(transform).getCoder();
      final WindowingStrategy<?, ?> windowingStrategy = input.getWindowingStrategy();
      // Adapt whatever fn the transform carries to the context-aware combine fn interface.
      @SuppressWarnings("unchecked")
      final CombineWithContext.CombineFnWithContext<InputT, AccumT, OutputT> combineFn =
          (CombineWithContext.CombineFnWithContext<InputT, AccumT, OutputT>)
              CombineFnUtil.toFnWithContext(transform.getFn());
      // Coder for windowed output values: output coder plus the window fn's window coder.
      final WindowedValue.FullWindowedValueCoder<OutputT> wvoCoder =
          WindowedValue.FullWindowedValueCoder.of(
              oCoder, windowingStrategy.getWindowFn().windowCoder());
      final boolean hasDefault = transform.isInsertDefault();

      final SparkCombineFn<InputT, InputT, AccumT, OutputT> sparkCombineFn =
          SparkCombineFn.globally(
              combineFn,
              context.getSerializableOptions(),
              TranslationUtils.getSideInputs(transform.getSideInputs(), context),
              windowingStrategy);
      final Coder<AccumT> aCoder;
      try {
        aCoder = combineFn.getAccumulatorCoder(context.getPipeline().getCoderRegistry(), iCoder);
      } catch (CannotProvideCoderException e) {
        throw new IllegalStateException("Could not determine coder for accumulator", e);
      }

      // The dataset for this transform is cast to a bounded dataset to reach its RDD.
      @SuppressWarnings("unchecked")
      JavaRDD<WindowedValue<InputT>> inRdd =
          ((BoundedDataset<InputT>) context.borrowDataset(transform)).getRDD();

      JavaRDD<WindowedValue<OutputT>> outRdd;

      SparkCombineFn.WindowedAccumulator<InputT, InputT, AccumT, ?> accumulated =
          GroupCombineFunctions.combineGlobally(inRdd, sparkCombineFn, aCoder, windowingStrategy);

      if (!accumulated.isEmpty()) {
        Iterable<WindowedValue<OutputT>> output = sparkCombineFn.extractOutput(accumulated);
        // Outputs are encoded to byte arrays before parallelize and decoded again in the map.
        outRdd =
            context
                .getSparkContext()
                .parallelize(CoderHelpers.toByteArrays(output, wvoCoder))
                .map(CoderHelpers.fromByteFunction(wvoCoder));
      } else {
        // handle empty input RDD, which will naturally skip the entire execution
        // as Spark will not run on empty RDDs.
        // NOTE(review): this branch wraps inRdd.context() in a new JavaSparkContext, while the
        // non-empty branch uses context.getSparkContext(); confirm the difference is intended.
        JavaSparkContext jsc = new JavaSparkContext(inRdd.context());
        if (hasDefault) {
          // Emit the fn's default value as a single element placed in the global window.
          OutputT defaultValue = combineFn.defaultValue();
          outRdd =
              jsc.parallelize(Lists.newArrayList(CoderHelpers.toByteArray(defaultValue, oCoder)))
                  .map(CoderHelpers.fromByteFunction(oCoder))
                  .map(WindowedValue::valueInGlobalWindow);
        } else {
          // No default requested: the output is simply empty.
          outRdd = jsc.emptyRDD();
        }
      }

      context.putDataset(transform, new BoundedDataset<>(outRdd));
    }

    /** Returns a fixed, human-readable sketch of the native operation this evaluator performs. */
    @Override
    public String toNativeString() {
      return "aggregate(..., new <fn>(), ...)";
    }
  };
}
 
Example 9
Source File: HllCount.java    From beam with Apache License 2.0 2 votes vote down vote up
/**
 * Returns a {@link Combine.Globally} {@code PTransform} that takes an input {@code
 * PCollection<InputT>} and returns a {@code PCollection<byte[]>} which consists of the HLL++
 * sketch computed from the elements in the input {@code PCollection}.
 *
 * <p>Returns a singleton {@code PCollection} with an "empty sketch" (byte array of length 0)
 * if the input {@code PCollection} is empty.
 *
 * @return the result of {@code Combine.globally(initFn)}
 */
public Combine.Globally<InputT, byte[]> globally() {
  return Combine.globally(initFn);
}
 
Example 10
Source File: HllCount.java    From beam with Apache License 2.0 2 votes vote down vote up
/**
 * Returns a {@link Combine.Globally} {@code PTransform} that takes an input {@code
 * PCollection<byte[]>} of HLL++ sketches and returns a {@code PCollection<byte[]>} of a new
 * sketch merged from the input sketches.
 *
 * <p>Only sketches of the same type can be merged together. If incompatible sketches are
 * provided, a runtime error will occur.
 *
 * <p>If sketches of different {@code precision}s are merged, the merged sketch will get the
 * minimum precision encountered among all the input sketches.
 *
 * <p>Returns a singleton {@code PCollection} with an "empty sketch" (byte array of length 0) if
 * the input {@code PCollection} is empty.
 *
 * @return the result of {@code Combine.globally(HllCountMergePartialFn.create())}
 */
public static Combine.Globally<byte[], byte[]> globally() {
  return Combine.globally(HllCountMergePartialFn.create());
}