Java Code Examples for org.apache.beam.model.pipeline.v1.RunnerApi#PCollection

The following examples show how to use org.apache.beam.model.pipeline.v1.RunnerApi#PCollection. You can vote up the examples you like or vote down the ones you don't, and go to the original project or source file by following the links above each example. You may also check out related API usage in the sidebar.
Example 1
Source File: PCollectionTranslationTest.java    From beam with Apache License 2.0 6 votes vote down vote up
/**
 * Round-trips {@code testCollection} through its proto representation and checks that the coder,
 * windowing strategy (with defaults fixed) and boundedness survive the trip.
 */
@Test
public void testEncodeDecodeCycle() throws Exception {
  // Translate to proto, collecting the referenced coder/windowing strategy in fresh components.
  SdkComponents components = SdkComponents.create();
  components.registerEnvironment(Environments.createDockerEnvironment("java"));
  RunnerApi.PCollection encoded = PCollectionTranslation.toProto(testCollection, components);
  RehydratedComponents rehydrated =
      RehydratedComponents.forComponents(components.toComponents());

  // Rebuild a PCollection inside a brand new pipeline from the proto.
  Pipeline targetPipeline = Pipeline.create();
  PCollection<?> roundTripped =
      PCollectionTranslation.fromProto(encoded, targetPipeline, rehydrated);

  // The rebuilt collection must match the original on every translated property.
  assertThat(roundTripped.getCoder(), equalTo(testCollection.getCoder()));
  assertThat(
      roundTripped.getWindowingStrategy(),
      equalTo(testCollection.getWindowingStrategy().fixDefaults()));
  assertThat(roundTripped.isBounded(), equalTo(testCollection.isBounded()));
}
 
Example 2
Source File: PCollectionTranslationTest.java    From beam with Apache License 2.0 6 votes vote down vote up
/**
 * Encodes {@code testCollection} to proto and checks each referenced field individually: the
 * coder and windowing strategy are resolved by id from the rehydrated components, and the
 * boundedness is read straight off the proto.
 */
@Test
public void testEncodeDecodeFields() throws Exception {
  SdkComponents components = SdkComponents.create();
  components.registerEnvironment(Environments.createDockerEnvironment("java"));
  RunnerApi.PCollection encoded = PCollectionTranslation.toProto(testCollection, components);
  RehydratedComponents rehydrated =
      RehydratedComponents.forComponents(components.toComponents());

  // Resolve every field of the proto on its own, without rebuilding a PCollection.
  Coder<?> coderFromProto = rehydrated.getCoder(encoded.getCoderId());
  WindowingStrategy<?, ?> strategyFromProto =
      rehydrated.getWindowingStrategy(encoded.getWindowingStrategyId());
  IsBounded boundednessFromProto = PCollectionTranslation.isBounded(encoded);

  assertThat(coderFromProto, equalTo(testCollection.getCoder()));
  assertThat(strategyFromProto, equalTo(testCollection.getWindowingStrategy().fixDefaults()));
  assertThat(boundednessFromProto, equalTo(testCollection.isBounded()));
}
 
Example 3
Source File: ProcessBundleDescriptors.java    From beam with Apache License 2.0 6 votes vote down vote up
/**
 * Rewrites the input coder of a stateful transform so that its key component is length prefixed.
 *
 * <p>The intent is that the key bytes the Runner uses to partition input elements match the key
 * bytes the SDK Harness sends along with state requests and timers. The input of a stateful
 * transform is required to use a KV coder; anything else is rejected.
 *
 * @param inputColId id of the input PCollection, looked up in {@code componentsBuilder}
 * @param componentsBuilder components that are read from and updated in place
 * @throws IllegalStateException if the input PCollection's coder is not a KV coder
 */
private static void lengthPrefixKeyCoder(
    String inputColId, Components.Builder componentsBuilder) {
  String inputCoderId = componentsBuilder.getPcollectionsOrThrow(inputColId).getCoderId();
  RunnerApi.Coder originalKvCoder = componentsBuilder.getCodersOrThrow(inputCoderId);

  String coderUrn = originalKvCoder.getSpec().getUrn();
  Preconditions.checkState(
      ModelCoders.KV_CODER_URN.equals(coderUrn),
      "Stateful executable stages must use a KV coder, but is: %s",
      coderUrn);

  // The original key coder is kept, only wrapped: LengthPrefixCoder<old_key_coder>.
  String originalKeyCoderId = ModelCoders.getKvCoderComponents(originalKvCoder).keyCoderId();
  String prefixedKeyCoderId =
      LengthPrefixUnknownCoders.addLengthPrefixedCoder(
          originalKeyCoderId, componentsBuilder, false);

  // Component 0 of the KV coder is the key; store the patched coder under the same coder id.
  RunnerApi.Coder patchedKvCoder =
      originalKvCoder.toBuilder().setComponentCoderIds(0, prefixedKeyCoderId).build();
  componentsBuilder.putCoders(inputCoderId, patchedKvCoder);
}
 
Example 4
Source File: PCollectionTranslation.java    From beam with Apache License 2.0 5 votes vote down vote up
/**
 * Converts a {@link PCollection} into its {@link RunnerApi.PCollection} proto.
 *
 * <p>The collection's coder and windowing strategy are registered with {@code components}; the
 * proto refers to them by the ids that registration returns.
 *
 * @param pCollection the collection to translate
 * @param components registry that receives the coder and windowing strategy
 * @return a proto carrying the collection's name, coder id, boundedness and windowing strategy id
 * @throws IOException propagated from registering the coder or windowing strategy
 */
public static RunnerApi.PCollection toProto(PCollection<?> pCollection, SdkComponents components)
    throws IOException {
  // Register both referenced components up front, coder first.
  String registeredCoderId = components.registerCoder(pCollection.getCoder());
  String registeredStrategyId =
      components.registerWindowingStrategy(pCollection.getWindowingStrategy());

  RunnerApi.PCollection.Builder proto = RunnerApi.PCollection.newBuilder();
  proto.setUniqueName(pCollection.getName());
  proto.setCoderId(registeredCoderId);
  proto.setIsBounded(toProto(pCollection.isBounded()));
  proto.setWindowingStrategyId(registeredStrategyId);
  return proto.build();
}
 
Example 5
Source File: PCollectionTranslation.java    From beam with Apache License 2.0 5 votes vote down vote up
/**
 * Rebuilds a {@link PCollection} in {@code pipeline} from its proto representation.
 *
 * @param pCollection proto holding the coder id, windowing strategy id and boundedness
 * @param pipeline pipeline the rebuilt collection is created in
 * @param components source of the coder and windowing strategy referenced by the proto
 * @return the rebuilt collection
 * @throws IOException propagated from resolving the coder or windowing strategy
 */
public static PCollection<?> fromProto(
    RunnerApi.PCollection pCollection, Pipeline pipeline, RehydratedComponents components)
    throws IOException {
  String coderId = pCollection.getCoderId();
  String windowingStrategyId = pCollection.getWindowingStrategyId();

  // Resolve the coder before anything else, then the strategy and boundedness as arguments.
  Coder<?> elementCoder = components.getCoder(coderId);
  return PCollection.createPrimitiveOutputInternal(
      pipeline,
      components.getWindowingStrategy(windowingStrategyId),
      fromProto(pCollection.getIsBounded()),
      // The element type is unknown at this point, hence the raw coder.
      (Coder) elementCoder);
}
 
Example 6
Source File: ParDoTranslation.java    From beam with Apache License 2.0 5 votes vote down vote up
/**
 * Looks up the main input PCollection of a ParDo or splittable-ParDo transform.
 *
 * @param ptransform transform whose spec URN must be one of the ParDo / splittable URNs
 * @param components components containing the transform's input PCollections
 * @return the PCollection registered under the transform's main input name
 * @throws IllegalArgumentException if the transform's URN is not one of the accepted ones
 */
public static RunnerApi.PCollection getMainInput(
    RunnerApi.PTransform ptransform, Components components) throws IOException {
  String urn = ptransform.getSpec().getUrn();
  boolean isSupportedUrn =
      PAR_DO_TRANSFORM_URN.equals(urn)
          || SPLITTABLE_PAIR_WITH_RESTRICTION_URN.equals(urn)
          || SPLITTABLE_SPLIT_AND_SIZE_RESTRICTIONS_URN.equals(urn)
          || SPLITTABLE_PROCESS_ELEMENTS_URN.equals(urn)
          || SPLITTABLE_PROCESS_SIZED_ELEMENTS_AND_RESTRICTIONS_URN.equals(urn);
  checkArgument(isSupportedUrn, "Unexpected payload type %s", urn);

  String mainInputId = ptransform.getInputsOrThrow(getMainInputName(ptransform));
  return components.getPcollectionsOrThrow(mainInputId);
}
 
Example 7
Source File: GreedyStageFuser.java    From beam with Apache License 2.0 5 votes vote down vote up
/**
 * Reports whether at least one input PCollection of {@code consumer} has a non-empty set of
 * singleton consumers in {@code pipeline}.
 *
 * @param consumer transform node whose inputs are inspected
 * @param pipeline pipeline used to resolve PCollections and query their singleton consumers
 * @return {@code true} as soon as one such input is found, {@code false} otherwise
 */
private static boolean anyInputsSideInputs(PTransformNode consumer, QueryablePipeline pipeline) {
  // Evaluated lazily per input id, so inspection stops at the first match.
  return consumer.getTransform().getInputsMap().values().stream()
      .map(
          inputId ->
              PipelineNode.pCollection(
                  inputId, pipeline.getComponents().getPcollectionsMap().get(inputId)))
      .anyMatch(inputNode -> !pipeline.getSingletonConsumers(inputNode).isEmpty());
}
 
Example 8
Source File: RegisterNodeFunction.java    From beam with Apache License 2.0 5 votes vote down vote up
/**
 * Builds an artificial {@link PCollectionView} that satisfies the API requirements of a {@link
 * SideInputReader} when used inside the Dataflow runner harness.
 *
 * <p>The view is backed by the runner wire coder of the side input PCollection, so that encoding
 * and decoding values agrees with the length prefixing applied when the side input was
 * materialized.
 *
 * @param pipeline pipeline whose components contain the side input PCollection
 * @param parDoPTransform transform that consumes the side input
 * @param sideInputTag local name of the side input on {@code parDoPTransform}
 * @param sideInput side input specification; only multimap materializations are accepted
 * @return a view tagged with {@code sideInputTag}
 * @throws IllegalArgumentException if the access pattern is not the multimap materialization
 * @throws IllegalStateException if the runner wire coder cannot be instantiated
 */
public static final PCollectionView<?> transformSideInputForRunner(
    RunnerApi.Pipeline pipeline,
    RunnerApi.PTransform parDoPTransform,
    String sideInputTag,
    RunnerApi.SideInput sideInput) {
  String accessPatternUrn = sideInput.getAccessPattern().getUrn();
  checkArgument(
      Materializations.MULTIMAP_MATERIALIZATION_URN.equals(accessPatternUrn),
      "This handler is only capable of dealing with %s materializations "
          + "but was asked to handle %s for PCollectionView with tag %s.",
      Materializations.MULTIMAP_MATERIALIZATION_URN,
      accessPatternUrn,
      sideInputTag);

  String pCollectionId = parDoPTransform.getInputsOrThrow(sideInputTag);
  RunnerApi.PCollection pCollectionProto =
      pipeline.getComponents().getPcollectionsOrThrow(pCollectionId);

  try {
    FullWindowedValueCoder<KV<Object, Object>> wireCoder =
        (FullWindowedValueCoder)
            WireCoders.instantiateRunnerWireCoder(
                PipelineNode.pCollection(pCollectionId, pCollectionProto),
                pipeline.getComponents());
    return DataflowPortabilityPCollectionView.with(new TupleTag<>(sideInputTag), wireCoder);
  } catch (IOException e) {
    throw new IllegalStateException("Unable to translate proto to coder", e);
  }
}
 
Example 9
Source File: CombineRunners.java    From beam with Apache License 2.0 4 votes vote down vote up
/**
 * Creates a {@link PrecombineRunner} for a combine transform and registers its start-bundle,
 * per-element and finish-bundle handlers.
 *
 * <p>The transform is expected to have exactly one input and exactly one output. The key coder is
 * taken from the input PCollection's coder, the combine function and accumulator coder from the
 * transform's {@code CombinePayload}.
 *
 * @return the runner that was registered
 * @throws IllegalArgumentException if the transform's input PCollection is missing from {@code
 *     pCollections}
 * @throws IOException propagated from rehydrating coders or parsing the combine payload
 */
@Override
public PrecombineRunner<KeyT, InputT, AccumT> createRunnerForPTransform(
    PipelineOptions pipelineOptions,
    BeamFnDataClient beamFnDataClient,
    BeamFnStateClient beamFnStateClient,
    BeamFnTimerClient beamFnTimerClient,
    String pTransformId,
    PTransform pTransform,
    Supplier<String> processBundleInstructionId,
    Map<String, PCollection> pCollections,
    Map<String, RunnerApi.Coder> coders,
    Map<String, RunnerApi.WindowingStrategy> windowingStrategies,
    PCollectionConsumerRegistry pCollectionConsumerRegistry,
    PTransformFunctionRegistry startFunctionRegistry,
    PTransformFunctionRegistry finishFunctionRegistry,
    Consumer<ThrowingRunnable> tearDownFunctions,
    Consumer<ProgressRequestCallback> addProgressRequestCallback,
    BundleSplitListener splitListener,
    BundleFinalizer bundleFinalizer)
    throws IOException {
  // Get objects needed to create the runner.
  RehydratedComponents rehydratedComponents =
      RehydratedComponents.forComponents(
          RunnerApi.Components.newBuilder()
              .putAllCoders(coders)
              .putAllWindowingStrategies(windowingStrategies)
              .build());

  // The transform has a single input; resolve its PCollection id once and reuse it below.
  String mainInputTag = Iterables.getOnlyElement(pTransform.getInputsMap().keySet());
  String mainInputId = pTransform.getInputsOrThrow(mainInputTag);
  RunnerApi.PCollection mainInput = pCollections.get(mainInputId);
  if (mainInput == null) {
    // Fail with a descriptive message instead of a bare NullPointerException further down.
    throw new IllegalArgumentException(
        String.format(
            "PTransform %s reads PCollection %s, which is not among the provided PCollections",
            pTransformId, mainInputId));
  }

  // Input coder may sometimes be WindowedValueCoder depending on runner, instead of the
  // expected KvCoder.
  Coder<?> uncastInputCoder = rehydratedComponents.getCoder(mainInput.getCoderId());
  KvCoder<KeyT, InputT> inputCoder;
  if (uncastInputCoder instanceof WindowedValueCoder) {
    inputCoder =
        (KvCoder<KeyT, InputT>)
            ((WindowedValueCoder<KV<KeyT, InputT>>) uncastInputCoder).getValueCoder();
  } else {
    // Reuse the coder that was already rehydrated rather than looking it up a second time.
    inputCoder = (KvCoder<KeyT, InputT>) uncastInputCoder;
  }
  Coder<KeyT> keyCoder = inputCoder.getKeyCoder();

  CombinePayload combinePayload = CombinePayload.parseFrom(pTransform.getSpec().getPayload());
  CombineFn<InputT, AccumT, ?> combineFn =
      (CombineFn)
          SerializableUtils.deserializeFromByteArray(
              combinePayload.getCombineFn().getPayload().toByteArray(), "CombineFn");
  Coder<AccumT> accumCoder =
      (Coder<AccumT>) rehydratedComponents.getCoder(combinePayload.getAccumulatorCoderId());

  // Everything the runner emits goes to the consumer(s) of the transform's single output.
  FnDataReceiver<WindowedValue<KV<KeyT, AccumT>>> consumer =
      (FnDataReceiver)
          pCollectionConsumerRegistry.getMultiplexingConsumer(
              Iterables.getOnlyElement(pTransform.getOutputsMap().values()));

  PrecombineRunner<KeyT, InputT, AccumT> runner =
      new PrecombineRunner<>(pipelineOptions, combineFn, consumer, keyCoder, accumCoder);

  // Register the appropriate handlers.
  startFunctionRegistry.register(pTransformId, runner::startBundle);
  pCollectionConsumerRegistry.register(
      mainInputId,
      pTransformId,
      (FnDataReceiver)
          (FnDataReceiver<WindowedValue<KV<KeyT, InputT>>>) runner::processElement);
  finishFunctionRegistry.register(pTransformId, runner::finishBundle);

  return runner;
}
 
Example 10
Source File: PCollectionTranslation.java    From beam with Apache License 2.0 4 votes vote down vote up
/**
 * Reads the boundedness recorded on a PCollection proto.
 *
 * @param pCollection proto to inspect
 * @return the SDK-level {@link IsBounded} value corresponding to the proto's boundedness enum
 */
public static IsBounded isBounded(RunnerApi.PCollection pCollection) {
  RunnerApi.IsBounded.Enum protoBoundedness = pCollection.getIsBounded();
  return fromProto(protoBoundedness);
}
 
Example 11
Source File: ParDoBoundMultiTranslator.java    From beam with Apache License 2.0 4 votes vote down vote up
/**
 * Translates a portable executable-stage transform into a {@link DoFnOp} applied to the stage's
 * input message stream, then registers the resulting output stream(s) with the context.
 *
 * <p>The stage payload is parsed from the transform's spec. Each output of the transform gets a
 * {@link TupleTag} and an index in iteration order of the outputs map; the first output is treated
 * as the main output. Side inputs are not supported yet, so the side input stream list is always
 * empty.
 *
 * @param transform the transform node to translate
 * @param pipeline pipeline providing components (coders, PCollections)
 * @param ctx translation context used to look up and register message streams
 * @throws RuntimeException if the executable stage payload cannot be parsed
 */
private static <InT, OutT> void doTranslatePortable(
    PipelineNode.PTransformNode transform,
    QueryablePipeline pipeline,
    PortableTranslationContext ctx) {
  Map<String, String> outputs = transform.getTransform().getOutputsMap();

  final RunnerApi.ExecutableStagePayload stagePayload;
  try {
    stagePayload =
        RunnerApi.ExecutableStagePayload.parseFrom(
            transform.getTransform().getSpec().getPayload());
  } catch (IOException e) {
    throw new RuntimeException(e);
  }
  String inputId = stagePayload.getInput();
  final MessageStream<OpMessage<InT>> inputStream = ctx.getMessageStreamById(inputId);
  // TODO: support side input
  final List<MessageStream<OpMessage<InT>>> sideInputStreams = Collections.emptyList();

  final Map<TupleTag<?>, Integer> tagToIndexMap = new HashMap<>();
  final Map<String, TupleTag<?>> idToTupleTagMap = new HashMap<>();

  // first output as the main output
  final TupleTag<OutT> mainOutputTag =
      outputs.isEmpty() ? null : new TupleTag<>(outputs.keySet().iterator().next());

  // Give every output a tag and a running index, and remember which PCollection id it feeds.
  int nextOutputIndex = 0;
  for (Map.Entry<String, String> output : outputs.entrySet()) {
    TupleTag<?> tupleTag = new TupleTag<>(output.getKey());
    tagToIndexMap.put(tupleTag, nextOutputIndex++);
    idToTupleTagMap.put(output.getValue(), tupleTag);
  }

  WindowedValue.WindowedValueCoder<InT> windowedInputCoder =
      ctx.instantiateCoder(inputId, pipeline.getComponents());

  final DoFnSchemaInformation doFnSchemaInformation;
  doFnSchemaInformation = ParDoTranslation.getSchemaInformation(transform.getTransform());

  Map<String, PCollectionView<?>> sideInputMapping =
      ParDoTranslation.getSideInputMapping(transform.getTransform());

  final RunnerApi.PCollection input = pipeline.getComponents().getPcollectionsOrThrow(inputId);
  final PCollection.IsBounded isBounded = SamzaPipelineTranslatorUtils.isBounded(input);

  final DoFnOp<InT, OutT, RawUnionValue> op =
      new DoFnOp<>(
          mainOutputTag,
          new NoOpDoFn<>(),
          null, // key coder not in use
          windowedInputCoder.getValueCoder(), // input coder not in use
          windowedInputCoder,
          Collections.emptyMap(), // output coders not in use
          Collections.emptyList(), // sideInputs not in use until side input support
          new ArrayList<>(idToTupleTagMap.values()), // used by java runner only
          SamzaPipelineTranslatorUtils.getPortableWindowStrategy(transform, pipeline),
          Collections.emptyMap(), // idToViewMap not in use until side input support
          new DoFnOp.MultiOutputManagerFactory(tagToIndexMap),
          ctx.getTransformFullName(),
          ctx.getTransformId(),
          isBounded,
          true,
          stagePayload,
          idToTupleTagMap,
          doFnSchemaInformation,
          sideInputMapping);

  // With no side inputs the main input is used as is; otherwise side input streams are merged in.
  final MessageStream<OpMessage<InT>> mergedStreams;
  if (sideInputStreams.isEmpty()) {
    mergedStreams = inputStream;
  } else {
    MessageStream<OpMessage<InT>> mergedSideInputStreams =
        MessageStream.mergeAll(sideInputStreams).flatMap(new SideInputWatermarkFn());
    mergedStreams = inputStream.merge(Collections.singletonList(mergedSideInputStreams));
  }

  final MessageStream<OpMessage<RawUnionValue>> taggedOutputStream =
      mergedStreams.flatMap(OpAdapter.adapt(op));

  for (int outputIndex : tagToIndexMap.values()) {
    // Keep every non-element message, but only the elements tagged for this output index.
    final MessageStream<OpMessage<OutT>> outputStream =
        taggedOutputStream
            .filter(
                message ->
                    message.getType() != OpMessage.Type.ELEMENT
                        || message.getElement().getValue().getUnionTag() == outputIndex)
            .flatMap(OpAdapter.adapt(new RawUnionValueToValue()));

    // NOTE(review): every output index is registered under the same ctx.getOutputId(transform);
    // presumably only single-output stages are supported here - confirm before relying on
    // multiple outputs.
    ctx.registerMessageStream(ctx.getOutputId(transform), outputStream);
  }
}
 
Example 12
Source File: GroupByKeyTranslator.java    From beam with Apache License 2.0 4 votes vote down vote up
/**
 * Translates a portable GroupByKey transform: gathers the input stream, windowing strategy and
 * coders from the context, delegates to {@code doTranslateGBK} and registers the resulting stream
 * as the transform's output.
 *
 * @param transform the GroupByKey transform node
 * @param pipeline pipeline providing components (coders, PCollections)
 * @param ctx translation context used to look up and register message streams
 */
private static <K, InputT, OutputT> void doTranslatePortable(
    PipelineNode.PTransformNode transform,
    QueryablePipeline pipeline,
    PortableTranslationContext ctx) {
  final MessageStream<OpMessage<KV<K, InputT>>> keyedInput =
      ctx.getOneInputMessageStream(transform);
  // Repartitioning is requested whenever more than one source partition may be in play.
  final boolean repartition = ctx.getSamzaPipelineOptions().getMaxSourceParallelism() > 1;
  final WindowingStrategy<?, BoundedWindow> strategy =
      ctx.getPortableWindowStrategy(transform, pipeline);
  final Coder<BoundedWindow> windowCoder = strategy.getWindowFn().windowCoder();

  // Derive the KV coder from the input's wire coder and pair it with the window coder.
  final String inputPCollectionId = ctx.getInputId(transform);
  final WindowedValue.WindowedValueCoder<KV<K, InputT>> wireCoder =
      ctx.instantiateCoder(inputPCollectionId, pipeline.getComponents());
  final KvCoder<K, InputT> kvCoder = (KvCoder<K, InputT>) wireCoder.getValueCoder();
  final Coder<WindowedValue<KV<K, InputT>>> windowedKvCoder =
      WindowedValue.FullWindowedValueCoder.of(kvCoder, windowCoder);

  final TupleTag<KV<K, OutputT>> mainOutputTag =
      new TupleTag<>(Iterables.getOnlyElement(transform.getTransform().getOutputsMap().keySet()));

  @SuppressWarnings("unchecked")
  final SystemReduceFn<K, InputT, ?, OutputT, BoundedWindow> bufferingReduceFn =
      (SystemReduceFn<K, InputT, ?, OutputT, BoundedWindow>)
          SystemReduceFn.buffering(kvCoder.getValueCoder());

  final RunnerApi.PCollection inputProto =
      pipeline.getComponents().getPcollectionsOrThrow(inputPCollectionId);
  final PCollection.IsBounded boundedness = SamzaPipelineTranslatorUtils.isBounded(inputProto);

  final MessageStream<OpMessage<KV<K, OutputT>>> grouped =
      doTranslateGBK(
          keyedInput,
          repartition,
          bufferingReduceFn,
          strategy,
          kvCoder,
          windowedKvCoder,
          ctx.getTransformFullName(),
          ctx.getTransformId(),
          mainOutputTag,
          boundedness);
  ctx.registerMessageStream(ctx.getOutputId(transform), grouped);
}
 
Example 13
Source File: SamzaPipelineTranslatorUtils.java    From beam with Apache License 2.0 4 votes vote down vote up
/**
 * Maps the boundedness of a PCollection proto onto {@link PCollection.IsBounded}.
 *
 * @param pCollection proto to inspect
 * @return {@code BOUNDED} when the proto says {@code BOUNDED}; {@code UNBOUNDED} for every other
 *     value
 */
public static PCollection.IsBounded isBounded(RunnerApi.PCollection pCollection) {
  if (pCollection.getIsBounded() == RunnerApi.IsBounded.Enum.BOUNDED) {
    return PCollection.IsBounded.BOUNDED;
  }
  // Anything that is not explicitly BOUNDED is treated as UNBOUNDED.
  return PCollection.IsBounded.UNBOUNDED;
}
 
Example 14
Source File: RegisterNodeFunction.java    From beam with Apache License 2.0 4 votes vote down vote up
/**
 * Updates the process bundle descriptor, and the PTransform the SDK harness will see, so that the
 * side input PCollection and its windowing strategy use length prefixed coders.
 *
 * @param pipeline pipeline holding the original components
 * @param originalPTransform transform that consumes the side input
 * @param sideInputTag local name of the side input on {@code originalPTransform}
 * @param processBundleDescriptor descriptor that receives the new coders, the generated windowing
 *     strategy and the replaced side input PCollection
 * @param updatedPTransform transform builder that gets the side input wired back in
 */
private static final void transformSideInputForSdk(
    RunnerApi.Pipeline pipeline,
    RunnerApi.PTransform originalPTransform,
    String sideInputTag,
    ProcessBundleDescriptor.Builder processBundleDescriptor,
    RunnerApi.PTransform.Builder updatedPTransform) {

  String sideInputPCollectionId = originalPTransform.getInputsOrThrow(sideInputTag);
  RunnerApi.Components pipelineComponents = pipeline.getComponents();
  RunnerApi.PCollection originalPCollection =
      pipelineComponents.getPcollectionsOrThrow(sideInputPCollectionId);
  String originalStrategyId = originalPCollection.getWindowingStrategyId();
  RunnerApi.WindowingStrategy originalStrategy =
      pipelineComponents.getWindowingStrategiesOrThrow(originalStrategyId);

  // TODO: We should not length prefix the window or key for the SDK side since the
  // key and window are already length delimited via protobuf itself. But we need to
  // maintain the length prefixing within the Runner harness to match the bytes that were
  // materialized to the side input sink.

  // Start from the pipeline's own coders plus whatever coders earlier side input processing
  // already put into the descriptor, then derive the length prefixed variants from that set.
  RunnerApi.Components.Builder workingComponents = pipelineComponents.toBuilder();
  workingComponents.putAllCoders(processBundleDescriptor.getCodersMap());

  String prefixedElementCoderId =
      LengthPrefixUnknownCoders.addLengthPrefixedCoder(
          originalPCollection.getCoderId(), workingComponents, false);
  String prefixedWindowCoderId =
      LengthPrefixUnknownCoders.addLengthPrefixedCoder(
          originalStrategy.getWindowCoderId(), workingComponents, false);

  // Publish all coders (including the new ones) before generating the windowing strategy id.
  processBundleDescriptor.putAllCoders(workingComponents.getCodersMap());
  String generatedStrategyId =
      SyntheticComponents.uniqueId(
          originalStrategyId + "-runner_generated",
          processBundleDescriptor.getWindowingStrategiesMap().keySet()::contains);
  RunnerApi.WindowingStrategy prefixedStrategy =
      originalStrategy.toBuilder().setWindowCoderId(prefixedWindowCoderId).build();
  processBundleDescriptor.putWindowingStrategies(generatedStrategyId, prefixedStrategy);

  RunnerApi.PCollection prefixedPCollection =
      originalPCollection
          .toBuilder()
          .setCoderId(prefixedElementCoderId)
          .setWindowingStrategyId(generatedStrategyId)
          .build();

  // The updated specification replaces the PCollection under its original id, and the updated
  // PTransform keeps reading that same id for this side input.
  processBundleDescriptor.putPcollections(sideInputPCollectionId, prefixedPCollection);
  updatedPTransform.putInputs(sideInputTag, sideInputPCollectionId);
}