Java Code Examples for org.apache.beam.sdk.transforms.ParDo#MultiOutput

The following examples show how to use org.apache.beam.sdk.transforms.ParDo#MultiOutput. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: ParDoTranslation.java    From beam with Apache License 2.0 6 votes vote down vote up
/**
 * Returns the tags of every output of the applied transform other than its main output.
 *
 * <p>A {@link ParDo.MultiOutput} is asked directly. Any other transform is converted to its proto
 * form, and every key of the proto's output map except the main output tag id is wrapped in a new
 * {@link TupleTag}.
 *
 * @param application the applied transform to inspect
 * @return the additional (non-main) output tags
 * @throws IOException declared for the proto conversion and payload parsing path
 */
public static TupleTagList getAdditionalOutputTags(AppliedPTransform<?, ?, ?> application)
    throws IOException {
  PTransform<?, ?> candidate = application.getTransform();
  if (candidate instanceof ParDo.MultiOutput) {
    ParDo.MultiOutput<?, ?> multiOutput = (ParDo.MultiOutput<?, ?>) candidate;
    return multiOutput.getAdditionalOutputTags();
  }

  // Not a Java ParDo.MultiOutput: derive the tags from the proto representation instead.
  RunnerApi.PTransform proto =
      PTransformTranslation.toProto(
          application, SdkComponents.create(application.getPipeline().getOptions()));
  ParDoPayload parsedPayload = ParDoPayload.parseFrom(proto.getSpec().getPayload());
  String mainOutputId = getMainOutputTag(parsedPayload).getId();

  ArrayList<TupleTag<?>> result = new ArrayList<>();
  for (String tagId :
      Sets.difference(proto.getOutputsMap().keySet(), Collections.singleton(mainOutputId))) {
    result.add(new TupleTag<>(tagId));
  }
  return TupleTagList.of(result);
}
 
Example 2
Source File: ParDoTranslationTest.java    From beam with Apache License 2.0 6 votes vote down vote up
/**
 * Supplies the {@link ParDo.MultiOutput} instances used as parameters (see the
 * {@code @Parameters} annotation): combinations of side inputs and additional output tags, plus
 * one case each for {@code SplittableDropElementsFn} and {@code StateTimerDropElementsFn}.
 */
@Parameters(name = "{index}: {0}")
public static Iterable<ParDo.MultiOutput<?, ?>> data() {
  // No side inputs, no additional outputs.
  ParDo.MultiOutput<?, ?> noExtras =
      ParDo.of(new DropElementsFn()).withOutputTags(new TupleTag<>(), TupleTagList.empty());

  // Side inputs only.
  ParDo.MultiOutput<?, ?> sideInputsOnly =
      ParDo.of(new DropElementsFn())
          .withOutputTags(new TupleTag<>(), TupleTagList.empty())
          .withSideInputs(singletonSideInput, multimapSideInput);

  // Side inputs and two additional outputs.
  ParDo.MultiOutput<?, ?> sideInputsAndOutputs =
      ParDo.of(new DropElementsFn())
          .withOutputTags(
              new TupleTag<>(),
              TupleTagList.of(new TupleTag<byte[]>() {}).and(new TupleTag<Integer>() {}))
          .withSideInputs(singletonSideInput, multimapSideInput);

  // Two additional outputs only.
  ParDo.MultiOutput<?, ?> outputsOnly =
      ParDo.of(new DropElementsFn())
          .withOutputTags(
              new TupleTag<>(),
              TupleTagList.of(new TupleTag<byte[]>() {}).and(new TupleTag<Integer>() {}));

  ParDo.MultiOutput<?, ?> splittableFn =
      ParDo.of(new SplittableDropElementsFn())
          .withOutputTags(new TupleTag<>(), TupleTagList.empty());

  ParDo.MultiOutput<?, ?> stateTimerFn =
      ParDo.of(new StateTimerDropElementsFn())
          .withOutputTags(new TupleTag<>(), TupleTagList.empty());

  return ImmutableList.of(
      noExtras, sideInputsOnly, sideInputsAndOutputs, outputsOnly, splittableFn, stateTimerFn);
}
 
Example 3
Source File: ParDoTranslation.java    From beam with Apache License 2.0 6 votes vote down vote up
/**
 * Builds the {@link RunnerApi.PTransform} proto for an applied multi-output ParDo.
 *
 * <p>Starts from the generic translation of the applied transform, then sets a function spec
 * carrying {@code PAR_DO_TRANSFORM_URN} with the serialized {@link ParDoPayload}, and the
 * environment id returned by {@code components.getOnlyEnvironmentId()}.
 *
 * @param appliedPTransform the applied transform; cast (unchecked) to an applied
 *     {@link ParDo.MultiOutput}
 * @param subtransforms forwarded unchanged to the generic translation
 * @param components the SDK components used throughout the translation
 * @return the finished proto
 * @throws IOException declared by the translation helpers
 */
@Override
public RunnerApi.PTransform translate(
    AppliedPTransform<?, ?, ?> appliedPTransform,
    List<AppliedPTransform<?, ?, ?>> subtransforms,
    SdkComponents components)
    throws IOException {
  RunnerApi.PTransform.Builder protoBuilder =
      PTransformTranslation.translateAppliedPTransform(
          appliedPTransform, subtransforms, components);

  AppliedPTransform<?, ?, ParDo.MultiOutput<?, ?>> parDoApplication =
      (AppliedPTransform<?, ?, ParDo.MultiOutput<?, ?>>) appliedPTransform;
  ParDoPayload parDoPayload = translateParDo(parDoApplication, components);

  RunnerApi.FunctionSpec parDoSpec =
      RunnerApi.FunctionSpec.newBuilder()
          .setUrn(PAR_DO_TRANSFORM_URN)
          .setPayload(parDoPayload.toByteString())
          .build();

  return protoBuilder
      .setSpec(parDoSpec)
      .setEnvironmentId(components.getOnlyEnvironmentId())
      .build();
}
 
Example 4
Source File: ParDoTranslation.java    From beam with Apache License 2.0 6 votes vote down vote up
/**
 * Translates an applied {@link ParDo.MultiOutput} into a {@link ParDoPayload}.
 *
 * <p>The main input is identified as the single input tag id that is not the tag id of any side
 * input; {@code Iterables.getOnlyElement} is used, so exactly one such id is expected. The work is
 * then handed to the five-argument {@code translateParDo} overload.
 *
 * @param appliedPTransform the applied multi-output ParDo
 * @param components the SDK components passed on to the overload
 * @return the payload produced by the overload
 * @throws IOException declared by the overload
 */
public static ParDoPayload translateParDo(
    AppliedPTransform<?, ?, ParDo.MultiOutput<?, ?>> appliedPTransform, SdkComponents components)
    throws IOException {
  final ParDo.MultiOutput<?, ?> multiOutput = appliedPTransform.getTransform();
  final Pipeline owningPipeline = appliedPTransform.getPipeline();
  final DoFn<?, ?> fn = multiOutput.getFn();

  // Ids of every input, then ids of the side inputs; the difference is the main input.
  Set<String> inputIds =
      appliedPTransform.getInputs().keySet().stream()
          .map(TupleTag::getId)
          .collect(Collectors.toSet());
  Set<String> sideInputIds =
      multiOutput.getSideInputs().values().stream()
          .map(view -> view.getTagInternal().getId())
          .collect(Collectors.toSet());
  String mainInputId = Iterables.getOnlyElement(Sets.difference(inputIds, sideInputIds));

  PCollection<?> mainInput =
      (PCollection<?>) appliedPTransform.getInputs().get(new TupleTag<>(mainInputId));
  final DoFnSchemaInformation schemaInformation = ParDo.getDoFnSchemaInformation(fn, mainInput);

  return translateParDo(
      (ParDo.MultiOutput) multiOutput, mainInput, schemaInformation, owningPipeline, components);
}
 
Example 5
Source File: SplittableParDoTest.java    From beam with Apache License 2.0 5 votes vote down vote up
/**
 * Wraps {@code fn} in a multi-output ParDo with no additional outputs, assembles an
 * {@link AppliedPTransform} for it by hand, and applies {@code SplittableParDo.forAppliedParDo}
 * of that application to {@code input}.
 *
 * @param name the name under which the splittable ParDo is applied
 * @param input the input collection
 * @param fn the DoFn to run
 * @return the collection stored under {@code MAIN_OUTPUT_TAG} in the result
 */
private PCollection<String> applySplittableParDo(
    String name, PCollection<Integer> input, DoFn<Integer, String> fn) {
  ParDo.MultiOutput<Integer, String> parDo =
      ParDo.of(fn).withOutputTags(MAIN_OUTPUT_TAG, TupleTagList.empty());

  // Expand directly (rather than apply) and name the main output of that expansion.
  PCollectionTuple expanded = parDo.expand(input);
  expanded.get(MAIN_OUTPUT_TAG).setName("main");

  AppliedPTransform<PCollection<Integer>, PCollectionTuple, ?> application =
      AppliedPTransform.of("ParDo", input.expand(), expanded.expand(), parDo, pipeline);
  PCollectionTuple splittableOutputs =
      input.apply(name, SplittableParDo.forAppliedParDo(application));
  return splittableOutputs.get(MAIN_OUTPUT_TAG);
}
 
Example 6
Source File: ParDoBoundMultiTranslator.java    From beam with Apache License 2.0 5 votes vote down vote up
/**
 * Translates a multi-output ParDo by delegating to {@code doTranslate} with the same arguments.
 *
 * @param transform the multi-output ParDo to translate
 * @param node the transform hierarchy node passed through to {@code doTranslate}
 * @param ctx the translation context passed through to {@code doTranslate}
 */
@Override
public void translate(
    ParDo.MultiOutput<InT, OutT> transform,
    TransformHierarchy.Node node,
    TranslationContext ctx) {
  doTranslate(transform, node, ctx);
}
 
Example 7
Source File: ParDoTranslation.java    From beam with Apache License 2.0 5 votes vote down vote up
/**
 * Returns the side input views of the applied transform.
 *
 * <p>A {@link ParDo.MultiOutput} supplies its views directly. For any other transform the views
 * are rebuilt from the side input map of its proto payload; each side input tag must have a
 * matching entry among the application's inputs.
 *
 * @param application the applied transform to inspect
 * @return a new list of side input views
 * @throws IOException declared for the proto conversion and payload parsing path
 * @throws NullPointerException via {@code checkNotNull} when a side input tag has no input
 */
public static List<PCollectionView<?>> getSideInputs(AppliedPTransform<?, ?, ?> application)
    throws IOException {
  PTransform<?, ?> candidate = application.getTransform();
  if (candidate instanceof ParDo.MultiOutput) {
    ParDo.MultiOutput<?, ?> multiOutput = (ParDo.MultiOutput<?, ?>) candidate;
    return new ArrayList<>(multiOutput.getSideInputs().values());
  }

  SdkComponents sdkComponents = SdkComponents.create(application.getPipeline().getOptions());
  RunnerApi.PTransform parDoProto = PTransformTranslation.toProto(application, sdkComponents);
  ParDoPayload parsedPayload = ParDoPayload.parseFrom(parDoProto.getSpec().getPayload());
  RehydratedComponents rehydrated =
      RehydratedComponents.forComponents(sdkComponents.toComponents());

  List<PCollectionView<?>> rebuiltViews = new ArrayList<>();
  for (Map.Entry<String, SideInput> entry : parsedPayload.getSideInputsMap().entrySet()) {
    String tagId = entry.getKey();
    PCollection<?> sourceCollection =
        checkNotNull(
            (PCollection<?>) application.getInputs().get(new TupleTag<>(tagId)),
            "no input with tag %s",
            tagId);
    rebuiltViews.add(
        PCollectionViewTranslation.viewFromProto(
            entry.getValue(), tagId, sourceCollection, parDoProto, rehydrated));
  }
  return rebuiltViews;
}
 
Example 8
Source File: SplittableParDoOverrides.java    From beam with Apache License 2.0 5 votes vote down vote up
/**
 * Replaces the applied multi-output ParDo with {@code SplittableParDo.forAppliedParDo} of the
 * same application, keeping its singleton main input as the replacement's input.
 *
 * @param appliedTransform the applied ParDo being overridden
 * @return the replacement pairing the main input with the splittable ParDo
 */
@Override
public PTransformReplacement<PCollection<InputT>, PCollectionTuple> getReplacementTransform(
    AppliedPTransform<PCollection<InputT>, PCollectionTuple, ParDo.MultiOutput<InputT, OutputT>>
        appliedTransform) {
  PCollection<InputT> mainInput =
      PTransformReplacements.getSingletonMainInput(appliedTransform);
  return PTransformReplacement.of(mainInput, SplittableParDo.forAppliedParDo(appliedTransform));
}
 
Example 9
Source File: ParDoBoundMultiTranslator.java    From beam with Apache License 2.0 5 votes vote down vote up
@Override
public Map<String, String> createConfig(
    ParDo.MultiOutput<InT, OutT> transform, TransformHierarchy.Node node, ConfigContext ctx) {
  final Map<String, String> config = new HashMap<>();
  final DoFnSignature signature = DoFnSignatures.getSignature(transform.getFn().getClass());
  final SamzaPipelineOptions options = ctx.getPipelineOptions();

  if (signature.usesState()) {
    // set up user state configs
    for (DoFnSignature.StateDeclaration state : signature.stateDeclarations().values()) {
      final String storeId = state.id();
      config.put(
          "stores." + storeId + ".factory",
          "org.apache.samza.storage.kv.RocksDbKeyValueStorageEngineFactory");
      config.put("stores." + storeId + ".key.serde", "byteArraySerde");
      config.put("stores." + storeId + ".msg.serde", "byteSerde");

      if (options.getStateDurable()) {
        config.put(
            "stores." + storeId + ".changelog",
            ConfigBuilder.getChangelogTopic(options, storeId));
      }
    }
  }

  if (doFnInvokerRegistrar != null) {
    config.putAll(doFnInvokerRegistrar.configFor(transform.getFn()));
  }

  return config;
}
 
Example 10
Source File: BatchStatefulParDoOverrides.java    From beam with Apache License 2.0 5 votes vote down vote up
/**
 * Returns a {@link PTransformOverrideFactory} that replaces a multi-output {@link ParDo} with a
 * composite transform specialized for the {@link DataflowRunner}.
 *
 * @param options pipeline options; only the result of {@code isFnApi(options)} is passed on to
 *     the created factory
 * @return a new {@code MultiOutputOverrideFactory}
 */
public static <K, InputT, OutputT>
    PTransformOverrideFactory<
            PCollection<KV<K, InputT>>,
            PCollectionTuple,
            ParDo.MultiOutput<KV<K, InputT>, OutputT>>
        multiOutputOverrideFactory(DataflowPipelineOptions options) {
  return new MultiOutputOverrideFactory<>(isFnApi(options));
}
 
Example 11
Source File: PTransformMatchers.java    From beam with Apache License 2.0 5 votes vote down vote up
/**
 * Creates a {@link PTransformMatcher} that accepts only a {@link ParDo.MultiOutput} whose
 * {@link DoFn} signature reports {@link DoFnSignature#usesState()} or
 * {@link DoFnSignature#usesTimers()}.
 *
 * @return the matcher; its {@code toString()} identifies it as
 *     {@code StateOrTimerParDoMultiMatcher}
 */
public static PTransformMatcher stateOrTimerParDoMulti() {
  return new PTransformMatcher() {
    @Override
    public boolean matches(AppliedPTransform<?, ?, ?> application) {
      PTransform<?, ?> candidate = application.getTransform();
      if (!(candidate instanceof ParDo.MultiOutput)) {
        return false;
      }
      DoFn<?, ?> doFn = ((ParDo.MultiOutput<?, ?>) candidate).getFn();
      DoFnSignature fnSignature = DoFnSignatures.signatureForDoFn(doFn);
      return fnSignature.usesState() || fnSignature.usesTimers();
    }

    @Override
    public String toString() {
      return MoreObjects.toStringHelper("StateOrTimerParDoMultiMatcher").toString();
    }
  };
}
 
Example 12
Source File: RequiresStableInputParDoOverrides.java    From beam with Apache License 2.0 5 votes vote down vote up
/**
 * Replaces the applied multi-output ParDo with a composite that first applies
 * {@code Reshuffle.viaRandomKey()} to the input and then applies the original ParDo to the
 * reshuffled collection.
 *
 * @param appliedTransform the applied ParDo being overridden
 * @return the replacement pairing the singleton main input with the composite transform
 */
@Override
public PTransformReplacement<PCollection<InputT>, PCollectionTuple> getReplacementTransform(
    AppliedPTransform<PCollection<InputT>, PCollectionTuple, ParDo.MultiOutput<InputT, OutputT>>
        appliedTransform) {
  PCollection<InputT> mainInput =
      PTransformReplacements.getSingletonMainInput(appliedTransform);

  PTransform<PCollection<InputT>, PCollectionTuple> reshuffleThenParDo =
      new PTransform<PCollection<InputT>, PCollectionTuple>() {
        @Override
        public PCollectionTuple expand(PCollection<InputT> input) {
          PCollection<InputT> materialized =
              input.apply("Materialize input", Reshuffle.viaRandomKey());
          return materialized.apply(
              "ParDo with stable input", appliedTransform.getTransform());
        }
      };

  return PTransformReplacement.of(mainInput, reshuffleThenParDo);
}
 
Example 13
Source File: Utils.java    From beam with Apache License 2.0 5 votes vote down vote up
/**
 * Returns the side input views of the given applied transform.
 *
 * <p>Only {@link ParDo.MultiOutput} and {@link ParDo.SingleOutput} carry side inputs here; every
 * other transform yields an empty list. The casts use wildcard type arguments rather than raw
 * types, so the result is a properly typed {@code List<PCollectionView<?>>} with no unchecked
 * conversion.
 *
 * @param appliedTransform the applied transform to inspect
 * @return the transform's side input views, or an empty list if it is not a ParDo
 */
static List<PCollectionView<?>> getSideInputs(AppliedPTransform<?, ?, ?> appliedTransform) {
  PTransform<?, ?> transform = appliedTransform.getTransform();
  if (transform instanceof ParDo.MultiOutput) {
    ParDo.MultiOutput<?, ?> multiParDo = (ParDo.MultiOutput<?, ?>) transform;
    return multiParDo.getSideInputs().values().stream().collect(Collectors.toList());
  } else if (transform instanceof ParDo.SingleOutput) {
    ParDo.SingleOutput<?, ?> singleParDo = (ParDo.SingleOutput<?, ?>) transform;
    return singleParDo.getSideInputs().values().stream().collect(Collectors.toList());
  }
  return Collections.emptyList();
}
 
Example 14
Source File: RequiresStableInputParDoOverrides.java    From beam with Apache License 2.0 5 votes vote down vote up
/**
 * Returns a {@link PTransformOverrideFactory} that inserts a {@link Reshuffle.ViaRandomKey}
 * before a {@link ParDo.MultiOutput} that uses the {@link RequiresStableInput} annotation.
 *
 * @return a new {@code MultiOutputOverrideFactory}; a fresh instance is created on every call
 */
static <InputT, OutputT>
    PTransformOverrideFactory<
            PCollection<InputT>, PCollectionTuple, ParDo.MultiOutput<InputT, OutputT>>
        multiOutputOverrideFactory() {
  return new MultiOutputOverrideFactory<>();
}
 
Example 15
Source File: ParDoTranslation.java    From beam with Apache License 2.0 4 votes vote down vote up
/**
 * Returns the URN for a multi-output ParDo.
 *
 * @param transform not inspected; the same constant is returned for every instance
 * @return {@code PAR_DO_TRANSFORM_URN}
 */
@Override
public String getUrn(ParDo.MultiOutput<?, ?> transform) {
  return PAR_DO_TRANSFORM_URN;
}
 
Example 16
Source File: ParDoMultiOutputTranslatorBatch.java    From beam with Apache License 2.0 4 votes vote down vote up
/**
 * Translates a multi-output ParDo for the Twister2 batch context.
 *
 * <p>The DoFn is wrapped in a {@code DoFnFunction} computed over the input data set, producing
 * {@code RawUnionValue}s. Each output tag is assigned an integer index (main output is 0), and
 * one {@code OutputTagFilter} per output is registered with the context as that output's data
 * set.
 *
 * @param transform the ParDo to translate
 * @param context the translation context supplying inputs, outputs and coders
 * @throws UnsupportedOperationException if the DoFn's process element method is splittable
 * @throws RuntimeException wrapping an {@link IOException} from the main output tag lookup
 */
@Override
public void translateNode(
    ParDo.MultiOutput<InputT, OutputT> transform, Twister2BatchTranslationContext context) {
  DoFn<InputT, OutputT> fn = transform.getFn();
  if (DoFnSignatures.signatureForDoFn(fn).processElement().isSplittable()) {
    throw new UnsupportedOperationException(
        String.format(
            "Not expected to directly translate splittable DoFn, should have been overridden: %s",
            fn));
  }

  BatchTSetImpl<WindowedValue<InputT>> inputTSet =
      context.getInputDataSet(context.getInput(transform));
  WindowingStrategy<?, ?> inputWindowing = context.getInput(transform).getWindowingStrategy();
  Coder<InputT> inputCoder = (Coder<InputT>) context.getInput(transform).getCoder();

  Map<TupleTag<?>, PValue> outputsByTag = context.getOutputs();
  Map<TupleTag<?>, Coder<?>> outputCoders = context.getOutputCoders();

  DoFnSchemaInformation schemaInformation =
      ParDoTranslation.getSchemaInformation(context.getCurrentTransform());
  Map<String, PCollectionView<?>> sideInputMapping =
      ParDoTranslation.getSideInputMapping(context.getCurrentTransform());

  TupleTag<OutputT> mainOutput = transform.getMainOutputTag();
  List<TupleTag<?>> additionalOutputTags =
      new ArrayList<>(transform.getAdditionalOutputTags().getAll());
  // TODO : note change from List to map in sideinputs
  Map<String, PCollectionView<?>> sideInputsById = transform.getSideInputs();

  // Map each side input to its windowing strategy so that the DoFn runner can map
  // main-input windows to side-input windows.
  Map<PCollectionView<?>, WindowingStrategy<?, ?>> sideInputStrategies = new HashMap<>();
  for (PCollectionView<?> view : sideInputsById.values()) {
    sideInputStrategies.put(view, view.getWindowingStrategyInternal());
  }

  TupleTag<?> translatedMainTag;
  try {
    translatedMainTag = ParDoTranslation.getMainOutputTag(context.getCurrentTransform());
  } catch (IOException e) {
    throw new RuntimeException(e);
  }

  // Main output takes index 0; the remaining tags are numbered in key-set iteration order.
  Map<TupleTag<?>, Integer> indexByTag = Maps.newHashMap();
  indexByTag.put(translatedMainTag, 0);
  int nextIndex = 1;
  for (TupleTag<?> tag : outputsByTag.keySet()) {
    if (!indexByTag.containsKey(tag)) {
      indexByTag.put(tag, nextIndex++);
    }
  }

  ComputeTSet<RawUnionValue, Iterator<WindowedValue<InputT>>> unionTSet =
      inputTSet
          .direct()
          .<RawUnionValue>compute(
              new DoFnFunction<OutputT, InputT>(
                  context,
                  fn,
                  inputCoder,
                  outputCoders,
                  additionalOutputTags,
                  inputWindowing,
                  sideInputStrategies,
                  mainOutput,
                  schemaInformation,
                  indexByTag,
                  sideInputMapping));

  // One filtered data set per output, selected by that output's index.
  for (Map.Entry<TupleTag<?>, PValue> taggedOutput : outputsByTag.entrySet()) {
    ComputeTSet<WindowedValue<OutputT>, Iterator<RawUnionValue>> filteredTSet =
        unionTSet.direct().compute(new OutputTagFilter(indexByTag.get(taggedOutput.getKey())));
    context.setOutputDataSet((PCollection) taggedOutput.getValue(), filteredTSet);
  }
}
 
Example 17
Source File: DataflowPipelineTranslator.java    From beam with Apache License 2.0 4 votes vote down vote up
/**
 * Translates a multi-output ParDo into a "ParallelDo" step: registers the step, translates its
 * inputs, outputs and DoFn, and, when the context is FnApi and the DoFn is splittable, adds the
 * restriction/watermark-estimator-state coder as the {@code RESTRICTION_ENCODING} input.
 *
 * @param transform the ParDo to translate
 * @param context the translation context the step is added to
 */
private <InputT, OutputT> void translateMultiHelper(
    ParDo.MultiOutput<InputT, OutputT> transform, TranslationContext context) {
  StepTranslationContext step = context.addStep(transform, "ParallelDo");
  DoFnSchemaInformation schemaInformation =
      ParDoTranslation.getSchemaInformation(context.getCurrentTransform());
  Map<String, PCollectionView<?>> sideInputMapping =
      ParDoTranslation.getSideInputMapping(context.getCurrentTransform());
  Map<TupleTag<?>, Coder<?>> outputCoders =
      context.getOutputs(transform).entrySet().stream()
          .collect(
              Collectors.toMap(
                  Map.Entry::getKey, entry -> ((PCollection) entry.getValue()).getCoder()));

  translateInputs(
      step, context.getInput(transform), transform.getSideInputs().values(), context);
  translateOutputs(context.getOutputs(transform), step);

  String transformId =
      context.getSdkComponents().getPTransformIdOrThrow(context.getCurrentTransform());
  translateFn(
      step,
      transformId,
      transform.getFn(),
      context.getInput(transform).getWindowingStrategy(),
      transform.getSideInputs().values(),
      context.getInput(transform).getCoder(),
      context,
      transform.getMainOutputTag(),
      outputCoders,
      schemaInformation,
      sideInputMapping);

  // TODO: Move this logic into translateFn once the legacy ProcessKeyedElements is
  // removed.
  if (!context.isFnApi()) {
    return;
  }
  DoFnSignature fnSignature = DoFnSignatures.signatureForDoFn(transform.getFn());
  if (!fnSignature.processElement().isSplittable()) {
    return;
  }

  DoFnInvoker<?, ?> invoker = DoFnInvokers.invokerFor(transform.getFn());
  Coder<?> restrictionAndStateCoder =
      KvCoder.of(
          invoker.invokeGetRestrictionCoder(
              context.getInput(transform).getPipeline().getCoderRegistry()),
          invoker.invokeGetWatermarkEstimatorStateCoder(
              context.getInput(transform).getPipeline().getCoderRegistry()));
  step.addInput(
      PropertyNames.RESTRICTION_ENCODING, translateCoder(restrictionAndStateCoder, context));
}
 
Example 18
Source File: StepTransform.java    From kettle-beam with Apache License 2.0 4 votes vote down vote up
/**
 * Applies the step function to the input rows as a multi-output {@link ParDo}: one main output
 * tag plus one additional output tag per entry in {@code targetSteps}.
 *
 * @param input the rows to process
 * @return the tuple of outputs; the collections are stored under the ids produced by
 *     {@code KettleBeamUtil.createMainOutputTupleId} and {@code KettleBeamUtil.createTargetTupleId}
 * @throws RuntimeException wrapping any exception raised while building or applying the
 *     transform, after {@code numErrors} is incremented and the error is logged
 */
@Override public PCollectionTuple expand( PCollection<KettleRow> input ) {
  try {
    // Initialize Kettle with the plugin classes (intended to happen only once per node/vm)
    //
    BeamKettle.init( stepPluginClasses, xpPluginClasses );

    // Create the main output tag and a TupleTag list for the target steps...
    //
    TupleTag<KettleRow> mainOutputTupleTag = new TupleTag<KettleRow>( KettleBeamUtil.createMainOutputTupleId( stepname ) ) {
    };

    // Start from the empty list so that a step without targets needs no special case.
    //
    TupleTagList targetTupleTagList = TupleTagList.empty();
    for ( String targetStep : targetSteps ) {
      String tupleId = KettleBeamUtil.createTargetTupleId( stepname, targetStep );
      TupleTag<KettleRow> tupleTag = new TupleTag<KettleRow>( tupleId ) {
      };
      targetTupleTagList = targetTupleTagList.and( tupleTag );
    }

    // Create a new step function, initializes the step
    //
    StepFn stepFn = new StepFn( variableValues, metastoreJson, stepPluginClasses, xpPluginClasses,
      stepname, stepPluginId, stepMetaInterfaceXml, inputRowMetaJson, inputStep,
      targetSteps, infoSteps, infoRowMetaJsons );

    // The actual step functionality
    //
    ParDo.SingleOutput<KettleRow, KettleRow> parDoStepFn = ParDo.of( stepFn );

    // Add optional side inputs...
    //
    if ( infoCollectionViews.size() > 0 ) {
      parDoStepFn = parDoStepFn.withSideInputs( infoCollectionViews );
    }

    // Specify the main output and targeted outputs
    //
    ParDo.MultiOutput<KettleRow, KettleRow> multiOutput = parDoStepFn.withOutputTags( mainOutputTupleTag, targetTupleTagList );

    // Apply the multi output parallel do step function to the main input stream.
    // The resulting tuple holds every output; retrieve the PCollections using the
    // correct tuple ID (see KettleBeamUtil.createTargetTupleId()).
    //
    return input.apply( multiOutput );
  } catch ( Exception e ) {
    numErrors.inc();
    LOG.error( "Error transforming data in step '" + stepname + "'", e );
    throw new RuntimeException( "Error transforming data in step", e );
  }

}
 
Example 19
Source File: StepBatchTransform.java    From kettle-beam with Apache License 2.0 4 votes vote down vote up
/**
 * Applies the batching step function to the input rows as a multi-output {@link ParDo}: one main
 * output tag plus one additional output tag per entry in {@code targetSteps}.
 *
 * @param input the rows to process
 * @return the tuple of outputs; the collections are stored under the ids produced by
 *     {@code KettleBeamUtil.createMainOutputTupleId} and {@code KettleBeamUtil.createTargetTupleId}
 * @throws RuntimeException wrapping any exception raised while building or applying the
 *     transform, after {@code numErrors} is incremented and the error is logged
 */
@Override public PCollectionTuple expand( PCollection<KettleRow> input ) {
  try {
    // Initialize Kettle with the plugin classes (intended to happen only once per node/vm)
    //
    BeamKettle.init( stepPluginClasses, xpPluginClasses );

    // Create the main output tag and a TupleTag list for the target steps...
    //
    TupleTag<KettleRow> mainOutputTupleTag = new TupleTag<KettleRow>( KettleBeamUtil.createMainOutputTupleId( stepname ) ) {
    };

    // Start from the empty list so that a step without targets needs no special case.
    //
    TupleTagList targetTupleTagList = TupleTagList.empty();
    for ( String targetStep : targetSteps ) {
      String tupleId = KettleBeamUtil.createTargetTupleId( stepname, targetStep );
      TupleTag<KettleRow> tupleTag = new TupleTag<KettleRow>( tupleId ) {
      };
      targetTupleTagList = targetTupleTagList.and( tupleTag );
    }

    // Create a new step function, initializes the step
    //
    StepBatchFn stepBatchFn = new StepBatchFn( variableValues, metastoreJson, stepPluginClasses, xpPluginClasses,
      stepname, stepPluginId, stepMetaInterfaceXml, inputRowMetaJson, inputStep,
      targetSteps, infoSteps, infoRowMetaJsons );

    // The actual step functionality
    //
    ParDo.SingleOutput<KettleRow, KettleRow> parDoStepFn = ParDo.of( stepBatchFn );

    // Add optional side inputs...
    //
    if ( infoCollectionViews.size() > 0 ) {
      parDoStepFn = parDoStepFn.withSideInputs( infoCollectionViews );
    }

    // Specify the main output and targeted outputs
    //
    ParDo.MultiOutput<KettleRow, KettleRow> multiOutput = parDoStepFn.withOutputTags( mainOutputTupleTag, targetTupleTagList );

    // Apply the multi output parallel do step function to the main input stream.
    // The resulting tuple holds every output; retrieve the PCollections using the
    // correct tuple ID (see KettleBeamUtil.createTargetTupleId()).
    //
    return input.apply( multiOutput );
  } catch ( Exception e ) {
    numErrors.inc();
    LOG.error( "Error transforming data in step '" + stepname + "'", e );
    throw new RuntimeException( "Error transforming data in step", e );
  }

}
 
Example 20
Source File: TransformBatchTransform.java    From hop with Apache License 2.0 4 votes vote down vote up
/**
 * Applies the batching transform function to the input rows as a multi-output {@link ParDo}: one
 * main output tag plus one additional output tag per entry in {@code targetSteps}.
 *
 * @param input the rows to process
 * @return the tuple of outputs; the collections are stored under the ids produced by
 *     {@code HopBeamUtil.createMainOutputTupleId} and {@code HopBeamUtil.createTargetTupleId}
 * @throws RuntimeException wrapping any exception raised while building or applying the
 *     transform, after {@code numErrors} is incremented and the error is logged
 */
@Override public PCollectionTuple expand( PCollection<HopRow> input ) {
  try {
    // Initialize Hop with the plugin classes (intended to happen only once per node/vm)
    //
    BeamHop.init( transformPluginClasses, xpPluginClasses );

    // Create the main output tag and a TupleTag list for the target transforms...
    //
    TupleTag<HopRow> mainOutputTupleTag = new TupleTag<HopRow>( HopBeamUtil.createMainOutputTupleId( transformName ) ) {
    };

    // Start from the empty list so that a transform without targets needs no special case.
    //
    TupleTagList targetTupleTagList = TupleTagList.empty();
    for ( String targetStep : targetSteps ) {
      String tupleId = HopBeamUtil.createTargetTupleId( transformName, targetStep );
      TupleTag<HopRow> tupleTag = new TupleTag<HopRow>( tupleId ) {
      };
      targetTupleTagList = targetTupleTagList.and( tupleTag );
    }

    // Create a new transform function, initializes the transform
    //
    StepBatchFn stepBatchFn = new StepBatchFn( variableValues, metastoreJson, transformPluginClasses, xpPluginClasses,
      transformName, stepPluginId, stepMetaInterfaceXml, inputRowMetaJson, inputStep,
      targetSteps, infoSteps, infoRowMetaJsons );

    // The actual transform functionality
    //
    ParDo.SingleOutput<HopRow, HopRow> parDoStepFn = ParDo.of( stepBatchFn );

    // Add optional side inputs...
    //
    if ( infoCollectionViews.size() > 0 ) {
      parDoStepFn = parDoStepFn.withSideInputs( infoCollectionViews );
    }

    // Specify the main output and targeted outputs
    //
    ParDo.MultiOutput<HopRow, HopRow> multiOutput = parDoStepFn.withOutputTags( mainOutputTupleTag, targetTupleTagList );

    // Apply the multi output parallel do transform function to the main input stream.
    // The resulting tuple holds every output; retrieve the PCollections using the
    // correct tuple ID (see HopBeamUtil.createTargetTupleId()).
    //
    return input.apply( multiOutput );
  } catch ( Exception e ) {
    numErrors.inc();
    LOG.error( "Error transforming data in transform '" + transformName + "'", e );
    throw new RuntimeException( "Error transforming data in transform", e );
  }

}