Java Code Examples for org.apache.beam.sdk.transforms.ParDo#SingleOutput

The following examples show how to use org.apache.beam.sdk.transforms.ParDo#SingleOutput. Each snippet is drawn from an open-source project; the source file and license are noted above each example.
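As a quick orientation before the examples: ParDo.of(DoFn) returns a ParDo.SingleOutput, which is itself a PTransform that can be stored in a variable, configured further (for example with withSideInputs), and applied later. A minimal sketch, with an illustrative DoFn and class name that are not taken from any of the projects below:

import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.transforms.Create;
import org.apache.beam.sdk.transforms.DoFn;
import org.apache.beam.sdk.transforms.ParDo;
import org.apache.beam.sdk.values.PCollection;

public class SingleOutputSketch {
  public static void main(String[] args) {
    Pipeline pipeline = Pipeline.create(PipelineOptionsFactory.create());

    // ParDo.of(...) yields a ParDo.SingleOutput<String, Integer>:
    // a one-output PTransform wrapping the DoFn.
    ParDo.SingleOutput<String, Integer> lengths =
        ParDo.of(
            new DoFn<String, Integer>() {
              @ProcessElement
              public void processElement(ProcessContext c) {
                c.output(c.element().length());
              }
            });

    PCollection<Integer> out =
        pipeline.apply(Create.of("a", "bb", "ccc")).apply("Lengths", lengths);

    pipeline.run().waitUntilFinish();
  }
}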
Example 1
Source File: RequiresStableInputParDoOverrides.java    From beam with Apache License 2.0
@Override
public PTransformReplacement<PCollection<InputT>, PCollection<OutputT>> getReplacementTransform(
    AppliedPTransform<
            PCollection<InputT>, PCollection<OutputT>, ParDo.SingleOutput<InputT, OutputT>>
        appliedTransform) {
  return PTransformReplacement.of(
      PTransformReplacements.getSingletonMainInput(appliedTransform),
      new PTransform<PCollection<InputT>, PCollection<OutputT>>() {
        @Override
        public PCollection<OutputT> expand(PCollection<InputT> input) {
          return input
              .apply("Materialize input", Reshuffle.viaRandomKey())
              .apply("ParDo with stable input", appliedTransform.getTransform());
        }
      });
}
 
Example 2
Source File: PTransformMatchers.java    From beam with Apache License 2.0
/**
 * A {@link PTransformMatcher} that matches a {@link ParDo.SingleOutput} containing a {@link DoFn}
 * that is splittable, as signified by {@link ProcessElementMethod#isSplittable()}.
 */
public static PTransformMatcher splittableParDoSingle() {
  return new PTransformMatcher() {
    @Override
    public boolean matches(AppliedPTransform<?, ?, ?> application) {
      PTransform<?, ?> transform = application.getTransform();
      if (transform instanceof ParDo.SingleOutput) {
        DoFn<?, ?> fn = ((ParDo.SingleOutput<?, ?>) transform).getFn();
        DoFnSignature signature = DoFnSignatures.signatureForDoFn(fn);
        return signature.processElement().isSplittable();
      }
      return false;
    }

    @Override
    public String toString() {
      return MoreObjects.toStringHelper("SplittableParDoSingleMatcher").toString();
    }
  };
}
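The reflective check this matcher performs can be exercised on its own, outside any pipeline surgery. A minimal sketch, using an illustrative DoFn that is not splittable (its @ProcessElement method takes no RestrictionTracker parameter), so the check prints false:

import org.apache.beam.sdk.transforms.DoFn;
import org.apache.beam.sdk.transforms.ParDo;
import org.apache.beam.sdk.transforms.reflect.DoFnSignature;
import org.apache.beam.sdk.transforms.reflect.DoFnSignatures;

public class SplittableCheckSketch {
  public static void main(String[] args) {
    ParDo.SingleOutput<String, Integer> transform =
        ParDo.of(
            new DoFn<String, Integer>() {
              @ProcessElement
              public void processElement(ProcessContext c) {
                c.output(c.element().length());
              }
            });

    // The same introspection the matcher does: pull the DoFn out of the
    // SingleOutput and inspect its reflective signature.
    DoFnSignature signature = DoFnSignatures.signatureForDoFn(transform.getFn());
    System.out.println(signature.processElement().isSplittable()); // false
  }
}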
 
Example 3
Source File: NexmarkUtils.java    From beam with Apache License 2.0
/** Return a transform to keep the CPU busy for the given number of milliseconds on every record. */
public static <T> ParDo.SingleOutput<T, T> cpuDelay(String name, final long delayMs) {
  return ParDo.of(
      new DoFn<T, T>() {
        @ProcessElement
        public void processElement(ProcessContext c) {
          long now = System.currentTimeMillis();
          long end = now + delayMs;
          while (now < end) {
            // Find plaintext which hashes to HASH in lowest MASK bits.
            // Values chosen to roughly take 1ms on typical workstation.
            long p = INIT_PLAINTEXT;
            while (true) {
              long t = Hashing.murmur3_128().hashLong(p).asLong();
              if ((t & MASK) == (HASH & MASK)) {
                break;
              }
              p++;
            }
            now = System.currentTimeMillis();
          }
          c.output(c.element());
        }
      });
}
 
Example 4
Source File: AnnotateImages.java    From beam with Apache License 2.0
/**
 * Applies all necessary transforms to call the Vision API. In order to group requests into
 * batches, we assign keys to the requests, as {@link GroupIntoBatches} works only on {@link KV}s.
 */
@Override
public PCollection<List<AnnotateImageResponse>> expand(PCollection<T> input) {
  ParDo.SingleOutput<T, AnnotateImageRequest> inputToRequestMapper;
  if (contextSideInput != null) {
    inputToRequestMapper =
        ParDo.of(new MapInputToRequest(contextSideInput)).withSideInputs(contextSideInput);
  } else {
    inputToRequestMapper = ParDo.of(new MapInputToRequest(null));
  }
  return input
      .apply(inputToRequestMapper)
      .apply(
          WithKeys.of(
                  (SerializableFunction<AnnotateImageRequest, Integer>)
                      ignored -> new Random().nextInt(desiredRequestParallelism))
              .withKeyType(TypeDescriptors.integers()))
      .apply(GroupIntoBatches.ofSize(batchSize))
      .apply(ParDo.of(new PerformImageAnnotation()));
}
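The batching trick in this expand method is worth seeing in isolation: GroupIntoBatches accepts only keyed input, so small random keys are attached purely to create grouping parallelism. A standalone sketch of that pattern; the class name, key range of 4, and batch size of 2 are all illustrative:

import java.util.Random;
import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.transforms.Create;
import org.apache.beam.sdk.transforms.GroupIntoBatches;
import org.apache.beam.sdk.transforms.SerializableFunction;
import org.apache.beam.sdk.transforms.WithKeys;
import org.apache.beam.sdk.values.KV;
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.values.TypeDescriptors;

public class GroupIntoBatchesSketch {
  public static void main(String[] args) {
    Pipeline p = Pipeline.create(PipelineOptionsFactory.create());
    int parallelism = 4; // number of distinct keys, hence of parallel batch streams

    PCollection<KV<Integer, Iterable<String>>> batches =
        p.apply(Create.of("a", "b", "c", "d", "e"))
            // GroupIntoBatches works only on KVs, so assign an arbitrary
            // small key; withKeyType is needed because the lambda's key
            // type cannot be inferred.
            .apply(
                WithKeys.of(
                        (SerializableFunction<String, Integer>)
                            ignored -> new Random().nextInt(parallelism))
                    .withKeyType(TypeDescriptors.integers()))
            .apply(GroupIntoBatches.ofSize(2));

    // Downstream processing of each batch would go here.
    p.run().waitUntilFinish();
  }
}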
 
Example 5
Source File: PrimitiveParDoSingleFactoryTest.java    From beam with Apache License 2.0
@Test
public void getReplacementTransformGetFn() {
  DoFn<Integer, Long> originalFn = new ToLongFn();
  ParDo.SingleOutput<Integer, Long> originalTransform = ParDo.of(originalFn);
  PCollection<? extends Integer> input = pipeline.apply(Create.of(1, 2, 3));
  AppliedPTransform<
          PCollection<? extends Integer>, PCollection<Long>, ParDo.SingleOutput<Integer, Long>>
      application =
          AppliedPTransform.of(
              "original",
              input.expand(),
              input.apply(originalTransform).expand(),
              originalTransform,
              pipeline);

  PTransformReplacement<PCollection<? extends Integer>, PCollection<Long>> replacementTransform =
      factory.getReplacementTransform(application);
  ParDoSingle<Integer, Long> parDoSingle =
      (ParDoSingle<Integer, Long>) replacementTransform.getTransform();

  assertThat(parDoSingle.getFn(), equalTo(originalTransform.getFn()));
  assertThat(parDoSingle.getFn(), equalTo(originalFn));
}
 
Example 6
Source File: NexmarkUtils.java    From beam with Apache License 2.0
/** Return a transform to log each element, passing it through unchanged. */
public static <T> ParDo.SingleOutput<T, T> log(final String name) {
  return ParDo.of(
      new DoFn<T, T>() {
        @ProcessElement
        public void processElement(ProcessContext c) {
          LOG.info("{}: {}", name, c.element());
          c.output(c.element());
        }
      });
}
 
Example 7
Source File: SparkPipelineStateTest.java    From beam with Apache License 2.0
private ParDo.SingleOutput<String, String> printParDo(final String prefix) {
  return ParDo.of(
      new DoFn<String, String>() {

        @ProcessElement
        public void processElement(final ProcessContext c) {
          System.out.println(prefix + " " + c.element());
        }
      });
}
 
Example 8
Source File: SnowflakeIO.java    From beam with Apache License 2.0
private ParDo.SingleOutput<Object, Object> copyToTable(
    SnowflakeService snowflakeService, String stagingBucketDir) {
  return ParDo.of(
      new CopyToTableFn<>(
          getDataSourceProviderFn(),
          getTable(),
          getQuery(),
          stagingBucketDir,
          getStorageIntegrationName(),
          getWriteDisposition(),
          snowflakeService));
}
 
Example 9
Source File: NexmarkUtils.java    From beam with Apache License 2.0
/** Return a transform to cast each element to {@link KnownSize}. */
private static <T extends KnownSize> ParDo.SingleOutput<T, KnownSize> castToKnownSize() {
  return ParDo.of(
      new DoFn<T, KnownSize>() {
        @ProcessElement
        public void processElement(ProcessContext c) {
          c.output(c.element());
        }
      });
}
 
Example 10
Source File: NexmarkUtils.java    From beam with Apache License 2.0
/** Return a transform to make explicit the timestamp of each element. */
public static <T> ParDo.SingleOutput<T, TimestampedValue<T>> stamp(String name) {
  return ParDo.of(
      new DoFn<T, TimestampedValue<T>>() {
        @ProcessElement
        public void processElement(ProcessContext c) {
          c.output(TimestampedValue.of(c.element(), c.timestamp()));
        }
      });
}
 
Example 11
Source File: NexmarkUtils.java    From beam with Apache License 2.0
/** Return a transform to pass events through unchanged, counting them as they go by. */
public static ParDo.SingleOutput<Event, Event> snoop(final String name) {
  return ParDo.of(
      new DoFn<Event, Event>() {
        final Counter eventCounter = Metrics.counter(name, "events");
        final Counter newPersonCounter = Metrics.counter(name, "newPersons");
        final Counter newAuctionCounter = Metrics.counter(name, "newAuctions");
        final Counter bidCounter = Metrics.counter(name, "bids");
        final Counter endOfStreamCounter = Metrics.counter(name, "endOfStream");

        @ProcessElement
        public void processElement(ProcessContext c) {
          eventCounter.inc();
          if (c.element().newPerson != null) {
            newPersonCounter.inc();
          } else if (c.element().newAuction != null) {
            newAuctionCounter.inc();
          } else if (c.element().bid != null) {
            bidCounter.inc();
          } else {
            endOfStreamCounter.inc();
          }
          info("%s snooping element %s", name, c.element());
          c.output(c.element());
        }
      });
}
 
Example 12
Source File: TalendIOTest.java    From component-runtime with Apache License 2.0
private ParDo.SingleOutput<Sample, Record> toRecord() {
    return ParDo.of(new DoFn<Sample, Record>() {

        @ProcessElement
        public void toData(final ProcessContext sample) {
            final Sample element = sample.element();
            final RecordBuilderFactory builderFactory = new AvroRecordBuilderFactoryProvider().apply(null);
            sample.output(builderFactory.newRecordBuilder().withString("data", element.getData()).build());
        }
    });
}
 
Example 13
Source File: StructuredStreamingPipelineStateTest.java    From beam with Apache License 2.0
private ParDo.SingleOutput<String, String> printParDo(final String prefix) {
  return ParDo.of(
      new DoFn<String, String>() {

        @ProcessElement
        public void processElement(final ProcessContext c) {
          System.out.println(prefix + " " + c.element());
        }
      });
}
 
Example 14
Source File: PrimitiveParDoSingleFactoryTest.java    From beam with Apache License 2.0
@Test
public void getReplacementTransformGetSideInputs() {
  PCollectionView<Long> sideLong =
      pipeline
          .apply("LongSideInputVals", Create.of(-1L, -2L, -4L))
          .apply("SideLongView", Sum.longsGlobally().asSingletonView());
  PCollectionView<List<String>> sideStrings =
      pipeline
          .apply("StringSideInputVals", Create.of("foo", "bar", "baz"))
          .apply("SideStringsView", View.asList());
  ParDo.SingleOutput<Integer, Long> originalTransform =
      ParDo.of(new ToLongFn()).withSideInputs(sideLong, sideStrings);

  PCollection<? extends Integer> input = pipeline.apply(Create.of(1, 2, 3));
  AppliedPTransform<
          PCollection<? extends Integer>, PCollection<Long>, ParDo.SingleOutput<Integer, Long>>
      application =
          AppliedPTransform.of(
              "original",
              input.expand(),
              input.apply(originalTransform).expand(),
              originalTransform,
              pipeline);

  PTransformReplacement<PCollection<? extends Integer>, PCollection<Long>> replacementTransform =
      factory.getReplacementTransform(application);
  ParDoSingle<Integer, Long> parDoSingle =
      (ParDoSingle<Integer, Long>) replacementTransform.getTransform();
  assertThat(parDoSingle.getSideInputs().values(), containsInAnyOrder(sideStrings, sideLong));
}
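Side inputs are attached as part of a SingleOutput's configuration and read back inside the DoFn via c.sideInput; that configuration is exactly what this test asserts survives replacement. A minimal usage sketch with illustrative names:

import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.transforms.Create;
import org.apache.beam.sdk.transforms.DoFn;
import org.apache.beam.sdk.transforms.ParDo;
import org.apache.beam.sdk.transforms.Sum;
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.values.PCollectionView;

public class SideInputSketch {
  public static void main(String[] args) {
    Pipeline p = Pipeline.create(PipelineOptionsFactory.create());

    // A singleton view holding the sum of a small collection.
    final PCollectionView<Long> totalView =
        p.apply("Nums", Create.of(1L, 2L, 3L))
            .apply(Sum.longsGlobally().asSingletonView());

    // withSideInputs makes the view part of the SingleOutput's configuration.
    ParDo.SingleOutput<String, String> withTotal =
        ParDo.of(
                new DoFn<String, String>() {
                  @ProcessElement
                  public void processElement(ProcessContext c) {
                    c.output(c.element() + ":" + c.sideInput(totalView));
                  }
                })
            .withSideInputs(totalView);

    PCollection<String> out = p.apply("Words", Create.of("a", "b")).apply(withTotal);
    p.run().waitUntilFinish();
  }
}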
 
Example 15
Source File: Utils.java    From beam with Apache License 2.0
static List<PCollectionView<?>> getSideInputs(AppliedPTransform<?, ?, ?> appliedTransform) {
  PTransform<?, ?> transform = appliedTransform.getTransform();
  if (transform instanceof ParDo.MultiOutput) {
    ParDo.MultiOutput multiParDo = (ParDo.MultiOutput) transform;
    return (List) multiParDo.getSideInputs().values().stream().collect(Collectors.toList());
  } else if (transform instanceof ParDo.SingleOutput) {
    ParDo.SingleOutput singleParDo = (ParDo.SingleOutput) transform;
    return (List) singleParDo.getSideInputs().values().stream().collect(Collectors.toList());
  }
  return Collections.emptyList();
}
 
Example 16
Source File: TransformTransform.java    From hop with Apache License 2.0
@Override public PCollectionTuple expand( PCollection<HopRow> input ) {
  try {
    // Only initialize once on this node/vm
    //
    BeamHop.init( transformPluginClasses, xpPluginClasses );

    // Similarly for the output: create a TupleTag list for the target transforms...
    //
    TupleTag<HopRow> mainOutputTupleTag = new TupleTag<HopRow>( HopBeamUtil.createMainOutputTupleId( transformName ) ) {
    };
    List<TupleTag<HopRow>> targetTupleTags = new ArrayList<>();
    TupleTagList targetTupleTagList = null;
    for ( String targetStep : targetSteps ) {
      String tupleId = HopBeamUtil.createTargetTupleId( transformName, targetStep );
      TupleTag<HopRow> tupleTag = new TupleTag<HopRow>( tupleId ) {
      };
      targetTupleTags.add( tupleTag );
      if ( targetTupleTagList == null ) {
        targetTupleTagList = TupleTagList.of( tupleTag );
      } else {
        targetTupleTagList = targetTupleTagList.and( tupleTag );
      }
    }
    if ( targetTupleTagList == null ) {
      targetTupleTagList = TupleTagList.empty();
    }

    // Create a new transform function; it initializes the transform
    //
    StepFn stepFn = new StepFn( variableValues, metastoreJson, transformPluginClasses, xpPluginClasses,
      transformName, stepPluginId, stepMetaInterfaceXml, inputRowMetaJson, inputStep,
      targetSteps, infoSteps, infoRowMetaJsons );

    // The actual transform functionality
    //
    ParDo.SingleOutput<HopRow, HopRow> parDoStepFn = ParDo.of( stepFn );

    // Add optional side inputs...
    //
    if ( infoCollectionViews.size() > 0 ) {
      parDoStepFn = parDoStepFn.withSideInputs( infoCollectionViews );
    }

    // Specify the main output and targeted outputs
    //
    ParDo.MultiOutput<HopRow, HopRow> multiOutput = parDoStepFn.withOutputTags( mainOutputTupleTag, targetTupleTagList );

    // Apply the multi output parallel do transform function to the main input stream
    //
    PCollectionTuple collectionTuple = input.apply( multiOutput );

    // The tuple contains everything we need; just make sure to retrieve the
    // PCollections using the correct tuple ID (see HopBeamUtil.createTargetTupleId()).
    //
    return collectionTuple;
  } catch ( Exception e ) {
    numErrors.inc();
    LOG.error( "Error transforming data in transform '" + transformName + "'", e );
    throw new RuntimeException( "Error transforming data in transform", e );
  }

}
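The pivotal step above is withOutputTags, which converts the ParDo.SingleOutput into a ParDo.MultiOutput whose application yields a PCollectionTuple. A stripped-down sketch of that conversion; the DoFn, tag names, and routing logic are illustrative:

import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.transforms.Create;
import org.apache.beam.sdk.transforms.DoFn;
import org.apache.beam.sdk.transforms.ParDo;
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.values.PCollectionTuple;
import org.apache.beam.sdk.values.TupleTag;
import org.apache.beam.sdk.values.TupleTagList;

public class MultiOutputSketch {
  public static void main(String[] args) {
    Pipeline p = Pipeline.create(PipelineOptionsFactory.create());

    // Anonymous subclasses so the tags carry reified type information.
    final TupleTag<String> mainTag = new TupleTag<String>("main") {};
    final TupleTag<String> oddLengthTag = new TupleTag<String>("oddLength") {};

    ParDo.SingleOutput<String, String> single =
        ParDo.of(
            new DoFn<String, String>() {
              @ProcessElement
              public void processElement(ProcessContext c) {
                if (c.element().length() % 2 == 1) {
                  c.output(oddLengthTag, c.element()); // routed to the extra output
                } else {
                  c.output(c.element()); // routed to the main output
                }
              }
            });

    // The conversion: SingleOutput + output tags = MultiOutput.
    ParDo.MultiOutput<String, String> multi =
        single.withOutputTags(mainTag, TupleTagList.of(oddLengthTag));

    PCollectionTuple tuple = p.apply(Create.of("a", "bb", "ccc")).apply(multi);
    PCollection<String> main = tuple.get(mainTag);
    PCollection<String> odd = tuple.get(oddLengthTag);

    p.run().waitUntilFinish();
  }
}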
 
Example 17
Source File: BatchStatefulParDoOverrides.java    From beam with Apache License 2.0
ParDo.SingleOutput<KV<K, InputT>, OutputT> getOriginalParDo() {
  return originalParDo;
}
 
Example 18
Source File: SplittableParDoOverrides.java    From beam with Apache License 2.0
public ParDoSingleViaMulti(
    DataflowRunner ignored, ParDo.SingleOutput<InputT, OutputT> original) {
  this.original = original;
}
 
Example 19
Source File: StepTransform.java    From kettle-beam with Apache License 2.0
@Override public PCollectionTuple expand( PCollection<KettleRow> input ) {
  try {
    // Only initialize once on this node/vm
    //
    BeamKettle.init( stepPluginClasses, xpPluginClasses );

    // Similarly for the output: create a TupleTag list for the target steps...
    //
    TupleTag<KettleRow> mainOutputTupleTag = new TupleTag<KettleRow>( KettleBeamUtil.createMainOutputTupleId( stepname ) ) {
    };
    List<TupleTag<KettleRow>> targetTupleTags = new ArrayList<>();
    TupleTagList targetTupleTagList = null;
    for ( String targetStep : targetSteps ) {
      String tupleId = KettleBeamUtil.createTargetTupleId( stepname, targetStep );
      TupleTag<KettleRow> tupleTag = new TupleTag<KettleRow>( tupleId ) {
      };
      targetTupleTags.add( tupleTag );
      if ( targetTupleTagList == null ) {
        targetTupleTagList = TupleTagList.of( tupleTag );
      } else {
        targetTupleTagList = targetTupleTagList.and( tupleTag );
      }
    }
    if ( targetTupleTagList == null ) {
      targetTupleTagList = TupleTagList.empty();
    }

    // Create a new step function; it initializes the step
    //
    StepFn stepFn = new StepFn( variableValues, metastoreJson, stepPluginClasses, xpPluginClasses,
      stepname, stepPluginId, stepMetaInterfaceXml, inputRowMetaJson, inputStep,
      targetSteps, infoSteps, infoRowMetaJsons );

    // The actual step functionality
    //
    ParDo.SingleOutput<KettleRow, KettleRow> parDoStepFn = ParDo.of( stepFn );

    // Add optional side inputs...
    //
    if ( infoCollectionViews.size() > 0 ) {
      parDoStepFn = parDoStepFn.withSideInputs( infoCollectionViews );
    }

    // Specify the main output and targeted outputs
    //
    ParDo.MultiOutput<KettleRow, KettleRow> multiOutput = parDoStepFn.withOutputTags( mainOutputTupleTag, targetTupleTagList );

    // Apply the multi output parallel do step function to the main input stream
    //
    PCollectionTuple collectionTuple = input.apply( multiOutput );

    // The tuple contains everything we need; just make sure to retrieve the
    // PCollections using the correct tuple ID (see KettleBeamUtil.createTargetTupleId()).
    //
    return collectionTuple;
  } catch ( Exception e ) {
    numErrors.inc();
    LOG.error( "Error transforming data in step '" + stepname + "'", e );
    throw new RuntimeException( "Error transforming data in step", e );
  }

}
 
Example 20
Source File: TransformBatchTransform.java    From hop with Apache License 2.0
@Override public PCollectionTuple expand( PCollection<HopRow> input ) {
  try {
    // Only initialize once on this node/vm
    //
    BeamHop.init( transformPluginClasses, xpPluginClasses );

    // Similarly for the output: create a TupleTag list for the target transforms...
    //
    TupleTag<HopRow> mainOutputTupleTag = new TupleTag<HopRow>( HopBeamUtil.createMainOutputTupleId( transformName ) ) {
    };
    List<TupleTag<HopRow>> targetTupleTags = new ArrayList<>();
    TupleTagList targetTupleTagList = null;
    for ( String targetStep : targetSteps ) {
      String tupleId = HopBeamUtil.createTargetTupleId( transformName, targetStep );
      TupleTag<HopRow> tupleTag = new TupleTag<HopRow>( tupleId ) {
      };
      targetTupleTags.add( tupleTag );
      if ( targetTupleTagList == null ) {
        targetTupleTagList = TupleTagList.of( tupleTag );
      } else {
        targetTupleTagList = targetTupleTagList.and( tupleTag );
      }
    }
    if ( targetTupleTagList == null ) {
      targetTupleTagList = TupleTagList.empty();
    }

    // Create a new transform function; it initializes the transform
    //
    StepBatchFn stepBatchFn = new StepBatchFn( variableValues, metastoreJson, transformPluginClasses, xpPluginClasses,
      transformName, stepPluginId, stepMetaInterfaceXml, inputRowMetaJson, inputStep,
      targetSteps, infoSteps, infoRowMetaJsons );

    // The actual transform functionality
    //
    ParDo.SingleOutput<HopRow, HopRow> parDoStepFn = ParDo.of( stepBatchFn );

    // Add optional side inputs...
    //
    if ( infoCollectionViews.size() > 0 ) {
      parDoStepFn = parDoStepFn.withSideInputs( infoCollectionViews );
    }

    // Specify the main output and targeted outputs
    //
    ParDo.MultiOutput<HopRow, HopRow> multiOutput = parDoStepFn.withOutputTags( mainOutputTupleTag, targetTupleTagList );

    // Apply the multi output parallel do transform function to the main input stream
    //
    PCollectionTuple collectionTuple = input.apply( multiOutput );

    // The tuple contains everything we need; just make sure to retrieve the
    // PCollections using the correct tuple ID (see HopBeamUtil.createTargetTupleId()).
    //
    return collectionTuple;
  } catch ( Exception e ) {
    numErrors.inc();
    LOG.error( "Error transforming data in transform '" + transformName + "'", e );
    throw new RuntimeException( "Error transforming data in transform", e );
  }

}