Java Code Examples for org.apache.beam.sdk.values.PCollection#getWindowingStrategy()

The following examples show how to use org.apache.beam.sdk.values.PCollection#getWindowingStrategy(). They are drawn from open-source projects; the source file, originating project, and license are noted above each example.
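Before the runner-specific examples below, here is a minimal, self-contained sketch of the call itself. The pipeline, element type, and one-minute fixed windowing are illustrative only and do not come from any of the projects:

PCollection<String> words =
    pipeline
        .apply(Create.of("a", "b", "c"))
        .apply(Window.<String>into(FixedWindows.of(Duration.standardMinutes(1))));

// The returned strategy bundles the WindowFn, trigger, allowed lateness,
// and accumulation mode that downstream transforms and runners consult.
WindowingStrategy<?, ?> strategy = words.getWindowingStrategy();
WindowFn<?, ?> windowFn = strategy.getWindowFn();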
Example 1
Source File: DataflowPipelineTranslator.java    From beam with Apache License 2.0
private <ElemT, ViewT> void translateTyped(
    View.CreatePCollectionView<ElemT, ViewT> transform, TranslationContext context) {
  StepTranslationContext stepContext =
      context.addStep(transform, "CollectionToSingleton");
  PCollection<ElemT> input = context.getInput(transform);
  stepContext.addInput(PropertyNames.PARALLEL_INPUT, input);
  WindowingStrategy<?, ?> windowingStrategy = input.getWindowingStrategy();
  stepContext.addInput(
      PropertyNames.WINDOWING_STRATEGY,
      byteArrayToJsonString(
          serializeWindowingStrategy(windowingStrategy, context.getPipelineOptions())));
  stepContext.addInput(
      PropertyNames.IS_MERGING_WINDOW_FN,
      !windowingStrategy.getWindowFn().isNonMerging());
  stepContext.addCollectionToSingletonOutput(
      input, PropertyNames.OUTPUT, transform.getView());
}
 
Example 2
Source File: GroupByKeyViaGroupByKeyOnly.java    From beam with Apache License 2.0
@Override
public PCollection<KV<K, Iterable<V>>> expand(PCollection<KV<K, V>> input) {
  WindowingStrategy<?, ?> windowingStrategy = input.getWindowingStrategy();

  return input
      // Group by just the key.
      // Combiner lifting will not happen regardless of the disallowCombinerLifting value.
      // There will be no combiners right after the GroupByKeyOnly because of the two ParDos
      // introduced in here.
      .apply(new GroupByKeyOnly<>())

      // Sort each key's values by timestamp. GroupAlsoByWindow requires
      // its input to be sorted by timestamp.
      .apply(new SortValuesByTimestamp<>())

      // Group each key's values by window, merging windows as needed.
      .apply(new GroupAlsoByWindow<>(windowingStrategy))

      // And update the windowing strategy as appropriate; gbkTransform is the
      // original GroupByKey transform captured by the enclosing class.
      .setWindowingStrategyInternal(gbkTransform.updateWindowingStrategy(windowingStrategy));
}
 
Example 3
Source File: FlinkStreamingTransformTranslators.java    From beam with Apache License 2.0
@Override
boolean canTranslate(
    PTransform<PCollection<KV<K, InputT>>, PCollection<KV<K, OutputT>>> transform,
    FlinkStreamingTranslationContext context) {
  // if we have a merging window strategy and side inputs we cannot
  // translate as a proper combine. We have to group and then run the combine
  // over the final grouped values.
  PCollection<KV<K, InputT>> input = context.getInput(transform);

  @SuppressWarnings("unchecked")
  WindowingStrategy<?, BoundedWindow> windowingStrategy =
      (WindowingStrategy<?, BoundedWindow>) input.getWindowingStrategy();

  return windowingStrategy.getWindowFn().isNonMerging()
      || ((Combine.PerKey) transform).getSideInputs().isEmpty();
}
 
Example 4
Source File: Utils.java    From beam with Apache License 2.0
static WindowingStrategy<?, ?> getWindowingStrategy(AppliedPTransform<?, ?, ?> appliedTransform) {
  // assume that the windowing strategy is the same for all outputs

  Map<TupleTag<?>, PValue> outputs = getOutputs(appliedTransform);

  if (outputs == null || outputs.isEmpty()) {
    throw new IllegalStateException("No outputs defined.");
  }

  PValue taggedValue = outputs.values().iterator().next();
  checkState(
      taggedValue instanceof PCollection,
      "Within ParDo, got a non-PCollection output %s of type %s",
      taggedValue,
      taggedValue.getClass().getSimpleName());
  PCollection<?> coll = (PCollection<?>) taggedValue;
  return coll.getWindowingStrategy();
}
 
Example 5
Source File: BeamCoGBKJoinRel.java    From beam with Apache License 2.0
private <T> void verifySupportedTrigger(PCollection<T> pCollection) {
  WindowingStrategy windowingStrategy = pCollection.getWindowingStrategy();

  if (UNBOUNDED.equals(pCollection.isBounded()) && !triggersOncePerWindow(windowingStrategy)) {
    throw new UnsupportedOperationException(
        "Joining unbounded PCollections is currently only supported for "
            + "non-global windows with triggers that are known to produce output once per window,"
            + "such as the default trigger with zero allowed lateness. "
            + "In these cases Beam can guarantee it joins all input elements once per window. "
            + windowingStrategy
            + " is not supported");
  }
}
 
Example 6
Source File: BeamAggregationRel.java    From beam with Apache License 2.0
/**
 * Performs the same check as {@link GroupByKey}, provides more context in exception.
 *
 * <p>Verifies that the input PCollection is bounded, or that there is windowing/triggering
 * being used. Without this, the watermark (at end of global window) will never be reached.
 *
 * <p>Throws {@link UnsupportedOperationException} if validation fails.
 */
private void validateWindowIsSupported(PCollection<Row> upstream) {
  WindowingStrategy<?, ?> windowingStrategy = upstream.getWindowingStrategy();
  if (windowingStrategy.getWindowFn() instanceof GlobalWindows
      && windowingStrategy.getTrigger() instanceof DefaultTrigger
      && upstream.isBounded() != BOUNDED) {

    throw new UnsupportedOperationException(
        "Please explicitly specify windowing in SQL query using HOP/TUMBLE/SESSION functions "
            + "(default trigger will be used in this case). "
            + "Unbounded input with global windowing and default trigger is not supported "
            + "in Beam SQL aggregations. "
            + "See GroupByKey section in Beam Programming Guide");
  }
}
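
For reference, the windowed form this check steers users toward declares the window in the SQL statement itself. A hedged sketch, assuming an unbounded PCollection<Row> named unboundedRows and illustrative field names f_key and f_timestamp:

PCollection<Row> counts =
    unboundedRows.apply(
        SqlTransform.query(
            "SELECT f_key, COUNT(*) AS cnt FROM PCOLLECTION "
                + "GROUP BY f_key, TUMBLE(f_timestamp, INTERVAL '1' MINUTE)"));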
 
Example 7
Source File: GroupByKeyTranslator.java    From beam with Apache License 2.0
private static <K, InputT, OutputT> void doTranslate(
    PTransform<PCollection<KV<K, InputT>>, PCollection<KV<K, OutputT>>> transform,
    TransformHierarchy.Node node,
    TranslationContext ctx) {
  final PCollection<KV<K, InputT>> input = ctx.getInput(transform);

  final PCollection<KV<K, OutputT>> output = ctx.getOutput(transform);
  final TupleTag<KV<K, OutputT>> outputTag = ctx.getOutputTag(transform);

  @SuppressWarnings("unchecked")
  final WindowingStrategy<?, BoundedWindow> windowingStrategy =
      (WindowingStrategy<?, BoundedWindow>) input.getWindowingStrategy();

  final MessageStream<OpMessage<KV<K, InputT>>> inputStream = ctx.getMessageStream(input);

  final KvCoder<K, InputT> kvInputCoder = (KvCoder<K, InputT>) input.getCoder();
  final Coder<WindowedValue<KV<K, InputT>>> elementCoder = SamzaCoders.of(input);

  final SystemReduceFn<K, InputT, ?, OutputT, BoundedWindow> reduceFn =
      getSystemReduceFn(transform, input.getPipeline(), kvInputCoder);

  final MessageStream<OpMessage<KV<K, OutputT>>> outputStream =
      doTranslateGBK(
          inputStream,
          needRepartition(node, ctx),
          reduceFn,
          windowingStrategy,
          kvInputCoder,
          elementCoder,
          ctx.getTransformFullName(),
          ctx.getTransformId(),
          outputTag,
          input.isBounded());

  ctx.registerMessageStream(output, outputStream);
}
 
Example 8
Source File: ReshuffleOverrideFactory.java    From beam with Apache License 2.0
@Override
public PCollection<KV<K, V>> expand(PCollection<KV<K, V>> input) {
  WindowingStrategy<?, ?> originalStrategy = input.getWindowingStrategy();
  // If the input has already had its windows merged, then the GBK that performed the merge
  // will have set originalStrategy.getWindowFn() to InvalidWindows, causing the GBK contained
  // here to fail. Instead, we install a valid WindowFn that leaves all windows unchanged.
  Window<KV<K, V>> rewindow =
      Window.<KV<K, V>>into(
              new IdentityWindowFn<>(originalStrategy.getWindowFn().windowCoder()))
          .triggering(new ReshuffleTrigger<>())
          .discardingFiredPanes()
          .withTimestampCombiner(TimestampCombiner.EARLIEST)
          .withAllowedLateness(Duration.millis(BoundedWindow.TIMESTAMP_MAX_VALUE.getMillis()));

  return input
      .apply(rewindow)
      .apply(GroupByKey.create())
      // Set the windowing strategy directly, so that it doesn't get counted as the user having
      // set allowed lateness.
      .setWindowingStrategyInternal(originalStrategy)
      .apply(
          "ExpandIterable",
          ParDo.of(
              new DoFn<KV<K, Iterable<V>>, KV<K, V>>() {
                @ProcessElement
                public void processElement(ProcessContext c) {
                  K key = c.element().getKey();
                  for (V value : c.element().getValue()) {
                    c.output(KV.of(key, value));
                  }
                }
              }));
}
 
Example 9
Source File: Flatten.java    From beam with Apache License 2.0
@Override
public PCollection<T> expand(PCollectionList<T> inputs) {
  WindowingStrategy<?, ?> windowingStrategy;
  IsBounded isBounded = IsBounded.BOUNDED;
  if (!inputs.getAll().isEmpty()) {
    windowingStrategy = inputs.get(0).getWindowingStrategy();
    for (PCollection<?> input : inputs.getAll()) {
      WindowingStrategy<?, ?> other = input.getWindowingStrategy();
      if (!windowingStrategy.getWindowFn().isCompatible(other.getWindowFn())) {
        throw new IllegalStateException(
            "Inputs to Flatten had incompatible window windowFns: "
                + windowingStrategy.getWindowFn()
                + ", "
                + other.getWindowFn());
      }

      if (!windowingStrategy.getTrigger().isCompatible(other.getTrigger())) {
        throw new IllegalStateException(
            "Inputs to Flatten had incompatible triggers: "
                + windowingStrategy.getTrigger()
                + ", "
                + other.getTrigger());
      }
      isBounded = isBounded.and(input.isBounded());
    }
  } else {
    windowingStrategy = WindowingStrategy.globalDefault();
  }

  return PCollection.createPrimitiveOutputInternal(
      inputs.getPipeline(),
      windowingStrategy,
      isBounded,
      // Take coder from first collection. If there are none, will be left unspecified.
      inputs.getAll().isEmpty() ? null : inputs.get(0).getCoder());
}
 
Example 10
Source File: GroupByKeyTranslatorBatch.java    From beam with Apache License 2.0
@Override
public void translateNode(GroupByKey<K, V> transform, Twister2BatchTranslationContext context) {
  PCollection<KV<K, V>> input = context.getInput(transform);
  BatchTSetImpl<WindowedValue<KV<K, V>>> inputTTSet = context.getInputDataSet(input);
  final KvCoder<K, V> coder = (KvCoder<K, V>) input.getCoder();
  Coder<K> inputKeyCoder = coder.getKeyCoder();
  WindowingStrategy windowingStrategy = input.getWindowingStrategy();
  WindowFn<KV<K, V>, BoundedWindow> windowFn =
      (WindowFn<KV<K, V>, BoundedWindow>) windowingStrategy.getWindowFn();
  final WindowedValue.WindowedValueCoder<V> wvCoder =
      WindowedValue.FullWindowedValueCoder.of(coder.getValueCoder(), windowFn.windowCoder());
  KeyedTSet<byte[], byte[]> keyedTSet =
      inputTTSet.mapToTuple(new MapToTupleFunction<K, V>(inputKeyCoder, wvCoder));

  // todo add support for a partition function to be specified, this would use
  // todo keyedPartition function instead of KeyedGather
  ComputeTSet<KV<K, Iterable<WindowedValue<V>>>, Iterator<Tuple<byte[], Iterator<byte[]>>>>
      groupedbyKeyTset =
          keyedTSet.keyedGather().map(new ByteToWindowFunction(inputKeyCoder, wvCoder));

  // --- now group also by window.
  SystemReduceFnBuffering reduceFnBuffering = new SystemReduceFnBuffering(coder.getValueCoder());
  ComputeTSet<WindowedValue<KV<K, Iterable<V>>>, Iterable<KV<K, Iterator<WindowedValue<V>>>>>
      outputTset =
          groupedbyKeyTset
              .direct()
              .<WindowedValue<KV<K, Iterable<V>>>>flatmap(
                  new GroupByWindowFunction(
                      windowingStrategy, reduceFnBuffering, context.getOptions()));
  PCollection output = context.getOutput(transform);
  context.setOutputDataSet(output, outputTset);
}
 
Example 11
Source File: PCollectionViewTranslatorBatch.java    From beam with Apache License 2.0
@Override
public void translateNode(
    View.CreatePCollectionView<ElemT, ViewT> transform, Twister2BatchTranslationContext context) {
  BatchTSet<WindowedValue<ElemT>> inputDataSet =
      context.getInputDataSet(context.getInput(transform));
  @SuppressWarnings("unchecked")
  AppliedPTransform<
          PCollection<ElemT>,
          PCollection<ElemT>,
          PTransform<PCollection<ElemT>, PCollection<ElemT>>>
      application =
          (AppliedPTransform<
                  PCollection<ElemT>,
                  PCollection<ElemT>,
                  PTransform<PCollection<ElemT>, PCollection<ElemT>>>)
              context.getCurrentTransform();
  org.apache.beam.sdk.values.PCollectionView<ViewT> input;
  PCollection<ElemT> inputPCol = context.getInput(transform);
  final KvCoder coder = (KvCoder) inputPCol.getCoder();
  Coder inputKeyCoder = coder.getKeyCoder();
  WindowingStrategy windowingStrategy = inputPCol.getWindowingStrategy();
  WindowFn windowFn = windowingStrategy.getWindowFn();
  final WindowedValue.WindowedValueCoder wvCoder =
      WindowedValue.FullWindowedValueCoder.of(coder.getValueCoder(), windowFn.windowCoder());
  BatchTSet<WindowedValue<ElemT>> inputGathered =
      inputDataSet
          .direct()
          .map(new MapToTupleFunction<>(inputKeyCoder, wvCoder))
          .allGather()
          .map(new ByteToWindowFunctionPrimitive(inputKeyCoder, wvCoder));
  try {
    input = CreatePCollectionViewTranslation.getView(application);
  } catch (IOException e) {
    throw new RuntimeException(e);
  }
  context.setSideInputDataSet(input.getTagInternal().getId(), inputGathered);
}
 
Example 12
Source File: BatchStatefulParDoOverrides.java    From beam with Apache License 2.0
@Override
public PCollection<KV<K, Iterable<KV<Instant, WindowedValue<KV<K, V>>>>>> expand(
    PCollection<KV<K, V>> input) {

  WindowingStrategy<?, ?> inputWindowingStrategy = input.getWindowingStrategy();

  // A KvCoder is required since this goes through GBK. Further, WindowedValueCoder
  // is not registered by default, so we explicitly set the relevant coders.
  checkState(
      input.getCoder() instanceof KvCoder,
      "Input to a %s using state requires a %s, but the coder was %s",
      ParDo.class.getSimpleName(),
      KvCoder.class.getSimpleName(),
      input.getCoder());
  KvCoder<K, V> kvCoder = (KvCoder<K, V>) input.getCoder();
  Coder<K> keyCoder = kvCoder.getKeyCoder();
  Coder<? extends BoundedWindow> windowCoder =
      inputWindowingStrategy.getWindowFn().windowCoder();

  return input
      // Stash the original timestamps, etc, for when it is fed to the user's DoFn
      .apply("ReifyWindows", ParDo.of(new ReifyWindowedValueFn<>()))
      .setCoder(
          KvCoder.of(
              keyCoder,
              KvCoder.of(InstantCoder.of(), WindowedValue.getFullCoder(kvCoder, windowCoder))))

      // Group by key and sort by timestamp, dropping windows as they are reified
      .apply("PartitionKeys", new GroupByKeyAndSortValuesOnly<>())

      // The GBKO sets the windowing strategy to the global default
      .setWindowingStrategyInternal(inputWindowingStrategy);
}
 
Example 13
Source File: JetTransformTranslators.java    From beam with Apache License 2.0
@Override
public Vertex translate(
    Pipeline pipeline,
    AppliedPTransform<?, ?, ?> appliedTransform,
    Node node,
    JetTranslationContext context) {
  String transformName = appliedTransform.getFullName();

  PCollection<KV<K, InputT>> input =
      (PCollection<KV<K, InputT>>) Utils.getInput(appliedTransform);
  WindowedValue.WindowedValueCoder<KV<K, InputT>> inputCoder =
      Utils.getWindowedValueCoder(input);
  Map.Entry<TupleTag<?>, PValue> output = Utils.getOutput(appliedTransform);
  Coder outputCoder = Utils.getCoder((PCollection) output.getValue());

  WindowingStrategy<?, ?> windowingStrategy = input.getWindowingStrategy();

  DAGBuilder dagBuilder = context.getDagBuilder();
  String vertexId = dagBuilder.newVertexId(transformName);
  Vertex vertex =
      dagBuilder.addVertex(
          vertexId,
          WindowGroupP.supplier(
              context.getOptions(), inputCoder, outputCoder, windowingStrategy, vertexId));

  dagBuilder.registerEdgeEndPoint(Utils.getTupleTagId(input), vertex);

  String outputEdgeId = Utils.getTupleTagId(output.getValue());
  dagBuilder.registerCollectionOfEdge(outputEdgeId, output.getKey().getId());
  dagBuilder.registerEdgeStartPoint(outputEdgeId, vertex, outputCoder);
  return vertex;
}
 
Example 14
Source File: TransformTranslator.java    From beam with Apache License 2.0
private static <K, InputT, AccumT, OutputT>
    TransformEvaluator<Combine.PerKey<K, InputT, OutputT>> combinePerKey() {
  return new TransformEvaluator<Combine.PerKey<K, InputT, OutputT>>() {
    @Override
    public void evaluate(
        Combine.PerKey<K, InputT, OutputT> transform, EvaluationContext context) {
      final PCollection<KV<K, InputT>> input = context.getInput(transform);
      // serializable arguments to pass.
      final KvCoder<K, InputT> inputCoder =
          (KvCoder<K, InputT>) context.getInput(transform).getCoder();
      @SuppressWarnings("unchecked")
      final CombineWithContext.CombineFnWithContext<InputT, AccumT, OutputT> combineFn =
          (CombineWithContext.CombineFnWithContext<InputT, AccumT, OutputT>)
              CombineFnUtil.toFnWithContext(transform.getFn());
      final WindowingStrategy<?, ?> windowingStrategy = input.getWindowingStrategy();
      final Map<TupleTag<?>, KV<WindowingStrategy<?, ?>, SideInputBroadcast<?>>> sideInputs =
          TranslationUtils.getSideInputs(transform.getSideInputs(), context);
      final SparkCombineFn<KV<K, InputT>, InputT, AccumT, OutputT> sparkCombineFn =
          SparkCombineFn.keyed(
              combineFn, context.getSerializableOptions(), sideInputs, windowingStrategy);
      final Coder<AccumT> vaCoder;
      try {
        vaCoder =
            combineFn.getAccumulatorCoder(
                context.getPipeline().getCoderRegistry(), inputCoder.getValueCoder());
      } catch (CannotProvideCoderException e) {
        throw new IllegalStateException("Could not determine coder for accumulator", e);
      }

      @SuppressWarnings("unchecked")
      JavaRDD<WindowedValue<KV<K, InputT>>> inRdd =
          ((BoundedDataset<KV<K, InputT>>) context.borrowDataset(transform)).getRDD();

      JavaPairRDD<K, SparkCombineFn.WindowedAccumulator<KV<K, InputT>, InputT, AccumT, ?>>
          accumulatePerKey;
      accumulatePerKey =
          GroupCombineFunctions.combinePerKey(
              inRdd,
              sparkCombineFn,
              inputCoder.getKeyCoder(),
              inputCoder.getValueCoder(),
              vaCoder,
              windowingStrategy);

      JavaPairRDD<K, WindowedValue<OutputT>> kwvs =
          SparkCompat.extractOutput(accumulatePerKey, sparkCombineFn);
      JavaRDD<WindowedValue<KV<K, OutputT>>> outRdd =
          kwvs.map(new TranslationUtils.FromPairFunction())
              .map(new TranslationUtils.ToKVByWindowInValueFunction<>());

      context.putDataset(transform, new BoundedDataset<>(outRdd));
    }

    @Override
    public String toNativeString() {
      return "combineByKey(..., new <fn>(), ...)";
    }
  };
}
 
Example 15
Source File: TransformTranslator.java    From beam with Apache License 2.0
private static <InputT, AccumT, OutputT>
    TransformEvaluator<Combine.Globally<InputT, OutputT>> combineGlobally() {
  return new TransformEvaluator<Combine.Globally<InputT, OutputT>>() {

    @Override
    public void evaluate(Combine.Globally<InputT, OutputT> transform, EvaluationContext context) {
      final PCollection<InputT> input = context.getInput(transform);
      final Coder<InputT> iCoder = context.getInput(transform).getCoder();
      final Coder<OutputT> oCoder = context.getOutput(transform).getCoder();
      final WindowingStrategy<?, ?> windowingStrategy = input.getWindowingStrategy();
      @SuppressWarnings("unchecked")
      final CombineWithContext.CombineFnWithContext<InputT, AccumT, OutputT> combineFn =
          (CombineWithContext.CombineFnWithContext<InputT, AccumT, OutputT>)
              CombineFnUtil.toFnWithContext(transform.getFn());
      final WindowedValue.FullWindowedValueCoder<OutputT> wvoCoder =
          WindowedValue.FullWindowedValueCoder.of(
              oCoder, windowingStrategy.getWindowFn().windowCoder());
      final boolean hasDefault = transform.isInsertDefault();

      final SparkCombineFn<InputT, InputT, AccumT, OutputT> sparkCombineFn =
          SparkCombineFn.globally(
              combineFn,
              context.getSerializableOptions(),
              TranslationUtils.getSideInputs(transform.getSideInputs(), context),
              windowingStrategy);
      final Coder<AccumT> aCoder;
      try {
        aCoder = combineFn.getAccumulatorCoder(context.getPipeline().getCoderRegistry(), iCoder);
      } catch (CannotProvideCoderException e) {
        throw new IllegalStateException("Could not determine coder for accumulator", e);
      }

      @SuppressWarnings("unchecked")
      JavaRDD<WindowedValue<InputT>> inRdd =
          ((BoundedDataset<InputT>) context.borrowDataset(transform)).getRDD();

      JavaRDD<WindowedValue<OutputT>> outRdd;

      SparkCombineFn.WindowedAccumulator<InputT, InputT, AccumT, ?> accumulated =
          GroupCombineFunctions.combineGlobally(inRdd, sparkCombineFn, aCoder, windowingStrategy);

      if (!accumulated.isEmpty()) {
        Iterable<WindowedValue<OutputT>> output = sparkCombineFn.extractOutput(accumulated);
        outRdd =
            context
                .getSparkContext()
                .parallelize(CoderHelpers.toByteArrays(output, wvoCoder))
                .map(CoderHelpers.fromByteFunction(wvoCoder));
      } else {
        // handle empty input RDD, which will naturally skip the entire execution
        // as Spark will not run on empty RDDs.
        JavaSparkContext jsc = new JavaSparkContext(inRdd.context());
        if (hasDefault) {
          OutputT defaultValue = combineFn.defaultValue();
          outRdd =
              jsc.parallelize(Lists.newArrayList(CoderHelpers.toByteArray(defaultValue, oCoder)))
                  .map(CoderHelpers.fromByteFunction(oCoder))
                  .map(WindowedValue::valueInGlobalWindow);
        } else {
          outRdd = jsc.emptyRDD();
        }
      }

      context.putDataset(transform, new BoundedDataset<>(outRdd));
    }

    @Override
    public String toNativeString() {
      return "aggregate(..., new <fn>(), ...)";
    }
  };
}
 
Example 16
Source File: ParDoMultiOverrideFactory.java    From beam with Apache License 2.0
@VisibleForTesting
PCollection<KeyedWorkItem<K, KV<K, InputT>>> groupToKeyedWorkItem(
    PCollection<KV<K, InputT>> input) {

  WindowingStrategy<?, ?> inputWindowingStrategy = input.getWindowingStrategy();

  // A KvCoder is required since this goes through GBK. Further, WindowedValueCoder
  // is not registered by default, so we explicitly set the relevant coders.
  checkState(
      input.getCoder() instanceof KvCoder,
      "Input to a %s using state requires a %s, but the coder was %s",
      ParDo.class.getSimpleName(),
      KvCoder.class.getSimpleName(),
      input.getCoder());

  KvCoder<K, InputT> kvCoder = (KvCoder<K, InputT>) input.getCoder();
  Coder<K> keyCoder = kvCoder.getKeyCoder();
  Coder<? extends BoundedWindow> windowCoder =
      inputWindowingStrategy.getWindowFn().windowCoder();

  return input
      // Stash the original timestamps, etc, for when it is fed to the user's DoFn
      .apply("Reify timestamps", ParDo.of(new ReifyWindowedValueFn<>()))
      .setCoder(KvCoder.of(keyCoder, WindowedValue.getFullCoder(kvCoder, windowCoder)))

      // We are going to GBK to gather keys and windows but otherwise do not want
      // to alter the flow of data. This entails:
      //  - trigger as fast as possible
      //  - maintain the full timestamps of elements
      //  - ensure this GBK holds to the minimum of those timestamps (via TimestampCombiner)
      //  - discard past panes as it is "just a stream" of elements
      .apply(
          Window.<KV<K, WindowedValue<KV<K, InputT>>>>configure()
              .triggering(Repeatedly.forever(AfterPane.elementCountAtLeast(1)))
              .discardingFiredPanes()
              .withAllowedLateness(inputWindowingStrategy.getAllowedLateness())
              .withTimestampCombiner(TimestampCombiner.EARLIEST))

      // A full GBK to group by key _and_ window
      .apply("Group by key", GroupByKey.create())

      // Adapt to KeyedWorkItem; that is how this runner delivers timers
      .apply("To KeyedWorkItem", ParDo.of(new ToKeyedWorkItem<>()))
      .setCoder(KeyedWorkItemCoder.of(keyCoder, kvCoder, windowCoder))

      // Because of the intervening GBK, we may have abused the windowing strategy
      // of the input, which should be transferred to the output in a straightforward manner
      // according to what ParDo already does.
      .setWindowingStrategyInternal(inputWindowingStrategy);
}
 
Example 17
Source File: CombinePerKeyTranslatorBatch.java    From beam with Apache License 2.0
@Override
public void translateTransform(
    PTransform<PCollection<KV<K, InputT>>, PCollection<KV<K, OutputT>>> transform,
    TranslationContext context) {

  Combine.PerKey combineTransform = (Combine.PerKey) transform;
  @SuppressWarnings("unchecked")
  final PCollection<KV<K, InputT>> input = (PCollection<KV<K, InputT>>) context.getInput();
  @SuppressWarnings("unchecked")
  final PCollection<KV<K, OutputT>> output = (PCollection<KV<K, OutputT>>) context.getOutput();
  @SuppressWarnings("unchecked")
  final Combine.CombineFn<InputT, AccumT, OutputT> combineFn =
      (Combine.CombineFn<InputT, AccumT, OutputT>) combineTransform.getFn();
  WindowingStrategy<?, ?> windowingStrategy = input.getWindowingStrategy();

  Dataset<WindowedValue<KV<K, InputT>>> inputDataset = context.getDataset(input);

  KvCoder<K, InputT> inputCoder = (KvCoder<K, InputT>) input.getCoder();
  Coder<K> keyCoder = inputCoder.getKeyCoder();
  KvCoder<K, OutputT> outputKVCoder = (KvCoder<K, OutputT>) output.getCoder();
  Coder<OutputT> outputCoder = outputKVCoder.getValueCoder();

  KeyValueGroupedDataset<K, WindowedValue<KV<K, InputT>>> groupedDataset =
      inputDataset.groupByKey(KVHelpers.extractKey(), EncoderHelpers.fromBeamCoder(keyCoder));

  Coder<AccumT> accumulatorCoder = null;
  try {
    accumulatorCoder =
        combineFn.getAccumulatorCoder(
            input.getPipeline().getCoderRegistry(), inputCoder.getValueCoder());
  } catch (CannotProvideCoderException e) {
    throw new RuntimeException(e);
  }

  Dataset<Tuple2<K, Iterable<WindowedValue<OutputT>>>> combinedDataset =
      groupedDataset.agg(
          new AggregatorCombiner<K, InputT, AccumT, OutputT, BoundedWindow>(
                  combineFn, windowingStrategy, accumulatorCoder, outputCoder)
              .toColumn());

  // expand the list into separate elements and put the key back into the elements
  WindowedValue.WindowedValueCoder<KV<K, OutputT>> wvCoder =
      WindowedValue.FullWindowedValueCoder.of(
          outputKVCoder, input.getWindowingStrategy().getWindowFn().windowCoder());
  Dataset<WindowedValue<KV<K, OutputT>>> outputDataset =
      combinedDataset.flatMap(
          (FlatMapFunction<
                  Tuple2<K, Iterable<WindowedValue<OutputT>>>, WindowedValue<KV<K, OutputT>>>)
              tuple2 -> {
                K key = tuple2._1();
                Iterable<WindowedValue<OutputT>> windowedValues = tuple2._2();
                List<WindowedValue<KV<K, OutputT>>> result = new ArrayList<>();
                for (WindowedValue<OutputT> windowedValue : windowedValues) {
                  KV<K, OutputT> kv = KV.of(key, windowedValue.getValue());
                  result.add(
                      WindowedValue.of(
                          kv,
                          windowedValue.getTimestamp(),
                          windowedValue.getWindows(),
                          windowedValue.getPane()));
                }
                return result.iterator();
              },
          EncoderHelpers.fromBeamCoder(wvCoder));
  context.putDataset(output, outputDataset);
}
 
Example 18
Source File: FlinkStreamingTransformTranslators.java    From beam with Apache License 2.0
@Override
public void translateNode(
    PTransform<PCollection<KV<K, InputT>>, PCollection<KV<K, Iterable<InputT>>>> transform,
    FlinkStreamingTranslationContext context) {

  PCollection<KV<K, InputT>> input = context.getInput(transform);

  @SuppressWarnings("unchecked")
  WindowingStrategy<?, BoundedWindow> windowingStrategy =
      (WindowingStrategy<?, BoundedWindow>) input.getWindowingStrategy();

  KvCoder<K, InputT> inputKvCoder = (KvCoder<K, InputT>) input.getCoder();

  SingletonKeyedWorkItemCoder<K, InputT> workItemCoder =
      SingletonKeyedWorkItemCoder.of(
          inputKvCoder.getKeyCoder(),
          inputKvCoder.getValueCoder(),
          input.getWindowingStrategy().getWindowFn().windowCoder());

  DataStream<WindowedValue<KV<K, InputT>>> inputDataStream = context.getInputDataStream(input);

  WindowedValue.FullWindowedValueCoder<SingletonKeyedWorkItem<K, InputT>>
      windowedWorkItemCoder =
          WindowedValue.getFullCoder(
              workItemCoder, input.getWindowingStrategy().getWindowFn().windowCoder());

  CoderTypeInformation<WindowedValue<SingletonKeyedWorkItem<K, InputT>>> workItemTypeInfo =
      new CoderTypeInformation<>(windowedWorkItemCoder);

  DataStream<WindowedValue<SingletonKeyedWorkItem<K, InputT>>> workItemStream =
      inputDataStream
          .flatMap(new ToKeyedWorkItem<>(context.getPipelineOptions()))
          .returns(workItemTypeInfo)
          .name("ToKeyedWorkItem");

  WorkItemKeySelector keySelector = new WorkItemKeySelector<>(inputKvCoder.getKeyCoder());

  KeyedStream<WindowedValue<SingletonKeyedWorkItem<K, InputT>>, ByteBuffer>
      keyedWorkItemStream = workItemStream.keyBy(keySelector);

  SystemReduceFn<K, InputT, Iterable<InputT>, Iterable<InputT>, BoundedWindow> reduceFn =
      SystemReduceFn.buffering(inputKvCoder.getValueCoder());

  Coder<WindowedValue<KV<K, Iterable<InputT>>>> outputCoder =
      context.getWindowedInputCoder(context.getOutput(transform));
  TypeInformation<WindowedValue<KV<K, Iterable<InputT>>>> outputTypeInfo =
      context.getTypeInfo(context.getOutput(transform));

  TupleTag<KV<K, Iterable<InputT>>> mainTag = new TupleTag<>("main output");

  String fullName = getCurrentTransformName(context);
  WindowDoFnOperator<K, InputT, Iterable<InputT>> doFnOperator =
      new WindowDoFnOperator<>(
          reduceFn,
          fullName,
          (Coder) windowedWorkItemCoder,
          mainTag,
          Collections.emptyList(),
          new DoFnOperator.MultiOutputOutputManagerFactory<>(mainTag, outputCoder),
          windowingStrategy,
          new HashMap<>(), /* side-input mapping */
          Collections.emptyList(), /* side inputs */
          context.getPipelineOptions(),
          inputKvCoder.getKeyCoder(),
          keySelector);

  // our operator expects WindowedValue<KeyedWorkItem> while our input stream
  // is WindowedValue<SingletonKeyedWorkItem>, which is fine but Java doesn't like it ...
  @SuppressWarnings("unchecked")
  SingleOutputStreamOperator<WindowedValue<KV<K, Iterable<InputT>>>> outDataStream =
      keyedWorkItemStream
          .transform(fullName, outputTypeInfo, (OneInputStreamOperator) doFnOperator)
          .uid(fullName);

  context.setOutputDataStream(context.getOutput(transform), outDataStream);
}
 
Example 19
Source File: StatefulParDoEvaluatorFactory.java    From beam with Apache License 2.0 4 votes vote down vote up
@Override
public Runnable load(
    final AppliedPTransformOutputKeyAndWindow<K, InputT, OutputT> transformOutputWindow) {
  String stepName = evaluationContext.getStepName(transformOutputWindow.getTransform());

  Map<TupleTag<?>, PCollection<?>> taggedValues = new HashMap<>();
  for (Entry<TupleTag<?>, PValue> pv :
      transformOutputWindow.getTransform().getOutputs().entrySet()) {
    taggedValues.put(pv.getKey(), (PCollection<?>) pv.getValue());
  }
  PCollection<?> pc =
      taggedValues.get(transformOutputWindow.getTransform().getTransform().getMainOutputTag());
  WindowingStrategy<?, ?> windowingStrategy = pc.getWindowingStrategy();
  BoundedWindow window = transformOutputWindow.getWindow();
  final DoFn<?, ?> doFn = transformOutputWindow.getTransform().getTransform().getDoFn();
  final DoFnSignature signature = DoFnSignatures.getSignature(doFn.getClass());

  final DirectStepContext stepContext =
      evaluationContext
          .getExecutionContext(
              transformOutputWindow.getTransform(), transformOutputWindow.getKey())
          .getStepContext(stepName);

  final StateNamespace namespace =
      StateNamespaces.window(
          (Coder<BoundedWindow>) windowingStrategy.getWindowFn().windowCoder(), window);

  Runnable cleanup =
      () -> {
        for (StateDeclaration stateDecl : signature.stateDeclarations().values()) {
          StateTag<?> tag;
          try {
            tag = StateTags.tagForSpec(stateDecl.id(), (StateSpec) stateDecl.field().get(doFn));
          } catch (IllegalAccessException e) {
            throw new RuntimeException(
                String.format(
                    "Error accessing %s for %s",
                    StateSpec.class.getName(), doFn.getClass().getName()),
                e);
          }
          stepContext.stateInternals().state(namespace, tag).clear();
        }
        cleanupRegistry.invalidate(transformOutputWindow);
      };

  evaluationContext.scheduleAfterWindowExpiration(
      transformOutputWindow.getTransform(), window, windowingStrategy, cleanup);
  return cleanup;
}
 
Example 20
Source File: UnboundedWrite.java    From components with Apache License 2.0
/**
 * Applies a window to the input collection if one hasn't already been specified.
 *
 * @return the input collection if it has already been windowed, otherwise the same collection placed in a default
 * window.
 */
public static <T> PCollection<T> ofDefaultWindow(PCollection<T> in) {
    if (in.getWindowingStrategy() != null
            && in.getWindowingStrategy() != WindowingStrategy.globalDefault()) {
        // Already explicitly windowed; WindowingStrategy.globalDefault() returns a
        // singleton, so reference equality identifies the untouched default.
        return in;
    }
    return in.apply("ApplyDefaultWindow", Window.<T>into(FixedWindows.of(DEFAULT_WINDOW_SIZE)));
}
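
A possible call site for the helper above, in a hedged sketch where GenerateSequence stands in for whichever unbounded source the component actually reads:

PCollection<Long> raw =
    pipeline.apply(GenerateSequence.from(0).withRate(1, Duration.standardSeconds(1)));
// No-op if raw already carries a non-default strategy; otherwise applies
// FixedWindows of DEFAULT_WINDOW_SIZE.
PCollection<Long> windowed = UnboundedWrite.ofDefaultWindow(raw);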