com.google.cloud.dataflow.sdk.transforms.Combine Java Examples

The following examples show how to use com.google.cloud.dataflow.sdk.transforms.Combine. Each example is taken from an open source project; the source file and project are listed above its code.
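Combine wraps a CombineFn that reduces many values to one, either across the whole PCollection (Combine.globally) or per key (Combine.perKey). As a quick orientation before the project examples, here is a minimal sketch of a custom CombineFn that averages Integer values; the AverageFn and Accum names are illustrative and not taken from any project below.

import java.io.Serializable;

import com.google.cloud.dataflow.sdk.transforms.Combine;

// A minimal sketch: a CombineFn reducing Integers to their mean.
public class AverageFn extends Combine.CombineFn<Integer, AverageFn.Accum, Double> {

  // Mutable accumulator; Serializable so a default coder can be inferred.
  public static class Accum implements Serializable {
    long sum = 0;
    long count = 0;
  }

  @Override
  public Accum createAccumulator() {
    return new Accum();
  }

  @Override
  public Accum addInput(Accum accum, Integer input) {
    accum.sum += input;
    accum.count++;
    return accum;
  }

  @Override
  public Accum mergeAccumulators(Iterable<Accum> accums) {
    Accum merged = createAccumulator();
    for (Accum accum : accums) {
      merged.sum += accum.sum;
      merged.count += accum.count;
    }
    return merged;
  }

  @Override
  public Double extractOutput(Accum accum) {
    return accum.count == 0 ? 0.0 : ((double) accum.sum) / accum.count;
  }
}

Applied as input.apply(Combine.globally(new AverageFn())) on a PCollection<Integer>, or as input.apply(Combine.<String, Integer, Double>perKey(new AverageFn())) on a PCollection<KV<String, Integer>>.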
Example #1
Source File: BreakFusion.java    From dockerflow with Apache License 2.0
@Override
public PCollection<T> apply(PCollection<T> input) {
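  // Key each element, combine per key, and drop the keys again: the
  // group-by-key inside Combine.perKey forces a materialization that stops
  // the runner from fusing the surrounding steps into one stage.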
  return input
      .apply(ParDo.named("BreakFusion").of(new DummyMapFn<T>()))
      .apply(Combine.<String, T>perKey(new First<T>()))
      .apply(Values.<T>create());
}
 
Example #2
Source File: DockerDo.java    From dockerflow with Apache License 2.0
@Override
public PCollection<KV<String, WorkflowArgs>> apply(
    PCollection<KV<String, WorkflowArgs>> input) {
  return input
      .apply(ParDo.named("Prepare").of(new Gather(task)))
      .apply(Combine.perKey(new SortArgs()))
      .apply(ParDo.named("CombineOutputs").of(new CombineArgs()));
}
 
Example #3
Source File: MergeBranches.java    From dockerflow with Apache License 2.0
@Override
public PCollection<KV<String, WorkflowArgs>> apply(
    PCollectionList<KV<String, WorkflowArgs>> input) {
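  // Flatten the branch outputs into one PCollection, then reduce everything
  // to a single merged element with a global combine.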
  return input
      .apply(Flatten.<KV<String, WorkflowArgs>>pCollections())
      .apply(Combine.globally(new Merge()));
}
 
Example #4
Source File: LatestRides.java    From cloud-dataflow-nyc-taxi-tycoon with Apache License 2.0
public static void main(String[] args) {
  CustomPipelineOptions options =
      PipelineOptionsFactory.fromArgs(args).withValidation().as(CustomPipelineOptions.class);
  Pipeline p = Pipeline.create(options);

  p.apply(PubsubIO.Read.named("read from PubSub")
      .topic(String.format("projects/%s/topics/%s", options.getSourceProject(), options.getSourceTopic()))
      .timestampLabel("ts")
      .withCoder(TableRowJsonCoder.of()))

   .apply("key rides by rideid",
      MapElements.via((TableRow ride) -> KV.of(ride.get("ride_id").toString(), ride))
        .withOutputType(new TypeDescriptor<KV<String, TableRow>>() {}))

   .apply("session windows on rides with early firings",
      Window.<KV<String, TableRow>>into(
        Sessions.withGapDuration(Duration.standardMinutes(60)))
          .triggering(
            AfterWatermark.pastEndOfWindow()
              .withEarlyFirings(AfterProcessingTime.pastFirstElementInPane().plusDelayOf(Duration.millis(2000))))
          .accumulatingFiredPanes()
          .withAllowedLateness(Duration.ZERO))

   .apply("group ride points on same ride", Combine.perKey(new LatestPointCombine()))

   .apply("discard key",
      MapElements.via((KV<String, TableRow> a) -> a.getValue())
        .withOutputType(TypeDescriptor.of(TableRow.class)))

   .apply(PubsubIO.Write.named("WriteToPubsub")
      .topic(String.format("projects/%s/topics/%s", options.getSinkProject(), options.getSinkTopic()))
      .withCoder(TableRowJsonCoder.of()));
  p.run();
}
 
Example #5
Source File: FXTimeSeriesPipelineSRGTests.java    From data-timeseries-java with Apache License 2.0
public PCollection<KV<String, TSProto>> generateCompleteWindowData(Pipeline pipeline,
    List<KV<String, TSProto>> data, WorkPacketConfig packetConfig) {

  LOG.info("Check to see that time streams with missing 'ticks' have been corrected");

  PCollection<KV<String, TSProto>> tsData = setupDataInput(pipeline, data);


  PCollection<KV<String, TSProto>> windowedData =
      tsData.apply("CandleResolutionWindow", Window.<KV<String, TSProto>>into(FixedWindows
          .of(Duration.standardSeconds(((FXTimeSeriesPipelineOptions) pipeline.getOptions())
              .getCandleResolution()))));

  // Determine streams that are missing in this Window and generate values for them

  PCollection<KV<String, TSProto>> generatedValues =
      windowedData
          .apply(
              "DetectMissingTimeSeriesValues",
              Combine.globally(new DetectMissingTimeSeriesValuesCombiner(packetConfig))
                  .withoutDefaults()).apply(ParDo.of(new CreateMissingTimeSeriesValuesDoFn()))
          .setName("CreateMissingTimeSeriesValues");

  // Flatten the live streams and the generated streams together

  PCollection<KV<String, TSProto>> completeWindowData =
      PCollectionList.of(windowedData).and(generatedValues)
          .apply("MergeGeneratedLiveValues", Flatten.<KV<String, TSProto>>pCollections());


  return completeWindowData;
}
 
Example #6
Source File: FlinkAbstractParDoWrapper.java    From flink-dataflow with Apache License 2.0
@Override
protected <AggInputT, AggOutputT> Aggregator<AggInputT, AggOutputT> createAggregatorInternal(String name, Combine.CombineFn<AggInputT, ?, AggOutputT> combiner) {
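	// Reuse an accumulator already registered under this name (verifying its
	// type); otherwise create and register a new wrapper around the CombineFn.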
	Accumulator acc = getRuntimeContext().getAccumulator(name);
	if (acc != null) {
		AccumulatorHelper.compareAccumulatorTypes(name,
				SerializableFnAggregatorWrapper.class, acc.getClass());
		return (Aggregator<AggInputT, AggOutputT>) acc;
	}

	SerializableFnAggregatorWrapper<AggInputT, AggOutputT> accumulator =
			new SerializableFnAggregatorWrapper<>(combiner);
	getRuntimeContext().addAccumulator(name, accumulator);
	return accumulator;
}
 
Example #7
Source File: FlinkGroupAlsoByWindowWrapper.java    From flink-dataflow with Apache License 2.0
@Override
protected <AggInputT, AggOutputT> Aggregator<AggInputT, AggOutputT> createAggregatorInternal(String name, Combine.CombineFn<AggInputT, ?, AggOutputT> combiner) {
	Accumulator acc = getRuntimeContext().getAccumulator(name);
	if (acc != null) {
		AccumulatorHelper.compareAccumulatorTypes(name,
				SerializableFnAggregatorWrapper.class, acc.getClass());
		return (Aggregator<AggInputT, AggOutputT>) acc;
	}

	SerializableFnAggregatorWrapper<AggInputT, AggOutputT> accumulator =
			new SerializableFnAggregatorWrapper<>(combiner);
	getRuntimeContext().addAccumulator(name, accumulator);
	return accumulator;
}
 
Example #8
Source File: FlinkGroupAlsoByWindowWrapper.java    From flink-dataflow with Apache License 2.0
private FlinkGroupAlsoByWindowWrapper(PipelineOptions options,
                                      CoderRegistry registry,
                                      WindowingStrategy<KV<K, VIN>, BoundedWindow> windowingStrategy,
                                      KvCoder<K, VIN> inputCoder,
                                      Combine.KeyedCombineFn<K, VIN, VACC, VOUT> combiner) {
	this.options = Preconditions.checkNotNull(options);
	this.coderRegistry = Preconditions.checkNotNull(registry);
	this.inputKvCoder = Preconditions.checkNotNull(inputCoder);
	this.windowingStrategy = Preconditions.checkNotNull(windowingStrategy);
	this.combineFn = combiner;
	this.operator = createGroupAlsoByWindowOperator();
	this.chainingStrategy = ChainingStrategy.ALWAYS;
}
 
Example #9
Source File: FlinkGroupAlsoByWindowWrapper.java    From flink-dataflow with Apache License 2.0
/**
 * Creates a DataStream whose elements are grouped into windows according to the
 * specified windowing strategy. This method assumes that <b>elements are already
 * grouped by key</b>.
 * <p/>
 * The difference from {@link #createForIterable(PipelineOptions, PCollection, KeyedStream)}
 * is that this method assumes a combiner function is provided
 * (see {@link com.google.cloud.dataflow.sdk.transforms.Combine.KeyedCombineFn}).
 * A combiner speeds up the computation and, in most cases, reduces the per-window state.
 *
 * @param options            the general job configuration options.
 * @param input              the input Dataflow {@link com.google.cloud.dataflow.sdk.values.PCollection}.
 * @param groupedStreamByKey the input stream, assumed to already be grouped by key.
 * @param combiner           the combiner to be used.
 * @param outputKvCoder      the coder for the output key-value pairs.
 */
public static <K, VIN, VACC, VOUT> DataStream<WindowedValue<KV<K, VOUT>>> create(
		PipelineOptions options,
		PCollection input,
		KeyedStream<WindowedValue<KV<K, VIN>>, K> groupedStreamByKey,
		Combine.KeyedCombineFn<K, VIN, VACC, VOUT> combiner,
		KvCoder<K, VOUT> outputKvCoder) {
	Preconditions.checkNotNull(options);

	KvCoder<K, VIN> inputKvCoder = (KvCoder<K, VIN>) input.getCoder();
	FlinkGroupAlsoByWindowWrapper windower = new FlinkGroupAlsoByWindowWrapper<>(options,
			input.getPipeline().getCoderRegistry(), input.getWindowingStrategy(), inputKvCoder, combiner);

	Coder<WindowedValue<KV<K, VOUT>>> windowedOutputElemCoder = WindowedValue.FullWindowedValueCoder.of(
			outputKvCoder,
			input.getWindowingStrategy().getWindowFn().windowCoder());

	CoderTypeInformation<WindowedValue<KV<K, VOUT>>> outputTypeInfo =
			new CoderTypeInformation<>(windowedOutputElemCoder);

	DataStream<WindowedValue<KV<K, VOUT>>> groupedByKeyAndWindow = groupedStreamByKey
			.transform("GroupByWindowWithCombiner",
					new CoderTypeInformation<>(outputKvCoder),
					windower)
			.returns(outputTypeInfo);

	return groupedByKeyAndWindow;
}
 
Example #10
Source File: FlinkGroupAlsoByWindowWrapper.java    From flink-dataflow with Apache License 2.0
public static <K, VIN, VACC, VOUT> FlinkGroupAlsoByWindowWrapper
createForTesting(PipelineOptions options,
                 CoderRegistry registry,
                 WindowingStrategy<KV<K, VIN>, BoundedWindow> windowingStrategy,
                 KvCoder<K, VIN> inputCoder,
                 Combine.KeyedCombineFn<K, VIN, VACC, VOUT> combiner) {
	Preconditions.checkNotNull(options);

	return new FlinkGroupAlsoByWindowWrapper(options, registry, windowingStrategy, inputCoder, combiner);
}
 
Example #11
Source File: FlinkDoFnFunction.java    From flink-dataflow with Apache License 2.0
@Override
protected <AggInputT, AggOutputT> Aggregator<AggInputT, AggOutputT> createAggregatorInternal(String name, Combine.CombineFn<AggInputT, ?, AggOutputT> combiner) {
	SerializableFnAggregatorWrapper<AggInputT, AggOutputT> wrapper = new SerializableFnAggregatorWrapper<>(combiner);
	getRuntimeContext().addAccumulator(name, wrapper);
	return wrapper;
}
 
Example #12
Source File: FlinkReduceFunction.java    From flink-dataflow with Apache License 2.0
public FlinkReduceFunction(Combine.KeyedCombineFn<K, ?, VA, VO> keyedCombineFn) {
	this.keyedCombineFn = keyedCombineFn;
}
 
Example #13
Source File: FlinkPartialReduceFunction.java    From flink-dataflow with Apache License 2.0
public FlinkPartialReduceFunction(Combine.KeyedCombineFn<K, VI, VA, ?> keyedCombineFn) {
	this.keyedCombineFn = keyedCombineFn;
}
 
Example #14
Source File: FlinkMultiOutputDoFnFunction.java    From flink-dataflow with Apache License 2.0
@Override
protected <AggInputT, AggOutputT> Aggregator<AggInputT, AggOutputT> createAggregatorInternal(String name, Combine.CombineFn<AggInputT, ?, AggOutputT> combiner) {
	SerializableFnAggregatorWrapper<AggInputT, AggOutputT> wrapper = new SerializableFnAggregatorWrapper<>(combiner);
	getRuntimeContext().addAccumulator(name, wrapper);
	return wrapper;
}
 
Example #15
Source File: FlinkBatchTransformTranslators.java    From flink-dataflow with Apache License 2.0
@Override
public void translateNode(Combine.PerKey<K, VI, VO> transform, FlinkBatchTranslationContext context) {
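	// Translate Combine.PerKey into two Flink operations: a partial combine
	// to the accumulator type VA, then a final group-reduce down to VO.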
	DataSet<KV<K, VI>> inputDataSet = context.getInputDataSet(context.getInput(transform));

	@SuppressWarnings("unchecked")
	Combine.KeyedCombineFn<K, VI, VA, VO> keyedCombineFn = (Combine.KeyedCombineFn<K, VI, VA, VO>) transform.getFn();

	KvCoder<K, VI> inputCoder = (KvCoder<K, VI>) context.getInput(transform).getCoder();

	Coder<VA> accumulatorCoder;
	try {
		accumulatorCoder = keyedCombineFn.getAccumulatorCoder(
				context.getInput(transform).getPipeline().getCoderRegistry(),
				inputCoder.getKeyCoder(), inputCoder.getValueCoder());
	} catch (CannotProvideCoderException e) {
		// Fail fast instead of continuing with a null coder.
		throw new RuntimeException("Could not determine an accumulator coder for " + transform.getName(), e);
	}

	TypeInformation<KV<K, VI>> kvCoderTypeInformation = new KvCoderTypeInformation<>(inputCoder);
	TypeInformation<KV<K, VA>> partialReduceTypeInfo = new KvCoderTypeInformation<>(KvCoder.of(inputCoder.getKeyCoder(), accumulatorCoder));

	Grouping<KV<K, VI>> inputGrouping = new UnsortedGrouping<>(inputDataSet, new Keys.ExpressionKeys<>(new String[]{"key"}, kvCoderTypeInformation));

	FlinkPartialReduceFunction<K, VI, VA> partialReduceFunction = new FlinkPartialReduceFunction<>(keyedCombineFn);

	// Partially GroupReduce the values into the intermediate format VA (combine)
	GroupCombineOperator<KV<K, VI>, KV<K, VA>> groupCombine =
			new GroupCombineOperator<>(inputGrouping, partialReduceTypeInfo, partialReduceFunction,
					"GroupCombine: " + transform.getName());

	// Reduce fully to VO
	GroupReduceFunction<KV<K, VA>, KV<K, VO>> reduceFunction = new FlinkReduceFunction<>(keyedCombineFn);

	TypeInformation<KV<K, VO>> reduceTypeInfo = context.getTypeInfo(context.getOutput(transform));

	Grouping<KV<K, VA>> intermediateGrouping = new UnsortedGrouping<>(groupCombine, new Keys.ExpressionKeys<>(new String[]{"key"}, groupCombine.getType()));

	// Fully reduce the values and create output format VO
	GroupReduceOperator<KV<K, VA>, KV<K, VO>> outputDataSet =
			new GroupReduceOperator<>(intermediateGrouping, reduceTypeInfo, reduceFunction, transform.getName());

	context.setOutputDataSet(context.getOutput(transform), outputDataSet);
}
 
Example #16
Source File: FlinkStateInternals.java    From flink-dataflow with Apache License 2.0
private FlinkInMemoryKeyedCombiningValue(ByteString stateKey,
                                         Combine.KeyedCombineFn<? super K, InputT, AccumT, OutputT> combineFn,
                                         Coder<AccumT> accumCoder,
                                         final StateContext<?> stateContext) {
	this(stateKey, withContext(combineFn), accumCoder, stateContext);
}
 
Example #17
Source File: FlinkStateInternals.java    From flink-dataflow with Apache License 2.0
private FlinkInMemoryKeyedCombiningValue(ByteString stateKey,
                                         Combine.CombineFn<InputT, AccumT, OutputT> combineFn,
                                         Coder<AccumT> accumCoder,
                                         final StateContext<?> stateContext) {
	this(stateKey, withKeyAndContext(combineFn), accumCoder, stateContext);
}
 
Example #18
Source File: SerializableFnAggregatorWrapper.java    From flink-dataflow with Apache License 2.0
@Override
public Combine.CombineFn<AI, ?, AO> getCombineFn() {
	return combiner;
}
 
Example #19
Source File: SerializableFnAggregatorWrapper.java    From flink-dataflow with Apache License 2.0
public SerializableFnAggregatorWrapper(Combine.CombineFn<AI, ?, AO> combiner) {
	this.combiner = combiner;
	resetLocal();
}
 
Example #20
Source File: CombineFnAggregatorWrapper.java    From flink-dataflow with Apache License 2.0
@Override
public Combine.CombineFn getCombineFn() {
	return combiner;
}
 
Example #21
Source File: CombineFnAggregatorWrapper.java    From flink-dataflow with Apache License 2.0
public CombineFnAggregatorWrapper(Combine.CombineFn<? super AI, AA, AR> combiner) {
	this.combiner = combiner;
	this.aa = combiner.createAccumulator();
}
 
Example #22
Source File: FXTimeSeriesPipelineSRGTests.java    From data-timeseries-java with Apache License 2.0
public PCollection<KV<String, TSAggValueProto>> createCompleteAggregates(Pipeline pipeline,
    List<KV<String, TSProto>> data, WorkPacketConfig packetConfig) {

  PCollection<KV<String, TSProto>> completeWindowData =
      generateCompleteWindowData(pipeline, data, packetConfig);

  PCollection<KV<String, TSAggValueProto>> partial =
      completeWindowData.apply("CreatePartialAggregates",
          Combine.perKey(new PartialTimeSeriesAggCombiner()));

  PCollection<KV<String, TSAggValueProto>> partialWithWindowBoundary =
      partial.apply(ParDo.of(new EmbedWindowTimeIntoAggregateDoFn()));

  PCollection<KV<String, TSAggValueProto>> completeAggregationStage1 =
      partialWithWindowBoundary.apply(
          "completeAggregationStage1",
          Window.<KV<String, TSAggValueProto>>into(new GlobalWindows())
              .triggering(Repeatedly.forever(AfterPane.elementCountAtLeast(1)))
              .withOutputTimeFn(OutputTimeFns.outputAtEarliestInputTimestamp())
              .accumulatingFiredPanes());

  PCollection<KV<String, TSAggValueProto>> completeAggregationStage2 =
      completeAggregationStage1.apply("CreateCompleteCandles",
          Combine.perKey(new CompleteTimeSeriesAggCombiner())).apply("FlattenIterables",
          ParDo.of(new FlattenKVIterableDoFn()));

  PCollection<KV<String, TSAggValueProto>> completeAggregationStage3 =
      completeAggregationStage2.apply("ResetTimestampsAfterGlobalWindow",
          ParDo.of(new DoFn<KV<String, TSAggValueProto>, KV<String, TSAggValueProto>>() {

            @Override
            public void processElement(
                DoFn<KV<String, TSAggValueProto>, KV<String, TSAggValueProto>>.ProcessContext c)
                throws Exception {
              if (c.timestamp().isBefore(new Instant(32530703764000L))) {

                if (c.timestamp().isAfter(
                    new Instant(c.element().getValue().getCloseState().getTime()))) {

                  LOG.error("BUG There was a timestamp before current :: "
                      + TextFormat.shortDebugString(c.element().getValue()));

                } else {
                  c.outputWithTimestamp(c.element(), new Instant(c.element().getValue()
                      .getCloseTime()));

                }
              }

            }

          }));

  return completeAggregationStage3;

}
 
Example #23
Source File: CreateAggregatesTransform.java    From data-timeseries-java with Apache License 2.0
@Override
public PCollection<KV<String, TSAggValueProto>> apply(PCollection<KV<String, TSProto>> input) {

  PCollection<KV<String, TSProto>> windowedData =
      input.apply("CandleResolutionWindow", Window.<KV<String, TSProto>>into(
          FixedWindows.of(Duration.standardSeconds(options.getCandleResolution()))));

  // Determine streams that are missing in this Window and generate values for them

  PCollection<KV<String, TSProto>> generatedValues = windowedData
      .apply("DetectMissingTimeSeriesValues",
          Combine.globally(new DetectMissingTimeSeriesValuesCombiner(packetConfig))
              .withoutDefaults())
      .apply(ParDo.of(new CreateMissingTimeSeriesValuesDoFn()))
      .setName("CreateMissingTimeSeriesValues");

  // Flatten the live streams and the generated streams together

  PCollection<KV<String, TSProto>> completeWindowData =
      PCollectionList.of(windowedData).and(generatedValues).apply("MergeGeneratedLiveValues",
          Flatten.<KV<String, TSProto>>pCollections());

  // Create partial aggregates, at this stage we will not bring forward the previous windows close
  // value
  PCollection<KV<String, TSAggValueProto>> partial = completeWindowData
      .apply("CreatePartialAggregates", Combine.perKey(new PartialTimeSeriesAggCombiner()));

  // When these aggregates go through the Global Window they will lose their time value
  // We will embed the window close into the data so we can access it later on

  PCollection<KV<String, TSAggValueProto>> partialWithWindowBoundary =
      partial.apply(ParDo.of(new EmbedWindowTimeIntoAggregateDoFn()));

  // Create a Global window which can retain the last value held in memory We must use
  // outputAtEarliestInputTimestamp as later on we re-attach the timestamp from within the data
  // point, for us not to hit 'skew' issues we need to ensure the output timestamp value is always
  // the smallest value
  PCollection<KV<String, TSAggValueProto>> completeAggregationStage1 =
      paritalWithWindowBoundary.apply("completeAggregationStage1",
          Window.<KV<String, TSAggValueProto>>into(new GlobalWindows())
              .triggering(Repeatedly.forever(AfterPane.elementCountAtLeast(1)))
              .withOutputTimeFn(OutputTimeFns.outputAtEarliestInputTimestamp())
              .accumulatingFiredPanes());

  PCollection<KV<String, TSAggValueProto>> completeAggregationStage2 = completeAggregationStage1
      .apply("CreateCompleteCandles", Combine.perKey(new CompleteTimeSeriesAggCombiner()))
      .apply("FlattenIterables", ParDo.of(new FlattenKVIterableDoFn()));



  // Reset timestamps after global window
  PCollection<KV<String, TSAggValueProto>> completeAggregationStage3 =
      completeAggregationStage2.apply("ResetTimestampsAfterGlobalWindow",
          ParDo.of(new DoFn<KV<String, TSAggValueProto>, KV<String, TSAggValueProto>>() {

            @Override
            public void processElement(
                DoFn<KV<String, TSAggValueProto>, KV<String, TSAggValueProto>>.ProcessContext c)
                throws Exception {
              //
              // TODO When the local Dataflow runner shuts down, some values are produced
              // for the end of the GlobalWindow. For now we remove them by filtering out
              // anything from year 3000+; a better solution would be to check the window pane.
              //
              Instant time = c.timestamp();

              if (time.isBefore(new Instant(32530703764000L))) {

                // The timestamp produced from the Combiner after the GlobalWindow loses fidelity,
                // we can add this back by looking at the value in the data

                if (time
                    .isAfter(new Instant(c.element().getValue().getCloseState().getTime()))) {

                  LOG.error(
                      "There was a timestamp before earlier than the window and skew must be 0 :: "
                          + TextFormat.shortDebugString(c.element().getValue()));

                } else {
                  c.outputWithTimestamp(c.element(),
                      new Instant(c.element().getValue().getCloseTime()));

                }
              }

            }

          }));

  return completeAggregationStage3;
}