com.google.cloud.dataflow.sdk.transforms.DoFn Java Examples

The following examples show how to use com.google.cloud.dataflow.sdk.transforms.DoFn (from the pre-Beam Google Cloud Dataflow SDK). Each example notes the project and source file it was taken from.
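All of the examples target the Dataflow SDK 1.x style, in which a DoFn subclass overrides processElement(ProcessContext). As a minimal sketch of this shared pattern (the class name and element types are illustrative, not taken from any example below):

static class ToUpperCaseFn extends DoFn<String, String> {
  private static final long serialVersionUID = 0;

  @Override
  public void processElement(ProcessContext c) throws Exception {
    // Read the current element and emit the transformed value.
    c.output(c.element().toUpperCase());
  }
}

// Typical usage:
// PCollection<String> upper = input.apply(ParDo.of(new ToUpperCaseFn()));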
Example #1
Source File: TFIDF.java    From flink-dataflow with Apache License 2.0
@Override
public PDone apply(PCollection<KV<String, KV<URI, Double>>> wordToUriAndTfIdf) {
	return wordToUriAndTfIdf
			.apply(ParDo.named("Format").of(new DoFn<KV<String, KV<URI, Double>>, String>() {
				private static final long serialVersionUID = 0;

				@Override
				public void processElement(ProcessContext c) {
					c.output(String.format("%s,\t%s,\t%f",
							c.element().getKey(),
							c.element().getValue().getKey(),
							c.element().getValue().getValue()));
				}
			}))
			.apply(TextIO.Write
					.to(output)
					.withSuffix(".csv"));
}
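In the original TFIDF sample this apply() forms the body of a composite PTransform that formats TF-IDF scores and writes them as CSV. A hedged usage sketch (the WriteTfIdf class name follows the canonical Dataflow TFIDF example and is assumed here):

// wordToUriAndTfIdf is a PCollection<KV<String, KV<URI, Double>>>
wordToUriAndTfIdf.apply(new WriteTfIdf(output));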
 
Example #2
Source File: UnboundedSourceITCase.java    From flink-dataflow with Apache License 2.0
private static void runProgram(String resultPath) {

	Pipeline p = FlinkTestPipeline.createForStreaming();

	PCollection<String> result = p
		.apply(Read.from(new RangeReadSource(1, 10)))
		.apply(Window.<Integer>into(new GlobalWindows())
			.triggering(AfterPane.elementCountAtLeast(10))
			.discardingFiredPanes())
		.apply(ParDo.of(new DoFn<Integer, String>() {
			@Override
			public void processElement(ProcessContext c) throws Exception {
				c.output(c.element().toString());
			}
		}));

	result.apply(TextIO.Write.to(resultPath));

	try {
		p.run();
		fail();
	} catch (Exception e) {
		assertEquals("The source terminates as expected.", e.getCause().getCause().getMessage());
	}
}
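Note the shape of the assertion: p.run() is expected to fail because the test source deliberately terminates the streaming job by throwing, and the expected message is unwrapped through two layers of getCause(), reflecting how Flink wraps the source's exception. This reading is inferred from the test; RangeReadSource itself is not shown.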
 
Example #3
Source File: DockerDo.java    From dockerflow with Apache License 2.0
@Override
public void processElement(
    DoFn<KV<String, Wrapper>, KV<String, WorkflowArgs>>.ProcessContext c) throws Exception {

  LOG.info("Combining args");

  Wrapper value = c.element().getValue();
  WorkflowArgs retval = null;

  // Iterate in order
  for (WorkflowArgs wa : value.map.values()) {

    // Modify a copy
    if (retval == null) {
      retval = new WorkflowArgs(wa);
    // Find differences and merge
    } else {
      retval.gatherArgs(wa);
    }
  }
  c.output(KV.of(c.element().getKey(), retval));
}
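A caveat on the merge loop: "iterate in order" only holds if value.map preserves an ordering (for example a LinkedHashMap or TreeMap). The concrete map type in Wrapper is not shown here, so that ordering guarantee is an assumption.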
 
Example #4
Source File: FlinkGroupAlsoByWindowWrapper.java    From flink-dataflow with Apache License 2.0
/**
 * Creates the appropriate {@link com.google.cloud.dataflow.sdk.util.GroupAlsoByWindowsDoFn},
 * <b>if it has not already been created</b>.
 * If a {@link com.google.cloud.dataflow.sdk.transforms.Combine.KeyedCombineFn} was provided,
 * a function with that combiner is created, so that elements are combined as they arrive.
 * This is done for speed and, in most cases, to reduce the per-window state.
 */
private <W extends BoundedWindow> DoFn<KeyedWorkItem<K, VIN>, KV<K, VOUT>> createGroupAlsoByWindowOperator() {
	if (this.operator == null) {
		if (this.combineFn == null) {
			// Thus VOUT == Iterable<VIN>
			Coder<VIN> inputValueCoder = inputKvCoder.getValueCoder();

			this.operator = (DoFn) GroupAlsoByWindowViaWindowSetDoFn.create(
					(WindowingStrategy<?, W>) this.windowingStrategy, SystemReduceFn.<K, VIN, W>buffering(inputValueCoder));
		} else {
			Coder<K> inputKeyCoder = inputKvCoder.getKeyCoder();

			AppliedCombineFn<K, VIN, VACC, VOUT> appliedCombineFn = AppliedCombineFn
					.withInputCoder(combineFn, coderRegistry, inputKvCoder);

			this.operator = GroupAlsoByWindowViaWindowSetDoFn.create(
					(WindowingStrategy<?, W>) this.windowingStrategy, SystemReduceFn.<K, VIN, VACC, VOUT, W>combining(inputKeyCoder, appliedCombineFn));
		}
	}
	return this.operator;
}
 
Example #5
Source File: FXTimeSeriesPipelineSRGTests.java    From data-timeseries-java with Apache License 2.0
public PCollection<KV<String, TSProto>> setupDataInput(Pipeline pipeline,
    List<KV<String, TSProto>> data) {

  // Read the test data and assign each element an event timestamp taken
  // from the time embedded in its TSProto value.
  PCollection<KV<String, TSProto>> tsData =
      pipeline.apply("ReadData", Create.of(data))
          .apply(ParDo.of(new DoFn<KV<String, TSProto>, KV<String, TSProto>>() {

            @Override
            public void processElement(ProcessContext c) throws Exception {
              c.outputWithTimestamp(c.element(),
                  new DateTime(c.element().getValue().getTime()).toInstant());
            }
          })).setName("Assign TimeStamps");
  return tsData;
}
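outputWithTimestamp() re-stamps each element with an event-time timestamp derived from its payload, which is what lets downstream windowing in these tests group by the embedded times rather than by arrival order.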
 
Example #6
Source File: ReadSourceITCase.java    From flink-dataflow with Apache License 2.0
private static void runProgram(String resultPath) {

	Pipeline p = FlinkTestPipeline.createForBatch();

	PCollection<String> result = p
			.apply(Read.from(new ReadSource(1, 10)))
			.apply(ParDo.of(new DoFn<Integer, String>() {
				@Override
				public void processElement(ProcessContext c) throws Exception {
					c.output(c.element().toString());
				}
			}));

	result.apply(TextIO.Write.to(resultPath));
	p.run();
}
 
Example #7
Source File: SideInputITCase.java    From flink-dataflow with Apache License 2.0
@Override
protected void testProgram() throws Exception {

	Pipeline p = FlinkTestPipeline.createForBatch();

	final PCollectionView<String> sidesInput = p
			.apply(Create.of(expected))
			.apply(View.<String>asSingleton());

	p.apply(Create.of("bli"))
			.apply(ParDo.of(new DoFn<String, String>() {
				@Override
				public void processElement(ProcessContext c) throws Exception {
					String s = c.sideInput(sidesInput);
					c.output(s);
				}
			}).withSideInputs(sidesInput)).apply(TextIO.Write.to(resultPath));

	p.run();
}
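View.asSingleton() requires the view's PCollection to contain exactly one element at runtime. For multi-element side inputs the SDK also offers list and map views; a small sketch (illustrative values, not from SideInputITCase):

final PCollectionView<List<String>> sideList = p
		.apply(Create.of("a", "b", "c"))
		.apply(View.<String>asList());

// Inside a DoFn registered with .withSideInputs(sideList):
// List<String> values = c.sideInput(sideList);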
 
Example #8
Source File: AvroITCase.java    From flink-dataflow with Apache License 2.0
private static void runProgram(String tmpPath, String resultPath) {
	Pipeline p = FlinkTestPipeline.createForBatch();

	p
		.apply(Create.of(
				new User("Joe", 3, "red"),
				new User("Mary", 4, "blue"),
				new User("Mark", 1, "green"),
				new User("Julia", 5, "purple"))
			.withCoder(AvroCoder.of(User.class)))

		.apply(AvroIO.Write.to(tmpPath)
			.withSchema(User.class));

	p.run();

	p = FlinkTestPipeline.createForBatch();

	p
		.apply(AvroIO.Read.from(tmpPath).withSchema(User.class).withoutValidation())

			.apply(ParDo.of(new DoFn<User, String>() {
				@Override
				public void processElement(ProcessContext c) throws Exception {
					User u = c.element();
					String result = u.getName() + " " + u.getFavoriteColor() + " " + u.getFavoriteNumber();
					c.output(result);
				}
			}))

		.apply(TextIO.Write.to(resultPath));

	p.run();
}
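withoutValidation() disables the SDK's file-existence check at pipeline construction time, which matters here because tmpPath is only populated once the first p.run() completes.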
 
Example #9
Source File: AbstractFlinkTimerInternals.java    From flink-dataflow with Apache License 2.0
public void encodeTimerInternals(DoFn.ProcessContext context,
                                 StateCheckpointWriter writer,
                                 KvCoder<K, VIN> kvCoder,
                                 Coder<? extends BoundedWindow> windowCoder) throws IOException {
	if (context == null) {
		throw new RuntimeException("The Context has not been initialized.");
	}

	writer.setTimestamp(currentInputWatermark);
	writer.setTimestamp(currentOutputWatermark);
}
 
Example #10
Source File: FlinkGroupAlsoByWindowWrapper.java    From flink-dataflow with Apache License 2.0
public ProcessContext(DoFn<KeyedWorkItem<K, VIN>, KV<K, VOUT>> function,
                      TimestampedCollector<WindowedValue<KV<K, VOUT>>> outCollector,
                      FlinkTimerInternals timerInternals) {
	function.super();
	super.setupDelegateAggregators();

	this.collector = Preconditions.checkNotNull(outCollector);
	this.timerInternals = Preconditions.checkNotNull(timerInternals);
}
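The function.super() call is Java's qualified superclass constructor syntax: ProcessContext is an inner class of DoFn, so the enclosing DoFn instance must be supplied before the inherited constructor runs. The same idiom appears in the other wrapper constructors below.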
 
Example #11
Source File: EmbedWindowTimeIntoAggregateDoFn.java    From data-timeseries-java with Apache License 2.0
@Override
public void processElement(
    DoFn<KV<String, TSAggValueProto>, KV<String, TSAggValueProto>>.ProcessContext c)
    throws Exception {

  c.output(KV.of(c.element().getKey(), TSAggValueProto.newBuilder(c.element().getValue())
      .setCloseTime(c.window().maxTimestamp().getMillis()).build()));

}
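Calling c.window() is only legal when the DoFn opts into window access, so the enclosing EmbedWindowTimeIntoAggregateDoFn presumably implements DoFn.RequiresWindowAccess; its class declaration is not shown here.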
 
Example #12
Source File: FlattenKVIterableDoFn.java    From data-timeseries-java with Apache License 2.0
@Override
public void processElement(
    DoFn<KV<String, List<TSAggValueProto>>, KV<String, TSAggValueProto>>.ProcessContext c)
    throws Exception {
  for (TSAggValueProto candle : c.element().getValue()) {

    c.output(KV.of(c.element().getKey(), candle));

  }

}
 
Example #13
Source File: MaybeEmptyTestITCase.java    From flink-dataflow with Apache License 2.0
@Override
protected void testProgram() throws Exception {

	Pipeline p = FlinkTestPipeline.createForBatch();

	p.apply(Create.of((Void) null)).setCoder(VoidCoder.of())
			.apply(ParDo.of(
					new DoFn<Void, String>() {
						@Override
						public void processElement(DoFn<Void, String>.ProcessContext c) {
							c.output(expected);
						}
					})).apply(TextIO.Write.to(resultPath));
	p.run();
}
 
Example #14
Source File: FlinkAbstractParDoWrapper.java    From flink-dataflow with Apache License 2.0
public FlinkAbstractParDoWrapper(PipelineOptions options, WindowingStrategy<?, ?> windowingStrategy, DoFn<IN, OUTDF> doFn) {
	Preconditions.checkNotNull(options);
	Preconditions.checkNotNull(windowingStrategy);
	Preconditions.checkNotNull(doFn);

	this.doFn = doFn;
	this.options = options;
	this.windowingStrategy = windowingStrategy;
}
 
Example #15
Source File: DockerDo.java    From dockerflow with Apache License 2.0
@Override
public void processElement(
    DoFn<KV<String, WorkflowArgs>, KV<String, WorkflowArgs>>.ProcessContext c)
    throws Exception {
  WorkflowArgs wa = new WorkflowArgs(c.element().getValue());
  wa.setCurrentOperation(null);
  c.output(KV.of(c.element().getKey(), wa));
}
 
Example #16
Source File: FlinkAbstractParDoWrapper.java    From flink-dataflow with Apache License 2.0
@Override
public BoundedWindow window() {
	if (!(fn instanceof DoFn.RequiresWindowAccess)) {
		throw new UnsupportedOperationException(
				"window() is only available in the context of a DoFn marked as RequiresWindow.");
	}

	Collection<? extends BoundedWindow> windows = this.element.getWindows();
	if (windows.size() != 1) {
		throw new IllegalArgumentException("Each element is expected to belong to 1 window. " +
				"This belongs to " + windows.size() + ".");
	}
	return windows.iterator().next();
}
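For comparison, a DoFn that may call c.window() has to implement the marker interface checked above; a minimal sketch with illustrative types:

static class WindowStampFn extends DoFn<String, String>
		implements DoFn.RequiresWindowAccess {
	@Override
	public void processElement(ProcessContext c) throws Exception {
		// With RequiresWindowAccess, c.window() returns the single window
		// the current element belongs to.
		c.output(c.element() + " @ " + c.window().maxTimestamp());
	}
}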
 
Example #17
Source File: FlinkAbstractParDoWrapper.java    From flink-dataflow with Apache License 2.0
private DoFnProcessContext(DoFn<IN, OUTDF> function, Collector<WindowedValue<OUTFL>> outCollector) {
	function.super();
	super.setupDelegateAggregators();

	this.fn = function;
	this.collector = outCollector;
}
 
Example #18
Source File: FlinkDoFnFunction.java    From flink-dataflow with Apache License 2.0
public ProcessContext(DoFn<IN, OUT> fn, Collector<OUT> outCollector) {
	fn.super();
	super.setupDelegateAggregators();
	this.outCollector = outCollector;
}
 
Example #19
Source File: FlinkParDoBoundWrapper.java    From flink-dataflow with Apache License 2.0
public FlinkParDoBoundWrapper(PipelineOptions options, WindowingStrategy<?, ?> windowingStrategy, DoFn<IN, OUT> doFn) {
	super(options, windowingStrategy, doFn);
}
 
Example #20
Source File: FlinkBatchTransformTranslators.java    From flink-dataflow with Apache License 2.0
@Override
public void translateNode(ParDo.BoundMulti<IN, OUT> transform, FlinkBatchTranslationContext context) {
	DataSet<IN> inputDataSet = context.getInputDataSet(context.getInput(transform));

	final DoFn<IN, OUT> doFn = transform.getFn();

	Map<TupleTag<?>, PCollection<?>> outputs = context.getOutput(transform).getAll();

	Map<TupleTag<?>, Integer> outputMap = Maps.newHashMap();
	// put the main output at index 0, FlinkMultiOutputDoFnFunction also expects this
	outputMap.put(transform.getMainOutputTag(), 0);
	int count = 1;
	for (TupleTag<?> tag: outputs.keySet()) {
		if (!outputMap.containsKey(tag)) {
			outputMap.put(tag, count++);
		}
	}

	// collect all output Coders and create a UnionCoder for our tagged outputs
	List<Coder<?>> outputCoders = Lists.newArrayList();
	for (PCollection<?> coll: outputs.values()) {
		outputCoders.add(coll.getCoder());
	}

	UnionCoder unionCoder = UnionCoder.of(outputCoders);

	@SuppressWarnings("unchecked")
	TypeInformation<RawUnionValue> typeInformation = new CoderTypeInformation<>(unionCoder);

	@SuppressWarnings("unchecked")
	FlinkMultiOutputDoFnFunction<IN, OUT> doFnWrapper = new FlinkMultiOutputDoFnFunction(doFn, context.getPipelineOptions(), outputMap);
	MapPartitionOperator<IN, RawUnionValue> outputDataSet = new MapPartitionOperator<>(inputDataSet, typeInformation, doFnWrapper, transform.getName());

	transformSideInputs(transform.getSideInputs(), outputDataSet, context);

	for (Map.Entry<TupleTag<?>, PCollection<?>> output: outputs.entrySet()) {
		TypeInformation<Object> outputType = context.getTypeInfo(output.getValue());
		int outputTag = outputMap.get(output.getKey());
		FlinkMultiOutputPruningFunction<Object> pruningFunction = new FlinkMultiOutputPruningFunction<>(outputTag);
		FlatMapOperator<RawUnionValue, Object> pruningOperator = new
				FlatMapOperator<>(outputDataSet, outputType,
				pruningFunction, output.getValue().getName());
		context.setOutputDataSet(output.getValue(), pruningOperator);
	}
}
 
Example #21
Source File: FlinkMultiOutputDoFnFunction.java    From flink-dataflow with Apache License 2.0
public FlinkMultiOutputDoFnFunction(DoFn<IN, OUT> doFn, PipelineOptions options, Map<TupleTag<?>, Integer> outputMap) {
	this.doFn = doFn;
	this.options = options;
	this.outputMap = outputMap;
}
 
Example #22
Source File: FlinkMultiOutputDoFnFunction.java    From flink-dataflow with Apache License 2.0
public ProcessContext(DoFn<IN, OUT> fn, Collector<RawUnionValue> outCollector) {
	fn.super();
	this.outCollector = outCollector;
}
 
Example #23
Source File: FlinkDoFnFunction.java    From flink-dataflow with Apache License 2.0
public FlinkDoFnFunction(DoFn<IN, OUT> doFn, PipelineOptions options) {
	this.doFn = doFn;
	this.options = options;
}
 
Example #24
Source File: CoinbaseSource.java    From cloud-bigtable-examples with Apache License 2.0
@Override
public void finishBundle(DoFn<CoinbaseData, Mutation>.Context c) throws Exception {
  super.finishBundle(c);
}
 
Example #25
Source File: ParDoMultiOutputITCase.java    From flink-dataflow with Apache License 2.0 4 votes vote down vote up
@Override
protected void testProgram() throws Exception {
	Pipeline p = FlinkTestPipeline.createForBatch();

	PCollection<String> words = p.apply(Create.of("Hello", "Whatupmyman", "hey", "SPECIALthere", "MAAA", "MAAFOOO"));

	// Select words whose length is below a cutoff,
	// plus the lengths of words that are above the cutoff.
	// Also select words starting with "MAA".
	final int wordLengthCutOff = 3;
	// Create tags to use for the main and side outputs.
	final TupleTag<String> wordsBelowCutOffTag = new TupleTag<String>(){};
	final TupleTag<Integer> wordLengthsAboveCutOffTag = new TupleTag<Integer>(){};
	final TupleTag<String> markedWordsTag = new TupleTag<String>(){};

	PCollectionTuple results =
			words.apply(ParDo
					.withOutputTags(wordsBelowCutOffTag, TupleTagList.of(wordLengthsAboveCutOffTag)
							.and(markedWordsTag))
					.of(new DoFn<String, String>() {
						final TupleTag<String> specialWordsTag = new TupleTag<String>() {};

						@Override
						public void processElement(ProcessContext c) {
							String word = c.element();
							if (word.length() <= wordLengthCutOff) {
								c.output(word);
							} else {
								c.sideOutput(wordLengthsAboveCutOffTag, word.length());
							}
							if (word.startsWith("MAA")) {
								c.sideOutput(markedWordsTag, word);
							}

							if (word.startsWith("SPECIAL")) {
								c.sideOutput(specialWordsTag, word);
							}
						}
					}));

	// Extract the PCollection results, by tag.
	PCollection<String> wordsBelowCutOff = results.get(wordsBelowCutOffTag);
	PCollection<Integer> wordLengthsAboveCutOff = results.get(wordLengthsAboveCutOffTag);
	PCollection<String> markedWords = results.get(markedWordsTag);

	markedWords.apply(TextIO.Write.to(resultPath));

	p.run();
}
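Note that specialWordsTag is declared inside the DoFn but never registered via withOutputTags(), so elements emitted to it with c.sideOutput(specialWordsTag, ...) are discarded; only the three registered tags can be retrieved from the PCollectionTuple.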
 
Example #26
Source File: CoinbaseSource.java    From cloud-bigtable-examples with Apache License 2.0
@Override
public void startBundle(DoFn<CoinbaseData, Mutation>.Context c) throws Exception {
  super.startBundle(c);
}
 
Example #27
Source File: BreakFusion.java    From dockerflow with Apache License 2.0
@Override
public void processElement(DoFn<T, KV<String, T>>.ProcessContext c) throws Exception {
  c.output(KV.of(String.valueOf(c.element().hashCode()), c.element()));
}
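This is the standard break-fusion idiom: re-key every element by an arbitrary value (here its hashCode) so that a subsequent grouping step forces a materialization barrier between otherwise-fused stages. The grouping step itself lives elsewhere in BreakFusion and is not shown.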
 
Example #28
Source File: FlinkAbstractParDoWrapper.java    From flink-dataflow with Apache License 2.0
private void initContext(DoFn<IN, OUTDF> function, Collector<WindowedValue<OUTFL>> outCollector) {
	if (this.context == null) {
		this.context = new DoFnProcessContext(function, outCollector);
	}
}
 
Example #29
Source File: FlinkParDoBoundMultiWrapper.java    From flink-dataflow with Apache License 2.0
public FlinkParDoBoundMultiWrapper(PipelineOptions options, WindowingStrategy<?, ?> windowingStrategy, DoFn<IN, OUT> doFn, TupleTag<?> mainTag, Map<TupleTag<?>, Integer> tagsToLabels) {
	super(options, windowingStrategy, doFn);
	this.mainTag = Preconditions.checkNotNull(mainTag);
	this.outputLabels = Preconditions.checkNotNull(tagsToLabels);
}
 
Example #30
Source File: FXTimeSeriesPipelineSRGTests.java    From data-timeseries-java with Apache License 2.0
@org.junit.Test
public void testCompleteCandleDataOneStream() {

  Pipeline pipeline = setup();

  List<KV<String, TSProto>> pipelineData = GenerateSampleData.getTestData();
  WorkPacketConfig packetConfig =
      GenerateSampleData.generateWorkPacketConfig(2, new String[] {GenerateSampleData.TS1});

  Map<String, TSProto> map = generateMapData(pipelineData);

  // Run test with TS-1 data only

  List<KV<String, TSProto>> ts1Only = new ArrayList<>();

  for (String ts : map.keySet()) {
    if (extractKey(ts).equals(GenerateSampleData.TS1)) {
      ts1Only.add(KV.of(extractKey(ts), map.get(ts)));
    }
  }

  List<KV<String, TSProto>> testData = new ArrayList<KV<String, TSProto>>(ts1Only);

  PCollection<KV<String, TSAggValueProto>> completeAggs =
      createCompleteAggregates(pipeline, ts1Only, packetConfig);

  PCollection<SimpleAggTester> simpleAgg =
      completeAggs.apply(ParDo.of(new DoFn<KV<String, TSAggValueProto>, SimpleAggTester>() {

        @Override
        public void processElement(
            DoFn<KV<String, TSAggValueProto>, SimpleAggTester>.ProcessContext c) throws Exception {

          c.output(SimpleAggTester.newBuilder().setKey(c.element().getKey())
              .setCloseTime(c.element().getValue().getCloseTime())
              .setOpenStateTime(c.element().getValue().getOpenState().getTime())
              .setCloseStateTime(c.element().getValue().getCloseState().getTime())
              .setMinAskPrice(c.element().getValue().getMinAskValue().getAskPrice())
              .setMaxAskPrice(c.element().getValue().getMaxAskValue().getAskPrice())
              .setMinBidPrice(c.element().getValue().getMinBidValue().getBidPrice())
              .setMaxBidPrice(c.element().getValue().getMaxBidValue().getBidPrice()).build());

        }

      }));

  List<SimpleAggTester> expectedList = new ArrayList<>();

  String key = GenerateSampleData.TS1;

  expectedList.add(SimpleAggTester.newBuilder().setKey(key).setCloseTime(1451577719999L)
          .setOpenStateTime(1451577660000L).setCloseStateTime(1451577660000L).setMinAskPrice(1)
          .setMaxAskPrice(2).setMinBidPrice(1).setMaxBidPrice(2).build());

  expectedList.add(SimpleAggTester.newBuilder().setKey(key).setCloseTime(1451577839999L)
          .setOpenStateTime(1451577660000L).setCloseStateTime(1451577780000L).setMinAskPrice(3)
          .setMaxAskPrice(4).setMinBidPrice(3).setMaxBidPrice(4).build());

  expectedList.add(SimpleAggTester.newBuilder().setKey(key).setCloseTime(1451577959999L)
          .setOpenStateTime(1451577780000L).setCloseStateTime(1451577900000L).setMinAskPrice(5)
          .setMaxAskPrice(5).setMinBidPrice(5).setMaxBidPrice(5).build());

  expectedList.add(SimpleAggTester.newBuilder().setKey(key).setCloseTime(1451578079999L)
          .setOpenStateTime(1451577900000L).setCloseStateTime(1451578020000L).setMinAskPrice(3)
          .setMaxAskPrice(4).setMinBidPrice(3).setMaxBidPrice(4).build());

  expectedList.add(SimpleAggTester.newBuilder().setKey(key).setCloseTime(1451578199999L)
          .setOpenStateTime(1451578020000L).setCloseStateTime(1451578140000L).setMinAskPrice(1)
          .setMaxAskPrice(2).setMinBidPrice(1).setMaxBidPrice(2).build());

  DataflowAssert.that(simpleAgg).containsInAnyOrder(expectedList);

  pipeline.run();
}