com.google.cloud.dataflow.sdk.transforms.ParDo Java Examples

The following examples show how to use com.google.cloud.dataflow.sdk.transforms.ParDo. Each example notes the project and source file it was taken from, along with the project's license.
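Before the project samples, here is a minimal, self-contained sketch of the pattern every example below shares: wrap a DoFn in ParDo.of and apply it to a PCollection. The class name and element values here are illustrative only, not taken from any of the projects below.

import com.google.cloud.dataflow.sdk.Pipeline;
import com.google.cloud.dataflow.sdk.options.PipelineOptionsFactory;
import com.google.cloud.dataflow.sdk.transforms.Create;
import com.google.cloud.dataflow.sdk.transforms.DoFn;
import com.google.cloud.dataflow.sdk.transforms.ParDo;

public class ParDoSketch {
  public static void main(String[] args) {
    Pipeline p = Pipeline.create(PipelineOptionsFactory.fromArgs(args).create());

    p.apply(Create.of("hello", "world"))
        // ParDo.of(...) invokes the DoFn once per input element; each call to
        // c.output(...) emits one element into the output PCollection.
        .apply("Uppercase", ParDo.of(new DoFn<String, String>() {
          @Override
          public void processElement(ProcessContext c) {
            c.output(c.element().toUpperCase());
          }
        }));

    p.run();
  }
}

The examples that follow vary this wiring: the DoFn may be a named class, carry side inputs (withSideInputs), emit to multiple tagged outputs (withOutputTags), or be given a step name via ParDo.named, but the ParDo mechanics stay the same.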
Example #1
Source File: FilterRides.java    From cloud-dataflow-nyc-taxi-tycoon with Apache License 2.0
public static void main(String[] args) {
  CustomPipelineOptions options =
      PipelineOptionsFactory.fromArgs(args).withValidation().as(CustomPipelineOptions.class);
  Pipeline p = Pipeline.create(options);

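  // Read taxi rides from Pub/Sub (the "ts" attribute supplies each event's timestamp),
  // filter to rides in lower Manhattan, and publish the result to the sink topic.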
  p.apply(PubsubIO.Read.named("read from PubSub")
      .topic(String.format("projects/%s/topics/%s", options.getSourceProject(), options.getSourceTopic()))
      .timestampLabel("ts")
      .withCoder(TableRowJsonCoder.of()))

   .apply("filter lower Manhattan", ParDo.of(new FilterLowerManhattan()))

   .apply(PubsubIO.Write.named("WriteToPubsub")
      .topic(String.format("projects/%s/topics/%s", options.getSinkProject(), options.getSinkTopic()))
      .withCoder(TableRowJsonCoder.of()));
  p.run();
}
 
Example #2
Source File: CoinbaseSource.java    From cloud-bigtable-examples with Apache License 2.0
public static void main(String[] args) {
  CloudBigtableOptions options =
      PipelineOptionsFactory.fromArgs(args).withValidation().as(CloudBigtableOptions.class);

  CloudBigtableScanConfiguration config =
      new CloudBigtableScanConfiguration.Builder()
          .withProjectId(options.getBigtableProjectId())
          .withInstanceId(options.getBigtableInstanceId())
          .withTableId(options.getBigtableTableId())
          .build();

  options.setStreaming(true);
  options.setRunner(DataflowPipelineRunner.class);

  Pipeline p = Pipeline.create(options);
  CloudBigtableIO.initializeForWrite(p);

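  // Read the unbounded Coinbase stream, deserialize each message, convert it to
  // HBase mutations, and write them to the configured Bigtable table.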
  p.apply(Read.from(new CoinbaseSource()))
      .apply(ParDo.named("DeserializeCoinbase").of(new DeserializeCoinbase()))
      .apply(ParDo.of(new HBaseBigtableWriter()))
      .apply(CloudBigtableIO.writeToTable(config));

  p.run();
}
 
Example #3
Source File: LoadBooks.java    From cloud-bigtable-examples with Apache License 2.0
public static void main(String[] args) {
  // CloudBigtableOptions is one way to retrieve the options.  It's not required.
  // https://github.com/GoogleCloudPlatform/cloud-bigtable-examples/blob/master/java/dataflow-connector-examples/src/main/java/com/google/cloud/bigtable/dataflow/example/HelloWorldWrite.java
  BigtableCsvOptions options =
      PipelineOptionsFactory.fromArgs(args).withValidation().as(BigtableCsvOptions.class);
  CloudBigtableTableConfiguration config =
      CloudBigtableTableConfiguration.fromCBTOptions(options);

  Pipeline p = Pipeline.create(options);

  CloudBigtableIO.initializeForWrite(p);

  PCollection<KV<String, Integer>> ngrams =
      applyPipelineToParseBooks(p.apply(TextIO.Read.from(options.getInputFile())));
  PCollection<Mutation> mutations = ngrams.apply(ParDo.of(ENCODE_NGRAM));
  mutations.apply(CloudBigtableIO.writeToTable(config));

  // Run the pipeline.
  p.run();
}
 
Example #4
Source File: UnboundedSourceITCase.java    From flink-dataflow with Apache License 2.0
private static void runProgram(String resultPath) {

		Pipeline p = FlinkTestPipeline.createForStreaming();

		PCollection<String> result = p
			.apply(Read.from(new RangeReadSource(1, 10)))
			.apply(Window.<Integer>into(new GlobalWindows())
				.triggering(AfterPane.elementCountAtLeast(10))
				.discardingFiredPanes())
			.apply(ParDo.of(new DoFn<Integer, String>() {
				@Override
				public void processElement(ProcessContext c) throws Exception {
					c.output(c.element().toString());
				}
			}));

		result.apply(TextIO.Write.to(resultPath));

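		// The test source is expected to fail once it terminates; assert that the
		// pipeline surfaces the expected cause message.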
		try {
			p.run();
			fail();
		} catch(Exception e) {
			assertEquals("The source terminates as expected.", e.getCause().getCause().getMessage());
		}
	}
 
Example #5
Source File: WordCountJoin2ITCase.java    From flink-dataflow with Apache License 2.0
@Override
protected void testProgram() throws Exception {
	Pipeline p = FlinkTestPipeline.createForBatch();

	/* Create two PCollections and join them */
	PCollection<KV<String,Long>> occurences1 = p.apply(Create.of(WORDS_1))
			.apply(ParDo.of(new ExtractWordsFn()))
			.apply(Count.<String>perElement());

	PCollection<KV<String,Long>> occurences2 = p.apply(Create.of(WORDS_2))
			.apply(ParDo.of(new ExtractWordsFn()))
			.apply(Count.<String>perElement());

	/* CoGroup the two collections */
	PCollection<KV<String, CoGbkResult>> mergedOccurences = KeyedPCollectionTuple
			.of(tag1, occurences1)
			.and(tag2, occurences2)
			.apply(CoGroupByKey.<String>create());

	/* Format output */
	mergedOccurences.apply(ParDo.of(new FormatCountsFn()))
			.apply(TextIO.Write.named("test").to(resultPath));

	p.run();
}
 
Example #6
Source File: SideInputITCase.java    From flink-dataflow with Apache License 2.0
@Override
protected void testProgram() throws Exception {

	Pipeline p = FlinkTestPipeline.createForBatch();

	final PCollectionView<String> sidesInput = p
			.apply(Create.of(expected))
			.apply(View.<String>asSingleton());

	p.apply(Create.of("bli"))
			.apply(ParDo.of(new DoFn<String, String>() {
				@Override
				public void processElement(ProcessContext c) throws Exception {
					String s = c.sideInput(sidesInput);
					c.output(s);
				}
			}).withSideInputs(sidesInput)).apply(TextIO.Write.to(resultPath));

	p.run();
}
 
Example #7
Source File: ReadSourceITCase.java    From flink-dataflow with Apache License 2.0
private static void runProgram(String resultPath) {

		Pipeline p = FlinkTestPipeline.createForBatch();

		PCollection<String> result = p
				.apply(Read.from(new ReadSource(1, 10)))
				.apply(ParDo.of(new DoFn<Integer, String>() {
					@Override
					public void processElement(ProcessContext c) throws Exception {
						c.output(c.element().toString());
					}
				}));

		result.apply(TextIO.Write.to(resultPath));
		p.run();
	}
 
Example #8
Source File: TFIDF.java    From flink-dataflow with Apache License 2.0
@Override
public PDone apply(PCollection<KV<String, KV<URI, Double>>> wordToUriAndTfIdf) {
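	// Format each (word, (URI, TF-IDF)) entry as a comma-separated line and write a .csv file.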
	return wordToUriAndTfIdf
			.apply(ParDo.named("Format").of(new DoFn<KV<String, KV<URI, Double>>, String>() {
				private static final long serialVersionUID = 0;

				@Override
				public void processElement(ProcessContext c) {
					c.output(String.format("%s,\t%s,\t%f",
							c.element().getKey(),
							c.element().getValue().getKey(),
							c.element().getValue().getValue()));
				}
			}))
			.apply(TextIO.Write
					.to(output)
					.withSuffix(".csv"));
}
 
Example #9
Source File: FXTimeSeriesPipelineSRGTests.java    From data-timeseries-java with Apache License 2.0
public PCollection<KV<String, TSProto>> setupDataInput(Pipeline pipeline,
    List<KV<String, TSProto>> data) {

  // Assign each element an event timestamp taken from its TSProto payload.
  PCollection<KV<String, TSProto>> tsData =
      pipeline.apply("ReadData", Create.of(data))
          .apply(ParDo.of(new DoFn<KV<String, TSProto>, KV<String, TSProto>>() {

            @Override
            public void processElement(ProcessContext c) throws Exception {
              c.outputWithTimestamp(c.element(),
                  new DateTime(c.element().getValue().getTime()).toInstant());

            }

          })).setName("Assign TimeStamps");
  return tsData;

}
 
Example #10
Source File: ExportedServiceAccountKeyRemover.java    From policyscanner with Apache License 2.0
private PCollection<String> constructPipeline(Pipeline pipeline, String org) {
  // Read projects from the CRM API.
  PCollection<GCPProject> projects =
      pipeline.apply(Read.from(new LiveProjectSource(org)));
  // List the service accounts of the projects.
  PCollection<GCPServiceAccount> serviceAccounts =
      projects.apply(ParDo.named("List Service Accounts").of(new ListServiceAccounts()));
  // List the keys of the service accounts.
  PCollection<GCPServiceAccountKey> serviceAccountKeys =
      serviceAccounts.apply(ParDo.named("List Service Account Keys")
          .of(new ListServiceAccountKeys()));
  // Construct an alert message for all the discrepancies found.
  return serviceAccountKeys.apply(ParDo
      .named("Remove user-managed keys")
      .of(new ExportedServiceAccountKeyMessenger()));
}
 
Example #11
Source File: ExactDollarRides.java    From cloud-dataflow-nyc-taxi-tycoon with Apache License 2.0
public static void main(String[] args) {
  CustomPipelineOptions options =
      PipelineOptionsFactory.fromArgs(args).withValidation().as(CustomPipelineOptions.class);
  Pipeline p = Pipeline.create(options);

  p.apply(PubsubIO.Read.named("read from PubSub")
      .topic(String.format("projects/%s/topics/%s", options.getSourceProject(), options.getSourceTopic()))
      .timestampLabel("ts")
      .withCoder(TableRowJsonCoder.of()))

   .apply("extract dollars",
      MapElements.via((TableRow x) -> Double.parseDouble(x.get("meter_increment").toString()))
        .withOutputType(TypeDescriptor.of(Double.class)))

   .apply("fixed window", Window.into(FixedWindows.of(Duration.standardMinutes(1))))
   .apply("trigger",
      Window.<Double>triggering(
        AfterWatermark.pastEndOfWindow()
          .withEarlyFirings(AfterProcessingTime.pastFirstElementInPane().plusDelayOf(Duration.standardSeconds(1)))
          .withLateFirings(AfterPane.elementCountAtLeast(1)))
        .accumulatingFiredPanes()
        .withAllowedLateness(Duration.standardMinutes(5)))

   .apply("sum whole window", Sum.doublesGlobally().withoutDefaults())
   .apply("format rides", ParDo.of(new TransformRides()))

   .apply(PubsubIO.Write.named("WriteToPubsub")
      .topic(String.format("projects/%s/topics/%s", options.getSinkProject(), options.getSinkTopic()))
      .withCoder(TableRowJsonCoder.of()));
  p.run();
}
 
Example #12
Source File: DesiredStateEnforcer.java    From policyscanner with Apache License 2.0
private PCollection<String> constructPipeline(Pipeline pipeline, String org,
    BoundedSource<KV<List<String>, String>> knownGoodSource) {
  // Read files from GCS.
  PCollection<KV<List<String>, String>> knownGoodFiles =
      pipeline.apply("Read known-good data", Read.from(knownGoodSource));
  // Convert files to GCPResourceState objects.
  PCollection<KV<GCPResource, GCPResourceState>> knownGoodStates =
      knownGoodFiles.apply(ParDo.named("Convert file data to Java Objects")
          .of(new FileToState()));
  // Tag the state objects to indicate they're from a checked-in repo and not live.
  PCollection<KV<GCPResource, KV<StateSource, GCPResourceState>>> taggedKnownGoodStates =
      knownGoodStates.apply(ParDo.named("Mark states as being known-good")
          .of(new TagStateWithSource(StateSource.DESIRED)));

  // Read projects from the CRM API.
  PCollection<GCPProject> allProjects =
      pipeline.apply("Read live projects", Read.from(new LiveProjectSource(org)));
  // Extract project states.
  PCollection<KV<GCPResource, GCPResourceState>> liveStates =
      allProjects
          .apply(ParDo.named("Extract project policies").of(new ExtractState()));
  // Tag the states to indicate they're live and not from a checked-in source.
  PCollection<KV<GCPResource, KV<StateSource, GCPResourceState>>> taggedLiveStates =
      liveStates.apply(ParDo.named("Mark states as being live")
          .of(new TagStateWithSource(StateSource.LIVE)));

  // Join the two known-good and the live halves.
  PCollectionView<Map<GCPResource, KV<StateSource, GCPResourceState>>> knownGoodStatesView =
      taggedKnownGoodStates.apply(View.<GCPResource, KV<StateSource, GCPResourceState>>asMap());
  PCollection<KV<GCPResource, Map<StateSource, GCPResourceState>>> mismatchedStates =
      taggedLiveStates.apply(ParDo.named("Find states that don't match")
          .withSideInputs(knownGoodStatesView)
          .of(new FilterOutMatchingState(knownGoodStatesView)));

  // Construct an alert message for all the discrepancies found and fix the discrepancies.
  return mismatchedStates
      .apply(ParDo.named("Fix discrepancies").of(discrepancyAutoFixMessenger));
}
 
Example #13
Source File: DockerDo.java    From dockerflow with Apache License 2.0
@Override
public PCollection<KV<String, WorkflowArgs>> apply(
    PCollection<KV<String, WorkflowArgs>> input) {
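  // Gather the task's results, sort the args per key, and combine them into a single output per key.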
  return input
      .apply(ParDo.named("Prepare").of(new Gather(task)))
      .apply(Combine.perKey(new SortArgs()))
      .apply(ParDo.named("CombineOutputs").of(new CombineArgs()));
}
 
Example #14
Source File: FXTimeSeriesPipelineSRGTests.java    From data-timeseries-java with Apache License 2.0
public PCollection<KV<String, TSProto>> generateCompleteWindowData(Pipeline pipeline,
    List<KV<String, TSProto>> data, WorkPacketConfig packetConfig) {

  LOG.info("Check to see that time streams with missing 'ticks' have been corrected");

  PCollection<KV<String, TSProto>> tsData = setupDataInput(pipeline, data);

  PCollection<KV<String, TSProto>> windowedData =
      tsData.apply("CandleResolutionWindow", Window.<KV<String, TSProto>>into(FixedWindows
          .of(Duration.standardSeconds(((FXTimeSeriesPipelineOptions) pipeline.getOptions())
              .getCandleResolution()))));

  // Determine streams that are missing in this Window and generate values for them

  PCollection<KV<String, TSProto>> generatedValues =
      windowedData
          .apply(
              "DetectMissingTimeSeriesValues",
              Combine.globally(new DetectMissingTimeSeriesValuesCombiner(packetConfig))
                  .withoutDefaults()).apply(ParDo.of(new CreateMissingTimeSeriesValuesDoFn()))
          .setName("CreateMissingTimeSeriesValues");

  // Flatten the live streams and the generated streams together

  PCollection<KV<String, TSProto>> completeWindowData =
      PCollectionList.of(windowedData).and(generatedValues)
          .apply("MergeGeneratedLiveValues", Flatten.<KV<String, TSProto>>pCollections());


  return completeWindowData;
}
 
Example #15
Source File: WordCountJoin3ITCase.java    From flink-dataflow with Apache License 2.0
@Override
protected void testProgram() throws Exception {

	Pipeline p = FlinkTestPipeline.createForBatch();

	/* Create three PCollections and join them */
	PCollection<KV<String,Long>> occurences1 = p.apply(Create.of(WORDS_1))
			.apply(ParDo.of(new ExtractWordsFn()))
			.apply(Count.<String>perElement());

	PCollection<KV<String,Long>> occurences2 = p.apply(Create.of(WORDS_2))
			.apply(ParDo.of(new ExtractWordsFn()))
			.apply(Count.<String>perElement());

	PCollection<KV<String,Long>> occurences3 = p.apply(Create.of(WORDS_3))
			.apply(ParDo.of(new ExtractWordsFn()))
			.apply(Count.<String>perElement());

	/* CoGroup the three collections */
	PCollection<KV<String, CoGbkResult>> mergedOccurences = KeyedPCollectionTuple
			.of(tag1, occurences1)
			.and(tag2, occurences2)
			.and(tag3, occurences3)
			.apply(CoGroupByKey.<String>create());

	/* Format output */
	mergedOccurences.apply(ParDo.of(new FormatCountsFn()))
			.apply(TextIO.Write.named("test").to(resultPath));

	p.run();
}
 
Example #16
Source File: DockerDo.java    From dockerflow with Apache License 2.0
@Override
public PCollection<KV<String, WorkflowArgs>> apply(
    PCollection<KV<String, WorkflowArgs>> input) {
  PCollection<KV<String, WorkflowArgs>> pc = input;

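  // On the first attempt, clear any operation status left over from an earlier run.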
  if (attempt == 0) {
    pc = pc.apply(ParDo.named("Prepare").of(new ClearOperationStatus()));
  }

  return pc.apply(ParDo.named("Start").of(new StartTask(task, attempt)))
      .apply(new BreakFusion<KV<String, WorkflowArgs>>("AfterStarted"))
      .apply(ParDo.named("Wait").of(new WaitForOperation()));
}
 
Example #17
Source File: MaybeEmptyTestITCase.java    From flink-dataflow with Apache License 2.0
@Override
protected void testProgram() throws Exception {

	Pipeline p = FlinkTestPipeline.createForBatch();

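	// Pipe a single null (Void) element through a DoFn that emits the expected string.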
	p.apply(Create.of((Void) null)).setCoder(VoidCoder.of())
			.apply(ParDo.of(
					new DoFn<Void, String>() {
						@Override
						public void processElement(DoFn<Void, String>.ProcessContext c) {
							c.output(expected);
						}
					})).apply(TextIO.Write.to(resultPath));
	p.run();
}
 
Example #18
Source File: OnDemandLiveStateChecker.java    From policyscanner with Apache License 2.0
private PCollection<String> constructPipeline(Pipeline pipeline,
    BoundedSource<KV<List<String>, String>> knownGoodSource) {
  // Read files from GCS.
  PCollection<KV<List<String>, String>> knownGoodFiles =
      pipeline.apply("Read known-good data", Read.from(knownGoodSource));
  // Convert files to GCPResourceState objects.
  PCollection<KV<GCPResource, GCPResourceState>> knownGoodStates =
      knownGoodFiles.apply(ParDo.named("Convert file data to Java objects")
          .of(new FileToState()));
  // Tag the state objects to indicate they're from a checked-in repo and not live.
  PCollection<KV<GCPResource, KV<StateSource, GCPResourceState>>> taggedKnownGoodStates =
      knownGoodStates.apply(ParDo.named("Mark states as being known-good")
          .of(new TagStateWithSource(StateSource.DESIRED)));

  // Extract a list of checked-in projects from GCS.
  PCollection<List<String>> allFilePaths = knownGoodFiles
      .apply("Extract just the file paths", ParDo.of(new FilePathFromPair()));
  // Read the live version of the states of the checked-in projects.
  PCollection<KV<GCPResource, GCPResourceState>> liveStates =
      allFilePaths.apply(ParDo.named("Get live resource and states from file path")
          .of(new FilePathToLiveState()));
  // Tag the states to indicate they're live and not from a checked-in source.
  PCollection<KV<GCPResource, KV<StateSource, GCPResourceState>>> taggedLiveStates =
      liveStates.apply(ParDo.named("Mark states as being live")
          .of(new TagStateWithSource(StateSource.LIVE)));

  // Join the two known-good and the live halves.
  PCollectionView<Map<GCPResource, KV<StateSource, GCPResourceState>>> liveStatesView =
      taggedLiveStates.apply(View.<GCPResource, KV<StateSource, GCPResourceState>>asMap());
  PCollection<KV<GCPResource, Map<StateSource, GCPResourceState>>> mismatchedStates =
      taggedKnownGoodStates.apply(ParDo.named("Find states that don't match")
          .withSideInputs(liveStatesView)
          .of(new FilterOutMatchingState(liveStatesView)));
  // Construct an alert message for all the discrepancies found.
  return mismatchedStates.apply(ParDo
      .named("Generate notification messages")
      .of(new StateDiscrepancyMessenger()));
}
 
Example #19
Source File: AvroITCase.java    From flink-dataflow with Apache License 2.0
private static void runProgram(String tmpPath, String resultPath) {
	Pipeline p = FlinkTestPipeline.createForBatch();

	p
		.apply(Create.of(
				new User("Joe", 3, "red"),
				new User("Mary", 4, "blue"),
				new User("Mark", 1, "green"),
				new User("Julia", 5, "purple"))
			.withCoder(AvroCoder.of(User.class)))

		.apply(AvroIO.Write.to(tmpPath)
			.withSchema(User.class));

	p.run();

	p = FlinkTestPipeline.createForBatch();

	p
		.apply(AvroIO.Read.from(tmpPath).withSchema(User.class).withoutValidation())

			.apply(ParDo.of(new DoFn<User, String>() {
				@Override
				public void processElement(ProcessContext c) throws Exception {
					User u = c.element();
					String result = u.getName() + " " + u.getFavoriteColor() + " " + u.getFavoriteNumber();
					c.output(result);
				}
			}))

		.apply(TextIO.Write.to(resultPath));

	p.run();
}
 
Example #20
Source File: BreakFusion.java    From dockerflow with Apache License 2.0
@Override
public PCollection<T> apply(PCollection<T> input) {
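  // Key every element (DummyMapFn), take the first value per key, then drop the keys;
  // the shuffle introduced by the per-key combine breaks fusion of adjacent steps.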
  return input
      .apply(ParDo.named("BreakFusion").of(new DummyMapFn<T>()))
      .apply(Combine.<String, T>perKey(new First<T>()))
      .apply(Values.<T>create());
}
 
Example #21
Source File: FlinkBatchTransformTranslators.java    From flink-dataflow with Apache License 2.0
@Override
public void translateNode(ParDo.BoundMulti<IN, OUT> transform, FlinkBatchTranslationContext context) {
	DataSet<IN> inputDataSet = context.getInputDataSet(context.getInput(transform));

	final DoFn<IN, OUT> doFn = transform.getFn();

	Map<TupleTag<?>, PCollection<?>> outputs = context.getOutput(transform).getAll();

	Map<TupleTag<?>, Integer> outputMap = Maps.newHashMap();
	// put the main output at index 0, FlinkMultiOutputDoFnFunction also expects this
	outputMap.put(transform.getMainOutputTag(), 0);
	int count = 1;
	for (TupleTag<?> tag: outputs.keySet()) {
		if (!outputMap.containsKey(tag)) {
			outputMap.put(tag, count++);
		}
	}

	// collect all output Coders and create a UnionCoder for our tagged outputs
	List<Coder<?>> outputCoders = Lists.newArrayList();
	for (PCollection<?> coll: outputs.values()) {
		outputCoders.add(coll.getCoder());
	}

	UnionCoder unionCoder = UnionCoder.of(outputCoders);

	@SuppressWarnings("unchecked")
	TypeInformation<RawUnionValue> typeInformation = new CoderTypeInformation<>(unionCoder);

	@SuppressWarnings("unchecked")
	FlinkMultiOutputDoFnFunction<IN, OUT> doFnWrapper = new FlinkMultiOutputDoFnFunction(doFn, context.getPipelineOptions(), outputMap);
	MapPartitionOperator<IN, RawUnionValue> outputDataSet = new MapPartitionOperator<>(inputDataSet, typeInformation, doFnWrapper, transform.getName());

	transformSideInputs(transform.getSideInputs(), outputDataSet, context);

	for (Map.Entry<TupleTag<?>, PCollection<?>> output: outputs.entrySet()) {
		TypeInformation<Object> outputType = context.getTypeInfo(output.getValue());
		int outputTag = outputMap.get(output.getKey());
		FlinkMultiOutputPruningFunction<Object> pruningFunction = new FlinkMultiOutputPruningFunction<>(outputTag);
		FlatMapOperator<RawUnionValue, Object> pruningOperator =
				new FlatMapOperator<>(outputDataSet, outputType, pruningFunction,
						output.getValue().getName());
		context.setOutputDataSet(output.getValue(), pruningOperator);

	}
}
 
Example #22
Source File: ParDoMultiOutputITCase.java    From flink-dataflow with Apache License 2.0
@Override
protected void testProgram() throws Exception {
	Pipeline p = FlinkTestPipeline.createForBatch();

	PCollection<String> words = p.apply(Create.of("Hello", "Whatupmyman", "hey", "SPECIALthere", "MAAA", "MAAFOOO"));

	// Select words whose length is below a cutoff,
	// plus the lengths of words that are above the cutoff.
	// Also select words starting with "MAA".
	final int wordLengthCutOff = 3;
	// Create tags to use for the main and side outputs.
	final TupleTag<String> wordsBelowCutOffTag = new TupleTag<String>(){};
	final TupleTag<Integer> wordLengthsAboveCutOffTag = new TupleTag<Integer>(){};
	final TupleTag<String> markedWordsTag = new TupleTag<String>(){};

	PCollectionTuple results =
			words.apply(ParDo
					.withOutputTags(wordsBelowCutOffTag, TupleTagList.of(wordLengthsAboveCutOffTag)
							.and(markedWordsTag))
					.of(new DoFn<String, String>() {
						final TupleTag<String> specialWordsTag = new TupleTag<String>() {
						};

						public void processElement(ProcessContext c) {
							String word = c.element();
							if (word.length() <= wordLengthCutOff) {
								c.output(word);
							} else {
								c.sideOutput(wordLengthsAboveCutOffTag, word.length());
							}
							if (word.startsWith("MAA")) {
								c.sideOutput(markedWordsTag, word);
							}

							if (word.startsWith("SPECIAL")) {
								c.sideOutput(specialWordsTag, word);
							}
						}
					}));

	// Extract the PCollection results, by tag.
	PCollection<String> wordsBelowCutOff = results.get(wordsBelowCutOffTag);
	PCollection<Integer> wordLengthsAboveCutOff = results.get(wordLengthsAboveCutOffTag);
	PCollection<String> markedWords = results.get(markedWordsTag);

	markedWords.apply(TextIO.Write.to(resultPath));

	p.run();
}
 
Example #23
Source File: DockerDo.java    From dockerflow with Apache License 2.0
/** Scatter on the inputs to run multiple Docker tasks in parallel. */
public static ParDo.Bound<KV<String, WorkflowArgs>, KV<String, WorkflowArgs>> scatter(Task t) {
  return ParDo.named("Scatter").of(new ScatterTasks(t));
}
 
Example #24
Source File: DockerDo.java    From dockerflow with Apache License 2.0
@Override
public PCollection<KV<String, WorkflowArgs>> apply(
    PCollection<KV<String, WorkflowArgs>> input) {
  return input.apply(ParDo.of(new Outputs(task)));
}
 
Example #25
Source File: FXTimeSeriesPipelineSRGTests.java    From data-timeseries-java with Apache License 2.0
@org.junit.Test
public void testCompleteCandleDataOneStream() {

  Pipeline pipeline = setup();

  List<KV<String, TSProto>> pipelineData = GenerateSampleData.getTestData();
  WorkPacketConfig packetConfig =
      GenerateSampleData.generateWorkPacketConfig(2, new String[] {GenerateSampleData.TS1});

  Map<String, TSProto> map = generateMapData(pipelineData);

  // Run test with TS-1 data only

  List<KV<String, TSProto>> ts1Only = new ArrayList<>();

  for (String ts : map.keySet()) {
    if (extractKey(ts).equals(GenerateSampleData.TS1)) {
      ts1Only.add(KV.of(extractKey(ts), map.get(ts)));
    }
  }

  List<KV<String, TSProto>> testData = new ArrayList<KV<String, TSProto>>(ts1Only);

  PCollection<KV<String, TSAggValueProto>> completeAggs =
      createCompleteAggregates(pipeline, ts1Only, packetConfig);

  PCollection<SimpleAggTester> simpleAgg =
      completeAggs.apply(ParDo.of(new DoFn<KV<String, TSAggValueProto>, SimpleAggTester>() {

        @Override
        public void processElement(
            DoFn<KV<String, TSAggValueProto>, SimpleAggTester>.ProcessContext c) throws Exception {

          c.output(SimpleAggTester.newBuilder().setKey(c.element().getKey())
              .setCloseTime(c.element().getValue().getCloseTime())
              .setOpenStateTime(c.element().getValue().getOpenState().getTime())
              .setCloseStateTime(c.element().getValue().getCloseState().getTime())
              .setMinAskPrice(c.element().getValue().getMinAskValue().getAskPrice())
              .setMaxAskPrice(c.element().getValue().getMaxAskValue().getAskPrice())
              .setMinBidPrice(c.element().getValue().getMinBidValue().getBidPrice())
              .setMaxBidPrice(c.element().getValue().getMaxBidValue().getBidPrice()).build());

        }

      }));

  List<SimpleAggTester> expectedList = new ArrayList<>();

  String key = GenerateSampleData.TS1;

  expectedList.add(SimpleAggTester.newBuilder().setKey(key).setCloseTime(1451577719999L)
          .setOpenStateTime(1451577660000L).setCloseStateTime(1451577660000L).setMinAskPrice(1)
          .setMaxAskPrice(2).setMinBidPrice(1).setMaxBidPrice(2).build());

  expectedList.add(SimpleAggTester.newBuilder().setKey(key).setCloseTime(1451577839999L)
          .setOpenStateTime(1451577660000L).setCloseStateTime(1451577780000L).setMinAskPrice(3)
          .setMaxAskPrice(4).setMinBidPrice(3).setMaxBidPrice(4).build());

  expectedList.add(SimpleAggTester.newBuilder().setKey(key).setCloseTime(1451577959999L)
          .setOpenStateTime(1451577780000L).setCloseStateTime(1451577900000L).setMinAskPrice(5)
          .setMaxAskPrice(5).setMinBidPrice(5).setMaxBidPrice(5).build());

  expectedList.add(SimpleAggTester.newBuilder().setKey(key).setCloseTime(1451578079999L)
          .setOpenStateTime(1451577900000L).setCloseStateTime(1451578020000L).setMinAskPrice(3)
          .setMaxAskPrice(4).setMinBidPrice(3).setMaxBidPrice(4).build());

  expectedList.add(SimpleAggTester.newBuilder().setKey(key).setCloseTime(1451578199999L)
          .setOpenStateTime(1451578020000L).setCloseStateTime(1451578140000L).setMinAskPrice(1)
          .setMaxAskPrice(2).setMinBidPrice(1).setMaxBidPrice(2).build());

  DataflowAssert.that(simpleAgg).containsInAnyOrder(expectedList);

  pipeline.run();
}
 
Example #26
Source File: FXTimeSeriesPipelineSRGTests.java    From data-timeseries-java with Apache License 2.0
@org.junit.Test
public void testDataInput() {

  Pipeline pipeline = setup();

  PCollection<KV<String, TSProto>> tsData =
      setupDataInput(pipeline, GenerateSampleData.getTestData());

  LOG.info("Check that we have 42 elements in the Input PCollection");

  DataflowAssert.that(
      tsData.apply("TestInputElementCount", ParDo.of(new DoFn<KV<String, TSProto>, Integer>() {

        @Override
        public void processElement(DoFn<KV<String, TSProto>, Integer>.ProcessContext c)
            throws Exception {

          c.output(1);
        }

      })).apply(Sum.integersGlobally())).containsInAnyOrder(42);

  pipeline.run();

}
 
Example #27
Source File: FXTimeSeriesPipelineSRGTests.java    From data-timeseries-java with Apache License 2.0
public PCollection<KV<String, TSAggValueProto>> createCompleteAggregates(Pipeline pipeline,
    List<KV<String, TSProto>> data, WorkPacketConfig packetConfig) {

  PCollection<KV<String, TSProto>> completeWindowData =
      generateCompleteWindowData(pipeline, data, packetConfig);

  PCollection<KV<String, TSAggValueProto>> parital =
      completeWindowData.apply("CreatePartialAggregates",
          Combine.perKey(new PartialTimeSeriesAggCombiner()));

  PCollection<KV<String, TSAggValueProto>> paritalWithWindowBoundary =
      parital.apply(ParDo.of(new EmbedWindowTimeIntoAggregateDoFn()));

  PCollection<KV<String, TSAggValueProto>> completeAggregationStage1 =
      paritalWithWindowBoundary.apply(
          "completeAggregationStage1",
          Window.<KV<String, TSAggValueProto>>into(new GlobalWindows())
              .triggering(Repeatedly.forever(AfterPane.elementCountAtLeast(1)))
              .withOutputTimeFn(OutputTimeFns.outputAtEarliestInputTimestamp())
              .accumulatingFiredPanes());

  PCollection<KV<String, TSAggValueProto>> completeAggregationStage2 =
      completeAggregationStage1.apply("CreateCompleteCandles",
          Combine.perKey(new CompleteTimeSeriesAggCombiner())).apply("FlattenIterables",
          ParDo.of(new FlattenKVIterableDoFn()));

  PCollection<KV<String, TSAggValueProto>> completeAggregationStage3 =
      completeAggregationStage2.apply("ResetTimestampsAfterGlobalWindow",
          ParDo.of(new DoFn<KV<String, TSAggValueProto>, KV<String, TSAggValueProto>>() {

            @Override
            public void processElement(
                DoFn<KV<String, TSAggValueProto>, KV<String, TSAggValueProto>>.ProcessContext c)
                throws Exception {
              if (c.timestamp().isBefore(new Instant(32530703764000L))) {

                if (c.timestamp().isAfter(
                    new Instant(c.element().getValue().getCloseState().getTime()))) {

                  LOG.error("BUG There was a timestamp before current :: "
                      + TextFormat.shortDebugString(c.element().getValue()));

                } else {
                  c.outputWithTimestamp(c.element(), new Instant(c.element().getValue()
                      .getCloseTime()));

                }
              }

            }

          }));

  return completeAggregationStage3;

}
 
Example #28
Source File: CreateAggregatesTransform.java    From data-timeseries-java with Apache License 2.0
@Override
public PCollection<KV<String, TSAggValueProto>> apply(PCollection<KV<String, TSProto>> input) {

  PCollection<KV<String, TSProto>> windowedData =
      input.apply("CandleResolutionWindow", Window.<KV<String, TSProto>>into(
          FixedWindows.of(Duration.standardSeconds(options.getCandleResolution()))));

  // Determine streams that are missing in this Window and generate values for them

  PCollection<KV<String, TSProto>> generatedValues = windowedData
      .apply("DetectMissingTimeSeriesValues",
          Combine.globally(new DetectMissingTimeSeriesValuesCombiner(packetConfig))
              .withoutDefaults())
      .apply(ParDo.of(new CreateMissingTimeSeriesValuesDoFn()))
      .setName("CreateMissingTimeSeriesValues");

  // Flatten the live streams and the generated streams together

  PCollection<KV<String, TSProto>> completeWindowData =
      PCollectionList.of(windowedData).and(generatedValues).apply("MergeGeneratedLiveValues",
          Flatten.<KV<String, TSProto>>pCollections());

  // Create partial aggregates, at this stage we will not bring forward the previous windows close
  // value
  PCollection<KV<String, TSAggValueProto>> parital = completeWindowData
      .apply("CreatePartialAggregates", Combine.perKey(new PartialTimeSeriesAggCombiner()));

  // When these aggregates go through the Global Window they will lose their time value
  // We will embed the window close into the data so we can access it later on

  PCollection<KV<String, TSAggValueProto>> paritalWithWindowBoundary =
      parital.apply(ParDo.of(new EmbedWindowTimeIntoAggregateDoFn()));

  // Create a Global window which can retain the last value held in memory We must use
  // outputAtEarliestInputTimestamp as later on we re-attach the timestamp from within the data
  // point, for us not to hit 'skew' issues we need to ensure the output timestamp value is always
  // the smallest value
  PCollection<KV<String, TSAggValueProto>> completeAggregationStage1 =
      paritalWithWindowBoundary.apply("completeAggregationStage1",
          Window.<KV<String, TSAggValueProto>>into(new GlobalWindows())
              .triggering(Repeatedly.forever(AfterPane.elementCountAtLeast(1)))
              .withOutputTimeFn(OutputTimeFns.outputAtEarliestInputTimestamp())
              .accumulatingFiredPanes());

  PCollection<KV<String, TSAggValueProto>> completeAggregationStage2 = completeAggregationStage1
      .apply("CreateCompleteCandles", Combine.perKey(new CompleteTimeSeriesAggCombiner()))
      .apply("FlattenIterables", ParDo.of(new FlattenKVIterableDoFn()));



  // Reset timestamps after global window
  PCollection<KV<String, TSAggValueProto>> completeAggregationStage3 =
      completeAggregationStage2.apply("ResetTimestampsAfterGlobalWindow",
          ParDo.of(new DoFn<KV<String, TSAggValueProto>, KV<String, TSAggValueProto>>() {

            @Override
            public void processElement(
                DoFn<KV<String, TSAggValueProto>, KV<String, TSAggValueProto>>.ProcessContext c)
                throws Exception {
              //
              // TODO When the local Dataflow runner shuts down there will be some values
              // produced for the end of the GlobalWindow. We can remove these values by
              // filtering out anything from year 3000+ for now. A better solution would be to
              // check the WINDOW PANE
              //
              Instant time = c.timestamp();

              if (time.isBefore(new Instant(32530703764000L))) {

                // The timestamp produced from the Combiner after the GlobalWindow loses fidelity,
                // we can add this back by looking at the value in the data

                if (time
                    .isAfter(new Instant(c.element().getValue().getCloseState().getTime()))) {

                  LOG.error(
                      "There was a timestamp before earlier than the window and skew must be 0 :: "
                          + TextFormat.shortDebugString(c.element().getValue()));

                } else {
                  c.outputWithTimestamp(c.element(),
                      new Instant(c.element().getValue().getCloseTime()));

                }
              }

            }

          }));

  return completeAggregationStage3;
}
 
Example #29
Source File: DataflowFactory.java    From dockerflow with Apache License 2.0
/**
 * Dynamically construct a Dataflow from the workflow definition. The root PCollection has one
 * element, the root task's name.
 *
 * @param workflow the workflow definition
 * @param workflowArgs the per-instance workflow arguments, keyed by workflow name
 * @param o the Dataflow pipeline options
 * @return the constructed Dataflow pipeline
 * @throws IOException
 */
public static Pipeline dataflow(
    Workflow workflow, Map<String, WorkflowArgs> workflowArgs, DataflowPipelineOptions o)
    throws IOException {

  assert (workflow != null);
  assert (o != null);
  assert (workflow.getDefn() != null);

  // Set defaults
  if (o.getAppName() == null) {
    o.setAppName(workflow.getDefn().getName());
  }
  if (o.getProject() == null && workflow.getArgs() != null) {
    o.setProject(workflow.getArgs().getProjectId());
  }
  if (o.getMaxNumWorkers() == 0) {
    o.setMaxNumWorkers(1);
  }
  if (o.getWorkerMachineType() == null) {
    o.setWorkerMachineType(DEFAULT_MACHINE_TYPE);
  }

  LOG.info("Initializing dataflow pipeline");
  Pipeline p = Pipeline.create(o);

  LOG.info("Creating input collection of workflow args");
  if (workflowArgs == null) {
    workflowArgs = new HashMap<String, WorkflowArgs>();
  }
  if (workflowArgs.isEmpty()) {
    LOG.info("No workflow args were provided. Using default values.");
    workflowArgs.put(workflow.getDefn().getName(), new WorkflowArgs());
  } else if (workflow.getArgs() != null) {
    LOG.info("Merging default workflow args with instance-specific args");

    for (String key : workflowArgs.keySet()) {
      WorkflowArgs instanceArgs = workflowArgs.get(key);
      instanceArgs.mergeDefaultArgs(workflow.getArgs());
      LOG.debug("Merged args: " + StringUtils.toJson(instanceArgs));
    }
  }

  LOG.info("Creating dataflow pipeline for workflow " + workflow.getDefn().getName());
  PCollection<KV<String, WorkflowArgs>> input = p.apply(Create.of(workflowArgs));
  input = dataflow(Workflow.Steps.graph(workflow), input);

  if (workflowArgs.values().iterator().next().getDeleteFiles()) {
    LOG.info("Intermediate files will be deleted");
    input =
        input.apply(
            ParDo.named("DeleteIntermediateFiles").of(new DeleteIntermediateFiles(workflow)));
  }

  return p;
}
 
Example #30
Source File: FlinkBatchTransformTranslators.java    From flink-dataflow with Apache License 2.0
@Override
public void translateNode(ParDo.Bound<IN, OUT> transform, FlinkBatchTranslationContext context) {
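	// Translate a ParDo.Bound into a Flink MapPartitionOperator that runs the wrapped DoFn.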
	DataSet<IN> inputDataSet = context.getInputDataSet(context.getInput(transform));

	final DoFn<IN, OUT> doFn = transform.getFn();

	TypeInformation<OUT> typeInformation = context.getTypeInfo(context.getOutput(transform));

	FlinkDoFnFunction<IN, OUT> doFnWrapper = new FlinkDoFnFunction<>(doFn, context.getPipelineOptions());
	MapPartitionOperator<IN, OUT> outputDataSet = new MapPartitionOperator<>(inputDataSet, typeInformation, doFnWrapper, transform.getName());

	transformSideInputs(transform.getSideInputs(), outputDataSet, context);

	context.setOutputDataSet(context.getOutput(transform), outputDataSet);
}