Java Code Examples for com.google.cloud.dataflow.sdk.Pipeline#apply()

The following examples show how to use com.google.cloud.dataflow.sdk.Pipeline#apply(). Each example is taken from an open-source project, named in the header above the code.
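As a quick orientation before the examples, here is a minimal sketch of the two apply() overloads used throughout this page: the one-argument form, where the transform supplies a default step name, and the two-argument form, where you name the step yourself. The class name ApplyExample and the choice of transforms are illustrative only, not taken from any project below.

import com.google.cloud.dataflow.sdk.Pipeline;
import com.google.cloud.dataflow.sdk.options.PipelineOptionsFactory;
import com.google.cloud.dataflow.sdk.transforms.Count;
import com.google.cloud.dataflow.sdk.transforms.Create;
import com.google.cloud.dataflow.sdk.values.KV;
import com.google.cloud.dataflow.sdk.values.PCollection;

public class ApplyExample {
  public static void main(String[] args) {
    Pipeline p = Pipeline.create(PipelineOptionsFactory.fromArgs(args).create());

    // One-argument form: the transform provides a default step name.
    PCollection<String> words = p.apply(Create.of("a", "b", "a"));

    // Two-argument form: the string becomes the step name shown in the
    // monitoring UI, and must be unique within the pipeline.
    PCollection<KV<String, Long>> counts =
        words.apply("CountWords", Count.<String>perElement());

    p.run();
  }
}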
Example 1
Source File: ExportedServiceAccountKeyRemover.java    From policyscanner with Apache License 2.0
private PCollection<String> constructPipeline(Pipeline pipeline, String org) {
  // Read projects from the CRM API.
  PCollection<GCPProject> projects =
      pipeline.apply(Read.from(new LiveProjectSource(org)));
  // List the service accounts of the projects.
  PCollection<GCPServiceAccount> serviceAccounts =
      projects.apply(ParDo.named("List Service Accounts").of(new ListServiceAccounts()));
  // List the keys of the service accounts.
  PCollection<GCPServiceAccountKey> serviceAccountKeys =
      serviceAccounts.apply(ParDo.named("List Service Account Keys")
          .of(new ListServiceAccountKeys()));
  // Remove the user-managed keys and construct an alert message for each one removed.
  return serviceAccountKeys.apply(ParDo
      .named("Remove user-managed keys")
      .of(new ExportedServiceAccountKeyMessenger()));
}
 
Example 2
Source File: LoadBooksTest.java    From cloud-bigtable-examples with Apache License 2.0
@Test
public void parseBooks_returnsNgramsCounts() {
  // Arrange
  Pipeline p = TestPipeline.create();
  PCollection<String> input = p.apply(Create.of(testFile));

  // Act
  PCollection<KV<String, Integer>> output = LoadBooks.applyPipelineToParseBooks(input);

  // Assert
  DataflowAssert.that(output)
      .containsInAnyOrder(
          KV.of("despatch when art", 10),
          KV.of("despatch when came", 10),
          KV.of("despatch when published", 12),
          KV.of("despatch where was", 10),
          KV.of("despatch which made", 45),
          // There are two entries for "despatch which addressed".
          // Each entry has a different part of speech for "addressed".
          KV.of("despatch which addressed", 12 + 46),
          KV.of("despatch which admitted", 13),
          KV.of("despatch which allow", 14),
          KV.of("despatch which announced", 50),
          KV.of("despatch which answer", 32));
}
 
Example 3
Source File: DesiredStateEnforcer.java    From policyscanner with Apache License 2.0
private PCollection<String> constructPipeline(Pipeline pipeline, String org,
    BoundedSource<KV<List<String>, String>> knownGoodSource) {
  // Read files from GCS.
  PCollection<KV<List<String>, String>> knownGoodFiles =
      pipeline.apply("Read known-good data", Read.from(knownGoodSource));
  // Convert files to GCPResourceState objects.
  PCollection<KV<GCPResource, GCPResourceState>> knownGoodStates =
      knownGoodFiles.apply(ParDo.named("Convert file data to Java Objects")
          .of(new FileToState()));
  // Tag the state objects to indicate they're from a checked-in repo and not live.
  PCollection<KV<GCPResource, KV<StateSource, GCPResourceState>>> taggedKnownGoodStates =
      knownGoodStates.apply(ParDo.named("Mark states as being known-good")
          .of(new TagStateWithSource(StateSource.DESIRED)));

  // Read projects from the CRM API.
  PCollection<GCPProject> allProjects =
      pipeline.apply("Read live projects", Read.from(new LiveProjectSource(org)));
  // Extract project states.
  PCollection<KV<GCPResource, GCPResourceState>> liveStates =
      allProjects
          .apply(ParDo.named("Extract project policies").of(new ExtractState()));
  // Tag the states to indicate they're live and not from a checked-in source.
  PCollection<KV<GCPResource, KV<StateSource, GCPResourceState>>> taggedLiveStates =
      liveStates.apply(ParDo.named("Mark states as being live")
          .of(new TagStateWithSource(StateSource.LIVE)));

  // Join the known-good half with the live half.
  PCollectionView<Map<GCPResource, KV<StateSource, GCPResourceState>>> knownGoodStatesView =
      taggedKnownGoodStates.apply(View.<GCPResource, KV<StateSource, GCPResourceState>>asMap());
  PCollection<KV<GCPResource, Map<StateSource, GCPResourceState>>> mismatchedStates =
      taggedLiveStates.apply(ParDo.named("Find states that don't match")
          .withSideInputs(knownGoodStatesView)
          .of(new FilterOutMatchingState(knownGoodStatesView)));

  // Construct an alert message for all the discrepancies found and fix the discrepancies.
  return mismatchedStates
      .apply(ParDo.named("Fix discrepancies").of(discrepancyAutoFixMessenger));
}
 
Example 4
Source File: OnDemandLiveStateChecker.java    From policyscanner with Apache License 2.0
private PCollection<String> constructPipeline(Pipeline pipeline,
    BoundedSource<KV<List<String>, String>> knownGoodSource) {
  // Read files from GCS.
  PCollection<KV<List<String>, String>> knownGoodFiles =
      pipeline.apply("Read known-good data", Read.from(knownGoodSource));
  // Convert files to GCPResourceState objects.
  PCollection<KV<GCPResource, GCPResourceState>> knownGoodStates =
      knownGoodFiles.apply(ParDo.named("Convert file data to Java objects")
          .of(new FileToState()));
  // Tag the state objects to indicate they're from a checked-in repo and not live.
  PCollection<KV<GCPResource, KV<StateSource, GCPResourceState>>> taggedKnownGoodStates =
      knownGoodStates.apply(ParDo.named("Mark states as being known-good")
          .of(new TagStateWithSource(StateSource.DESIRED)));

  // Extract a list of checked-in projects from GCS.
  PCollection<List<String>> allFilePaths = knownGoodFiles
      .apply("Extract just the file paths", ParDo.of(new FilePathFromPair()));
  // Read the live version of the states of the checked-in projects.
  PCollection<KV<GCPResource, GCPResourceState>> liveStates =
      allFilePaths.apply(ParDo.named("Get live resource and states from file path")
          .of(new FilePathToLiveState()));
  // Tag the states to indicate they're live and not from a checked-in source.
  PCollection<KV<GCPResource, KV<StateSource, GCPResourceState>>> taggedLiveStates =
      liveStates.apply(ParDo.named("Mark states as being live")
          .of(new TagStateWithSource(StateSource.LIVE)));

  // Join the known-good half with the live half.
  PCollectionView<Map<GCPResource, KV<StateSource, GCPResourceState>>> liveStatesView =
      taggedLiveStates.apply(View.<GCPResource, KV<StateSource, GCPResourceState>>asMap());
  PCollection<KV<GCPResource, Map<StateSource, GCPResourceState>>> mismatchedStates =
      taggedKnownGoodStates.apply(ParDo.named("Find states that don't match")
          .withSideInputs(liveStatesView)
          .of(new FilterOutMatchingState(liveStatesView)));
  // Construct an alert message for all the discrepancies found.
  return mismatchedStates.apply(ParDo
      .named("Generate notification messages")
      .of(new StateDiscrepancyMessenger()));
}
 
Example 5
Source File: JoinExamples.java    From flink-dataflow with Apache License 2.0
public static void main(String[] args) throws Exception {
	Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
	Pipeline p = Pipeline.create(options);
	// The following two apply() calls create the pipeline's two inputs,
	// one for each of the two BigQuery source tables.
	PCollection<TableRow> eventsTable = p.apply(BigQueryIO.Read.from(GDELT_EVENTS_TABLE));
	PCollection<TableRow> countryCodes = p.apply(BigQueryIO.Read.from(COUNTRY_CODES));
	PCollection<String> formattedResults = joinEvents(eventsTable, countryCodes);
	formattedResults.apply(TextIO.Write.to(options.getOutput()));
	p.run();
}
 
Example 6
Source File: GatkPairedSingleSampleAlt.java    From dockerflow with Apache License 2.0
/**
 * Only this one method is different from GatkPairedSingleSample.java.
 */
@Override
public Pipeline createDataflow(
    Map<String, WorkflowArgs> argsTable,
    DataflowPipelineOptions pipelineOptions,
    String[] args) throws IOException {

  DataflowPipelineOptions o = DataflowFactory.pipelineOptions(args);
  o.setAppName(GatkPairedSingleSampleAlt.class.getSimpleName());
  Pipeline p = Pipeline.create(o);

  // Merge sample-specific args with default workflow args
  for (String key : argsTable.keySet()) {
    WorkflowArgs instanceArgs = argsTable.get(key);
    instanceArgs.mergeDefaultArgs(workflowArgs);
  }

  // Declarations
  PCollection<KV<String, WorkflowArgs>> mainBranch, branchOne, branchTwo;
  PCollectionList<KV<String, WorkflowArgs>> mergeList;

  // Construct the workflow graph
  mainBranch = p.apply(Create.of(argsTable));
  branchOne = mainBranch.apply(DockerDo.of(CreateSequenceGroupingTSV));
  branchTwo = mainBranch.apply(DockerDo.of(BwaVersion))
      .apply(DockerDo.of(SamToFastqAndBwaMem))
      .apply(DockerDo.of(MergeBamAlignment))
      .apply(DockerDo.of(SortAndFixReadGroupBam))
      .apply(DockerDo.of(MarkDuplicates))
      .apply(DockerDo.of(SortAndFixSampleBam));
  mergeList = PCollectionList.of(branchOne).and(branchTwo);
  mainBranch = mergeList.apply(new MergeBranches())
      .apply(DockerDo.of(BaseRecalibrator))
      .apply(DockerDo.of(ApplyBQSR))
      .apply(DockerDo.of(GatherBqsrReports))
      .apply(DockerDo.of(ApplyBQSRToUnmappedReads))
      .apply(DockerDo.of(GatherBamFiles));
  branchOne = mainBranch.apply(DockerDo.of(ConvertToCram));
  branchTwo = mainBranch.apply(DockerDo.of(HaplotypeCaller))
      .apply(DockerDo.of(GatherVCFs));
  mergeList = PCollectionList.of(branchOne).and(branchTwo);
  mainBranch = mergeList.apply(new MergeBranches());

  return p;
}
 
Example 7
Source File: DataflowFactory.java    From dockerflow with Apache License 2.0
/**
 * Dynamically construct a Dataflow pipeline from the workflow definition. The root PCollection
 * has one element, the root task's name.
 *
 * @param workflow the workflow definition to translate into a Dataflow pipeline
 * @param workflowArgs the per-instance workflow arguments, keyed by workflow name
 * @param o the Dataflow pipeline options
 * @return the constructed pipeline
 * @throws IOException
 */
public static Pipeline dataflow(
    Workflow workflow, Map<String, WorkflowArgs> workflowArgs, DataflowPipelineOptions o)
    throws IOException {

  assert (workflow != null);
  assert (o != null);
  assert (workflow.getDefn() != null);

  // Set defaults
  if (o.getAppName() == null) {
    o.setAppName(workflow.getDefn().getName());
  }
  if (o.getProject() == null && workflow.getArgs() != null) {
    o.setProject(workflow.getArgs().getProjectId());
  }
  if (o.getMaxNumWorkers() == 0) {
    o.setMaxNumWorkers(1);
  }
  if (o.getWorkerMachineType() == null) {
    o.setWorkerMachineType(DEFAULT_MACHINE_TYPE);
  }

  LOG.info("Initializing dataflow pipeline");
  Pipeline p = Pipeline.create(o);

  LOG.info("Creating input collection of workflow args");
  if (workflowArgs == null) {
    workflowArgs = new HashMap<String, WorkflowArgs>();
  }
  if (workflowArgs.isEmpty()) {
    LOG.info("No workflow args were provided. Using default values.");
    workflowArgs.put(workflow.getDefn().getName(), new WorkflowArgs());
  } else if (workflow.getArgs() != null) {
    LOG.info("Merging default workflow args with instance-specific args");

    for (String key : workflowArgs.keySet()) {
      WorkflowArgs instanceArgs = workflowArgs.get(key);
      instanceArgs.mergeDefaultArgs(workflow.getArgs());
      LOG.debug("Merged args: " + StringUtils.toJson(instanceArgs));
    }
  }

  LOG.info("Creating dataflow pipeline for workflow " + workflow.getDefn().getName());
  PCollection<KV<String, WorkflowArgs>> input = p.apply(Create.of(workflowArgs));
  input = dataflow(Workflow.Steps.graph(workflow), input);

  if (workflowArgs.values().iterator().next().getDeleteFiles()) {
    LOG.info("Intermediate files will be deleted");
    input =
        input.apply(
            ParDo.named("DeleteIntermediateFiles").of(new DeleteIntermediateFiles(workflow)));
  }

  return p;
}
 
Example 8
Source File: FlattenizeITCase.java    From flink-dataflow with Apache License 2.0
@Override
protected void testProgram() throws Exception {
	Pipeline p = FlinkTestPipeline.createForBatch();

	PCollection<String> p1 = p.apply(Create.of(words));
	PCollection<String> p2 = p.apply(Create.of(words2));

	PCollectionList<String> list = PCollectionList.of(p1).and(p2);

	list.apply(Flatten.<String>pCollections()).apply(TextIO.Write.to(resultPath));

	PCollection<String> p3 = p.apply(Create.of(words3));

	PCollectionList<String> list2 = list.and(p3);

	list2.apply(Flatten.<String>pCollections()).apply(TextIO.Write.to(resultPath2));

	p.run();
}
 
Example 9
Source File: ParDoMultiOutputITCase.java    From flink-dataflow with Apache License 2.0
@Override
protected void testProgram() throws Exception {
	Pipeline p = FlinkTestPipeline.createForBatch();

	PCollection<String> words = p.apply(Create.of("Hello", "Whatupmyman", "hey", "SPECIALthere", "MAAA", "MAAFOOO"));

	// Select words whose length is below a cut-off,
	// plus the lengths of words that are above the cut-off.
	// Also select words starting with "MAA".
	final int wordLengthCutOff = 3;
	// Create tags to use for the main and side outputs.
	final TupleTag<String> wordsBelowCutOffTag = new TupleTag<String>(){};
	final TupleTag<Integer> wordLengthsAboveCutOffTag = new TupleTag<Integer>(){};
	final TupleTag<String> markedWordsTag = new TupleTag<String>(){};

	PCollectionTuple results =
			words.apply(ParDo
					.withOutputTags(wordsBelowCutOffTag, TupleTagList.of(wordLengthsAboveCutOffTag)
							.and(markedWordsTag))
					.of(new DoFn<String, String>() {
						// Note: this tag is never registered via withOutputTags,
						// and its output is never collected below.
						final TupleTag<String> specialWordsTag = new TupleTag<String>() {
						};

						public void processElement(ProcessContext c) {
							String word = c.element();
							if (word.length() <= wordLengthCutOff) {
								c.output(word);
							} else {
								c.sideOutput(wordLengthsAboveCutOffTag, word.length());
							}
							if (word.startsWith("MAA")) {
								c.sideOutput(markedWordsTag, word);
							}

							if (word.startsWith("SPECIAL")) {
								c.sideOutput(specialWordsTag, word);
							}
						}
					}));

	// Extract the PCollection results, by tag.
	PCollection<String> wordsBelowCutOff = results.get(wordsBelowCutOffTag);
	PCollection<Integer> wordLengthsAboveCutOff = results.get(wordLengthsAboveCutOffTag);
	PCollection<String> markedWords = results.get(markedWordsTag);

	markedWords.apply(TextIO.Write.to(resultPath));

	p.run();
}
 
Example 10
Source File: JoinExamplesITCase.java    From flink-dataflow with Apache License 2.0
@Override
protected void testProgram() throws Exception {

	Pipeline p = FlinkTestPipeline.createForBatch();

	PCollection<TableRow> input1 = p.apply(Create.of(EVENT_ARRAY));
	PCollection<TableRow> input2 = p.apply(Create.of(CC_ARRAY));

	PCollection<String> output = JoinExamples.joinEvents(input1, input2);

	output.apply(TextIO.Write.to(resultPath));

	p.run();
}