com.google.cloud.dataflow.sdk.options.DataflowPipelineOptions Java Examples

The following examples show how to use com.google.cloud.dataflow.sdk.options.DataflowPipelineOptions. Each example is drawn from an open-source project; the source file and project are noted above it.
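Most of the examples below follow the same basic pattern: obtain the options with PipelineOptionsFactory.as, set the project, staging location, and runner, then hand the options to Pipeline.create. A minimal sketch, with placeholder project and bucket names:

// Minimal sketch; "my-project" and the staging bucket are placeholders.
DataflowPipelineOptions options = PipelineOptionsFactory.as(DataflowPipelineOptions.class);
options.setProject("my-project");
options.setStagingLocation("gs://my-bucket/staging");
options.setRunner(DataflowPipelineRunner.class);
Pipeline p = Pipeline.create(options);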
Example #1
Source File: DataflowFactory.java    From dockerflow with Apache License 2.0
/**
 * Create Dataflow pipeline options from the standard command-line options, such as
 * "--project=", "--runner=" and "--stagingLocation=".
 *
 * @param args the command-line arguments to parse
 * @return the configured Dataflow pipeline options
 * @throws IOException if the options cannot be created
 */
public static DataflowPipelineOptions pipelineOptions(String[] args) throws IOException {
  LOG.info("Set up Dataflow options");
  DataflowPipelineOptions o = PipelineOptionsFactory.as(DataflowPipelineOptions.class);

  Map<String, String> m = StringUtils.parseArgs(args);
  o.setProject(m.get(PROJECT));
  if (m.containsKey(STAGING)) {
    o.setStagingLocation(m.get(STAGING));
  } else if (m.containsKey(STAGING_LOCATION)) {
    o.setStagingLocation(m.get(STAGING_LOCATION));
  } else if (m.containsKey(WORKSPACE)) {
    o.setStagingLocation(m.get(WORKSPACE) + "/staging");
  }
  o.setRunner(runner(m.get(RUNNER)));
  o.setMaxNumWorkers(m.get(MAX_WORKERS) == null ? 1 : Integer.parseInt(m.get(MAX_WORKERS)));
  if (m.containsKey(MACHINE_TYPE)) {
    o.setWorkerMachineType(m.get(MACHINE_TYPE));
  } else {
    o.setWorkerMachineType(DEFAULT_MACHINE_TYPE);
  }
  return o;
}
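A hypothetical invocation of this factory method; the flag names come from the javadoc above, while their exact parsing is up to StringUtils.parseArgs and the dockerflow option constants, so treat the values below as assumptions:

// Hypothetical invocation; the runner value and flag parsing are assumptions.
String[] args = {
    "--project=my-project",
    "--runner=DataflowPipelineRunner",
    "--stagingLocation=gs://my-bucket/staging"
};
DataflowPipelineOptions o = DataflowFactory.pipelineOptions(args);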
 
Example #2
Source File: TaskRunner.java    From dockerflow with Apache License 2.0
/** Run a Docker workflow on Dataflow. */
public static void run(Workflow w, Map<String, WorkflowArgs> a, DataflowPipelineOptions o)
    throws IOException {
  LOG.info("Running workflow graph");
  if (w.getArgs().getProjectId() == null) {
    throw new IllegalArgumentException("Project id is required");
  }

  Pipeline p = DataflowFactory.dataflow(w, a, o);

  LOG.info("Created Dataflow pipeline");
  LOG.debug(w.toString());

  PipelineResult r = p.run();

  LOG.info("Dataflow pipeline completed");
  LOG.info("Result state: " + r.getState());
}
 
Example #3
Source File: LiveStateCheckerApp.java    From policyscanner with Apache License 2.0
private PipelineOptions getCloudExecutionOptions(String stagingLocation) {
  DataflowPipelineOptions options = PipelineOptionsFactory.as(DataflowPipelineOptions.class);
  options.setProject(Constants.PROJECT_ID);
  options.setStagingLocation(stagingLocation);
  options.setRunner(BlockingDataflowPipelineRunner.class);
  return options;
}
 
Example #4
Source File: UserManagedKeysApp.java    From policyscanner with Apache License 2.0
private PipelineOptions getCloudExecutionOptions(String stagingLocation) {
  DataflowPipelineOptions options = PipelineOptionsFactory.as(DataflowPipelineOptions.class);
  options.setProject(SystemProperty.applicationId.get());
  options.setStagingLocation(stagingLocation);
  options.setRunner(BlockingDataflowPipelineRunner.class);
  return options;
}
 
Example #5
Source File: LiveStateCheckerRunner.java    From policyscanner with Apache License 2.0
private static PipelineOptions getCloudExecutionOptions(String stagingLocation) {
  DataflowPipelineOptions options = PipelineOptionsFactory.as(DataflowPipelineOptions.class);
  options.setProject(SystemProperty.applicationId.get());
  options.setStagingLocation(stagingLocation);
  options.setRunner(BlockingDataflowPipelineRunner.class);
  return options;
}
 
Example #6
Source File: DesiredStateEnforcerApp.java    From policyscanner with Apache License 2.0
private PipelineOptions getCloudExecutionOptions(String stagingLocation) {
  DataflowPipelineOptions options = PipelineOptionsFactory.as(DataflowPipelineOptions.class);
  options.setProject(SystemProperty.applicationId.get());
  options.setStagingLocation(stagingLocation);
  options.setRunner(BlockingDataflowPipelineRunner.class);
  return options;
}
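Examples #3 through #6 differ only in how the project ID is looked up. Because they all set BlockingDataflowPipelineRunner, a pipeline created from these options runs synchronously. A sketch of how the returned options might be used from within one of these classes (the transform chain is omitted):

// Sketch: BlockingDataflowPipelineRunner makes p.run() wait for job completion.
PipelineOptions options = getCloudExecutionOptions("gs://my-bucket/staging");
Pipeline p = Pipeline.create(options);
// ... apply transforms ...
p.run();  // blocks until the Dataflow job finishes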
 
Example #7
Source File: WorkflowDefn.java    From dockerflow with Apache License 2.0
/**
 * The WorkflowDefn implementation is responsible for defining the workflow steps and default
 * args, and for creating a Dataflow pipeline.
 *
 * @throws IOException if the pipeline cannot be constructed
 */
default Pipeline createDataflow(
    Map<String, WorkflowArgs> argsTable, DataflowPipelineOptions pipelineOptions, String[] args)
    throws IOException {
  return DataflowBuilder.of(createWorkflow(args))
      .createFrom(argsTable)
      .pipelineOptions(pipelineOptions)
      .build();
}
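A sketch of how a concrete workflow definition might drive this default method, assuming MyWorkflow implements WorkflowDefn and that ArgsTableBuilder produces the args table as in Example #8; all names here are illustrative:

// Hypothetical driver; MyWorkflow is an assumed WorkflowDefn implementation
// that defines createWorkflow(args).
DataflowPipelineOptions opts = DataflowFactory.pipelineOptions(args);
Map<String, WorkflowArgs> argsTable = ArgsTableBuilder.fromArgs(args).build();
Pipeline p = new MyWorkflow().createDataflow(argsTable, opts, args);
p.run();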
 
Example #8
Source File: MultiLinearGraph.java    From dockerflow with Apache License 2.0
/**
 * For simple linear graphs, it's not too hard to generate the Dataflow pipeline yourself. Here's
 * the equivalent Dataflow code for this simple example.
 */
public static void manualDataflow(String[] args) throws IOException {
  LOG.info("Parsing Dataflow options");
  DataflowPipelineOptions o = DataflowFactory.pipelineOptions(args);
  o.setAppName(MultiLinearGraph.class.getSimpleName());
  Pipeline p = Pipeline.create(o);

  p.apply(Create.of(ArgsTableBuilder.fromArgs(args).build()))
      .apply(DockerDo.of(taskOne()))
      .apply(DockerDo.of(taskTwo()));
  p.run();
}
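For contrast, a sketch of what the declarative counterpart might look like using the DataflowBuilder chain from Examples #7 and #11; the createWorkflow(args) call standing in for however the two-task workflow is assembled is an assumption:

// Hypothetical declarative equivalent of manualDataflow(); createWorkflow(args)
// is a stand-in for assembling the taskOne -> taskTwo workflow definition.
Pipeline p = DataflowBuilder.of(createWorkflow(args))
    .createFrom(ArgsTableBuilder.fromArgs(args).build())
    .pipelineOptions(DataflowFactory.pipelineOptions(args))
    .build();
p.run();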
 
Example #9
Source File: GatkPairedSingleSampleAlt.java    From dockerflow with Apache License 2.0
/**
 * Only this one method is different from GatkPairedSingleSample.java.
 */
@Override
public Pipeline createDataflow(
    Map<String, WorkflowArgs> argsTable,
    DataflowPipelineOptions pipelineOptions,
    String[] args) throws IOException {

  DataflowPipelineOptions o = DataflowFactory.pipelineOptions(args);
  o.setAppName(GatkPairedSingleSampleAlt.class.getSimpleName());
  Pipeline p = Pipeline.create(o);

  // Merge sample-specific args with default workflow args
  for (String key : argsTable.keySet()) {
    WorkflowArgs instanceArgs = argsTable.get(key);
    instanceArgs.mergeDefaultArgs(workflowArgs);
  }

  // Declarations
  PCollection<KV<String, WorkflowArgs>> mainBranch, branchOne, branchTwo;
  PCollectionList<KV<String, WorkflowArgs>> mergeList;

  // Construct the workflow graph
  mainBranch = p.apply(Create.of(argsTable));
  branchOne = mainBranch.apply(DockerDo.of(CreateSequenceGroupingTSV));
  branchTwo = mainBranch.apply(DockerDo.of(BwaVersion))
      .apply(DockerDo.of(SamToFastqAndBwaMem))
      .apply(DockerDo.of(MergeBamAlignment))
      .apply(DockerDo.of(SortAndFixReadGroupBam))
      .apply(DockerDo.of(MarkDuplicates))
      .apply(DockerDo.of(SortAndFixSampleBam));
  mergeList = PCollectionList.of(branchOne).and(branchTwo);
  mainBranch = mergeList.apply(new MergeBranches())
      .apply(DockerDo.of(BaseRecalibrator))
      .apply(DockerDo.of(ApplyBQSR))
      .apply(DockerDo.of(GatherBqsrReports))
      .apply(DockerDo.of(ApplyBQSRToUnmappedReads))
      .apply(DockerDo.of(GatherBamFiles));
  branchOne = mainBranch.apply(DockerDo.of(ConvertToCram));
  branchTwo = mainBranch.apply(DockerDo.of(HaplotypeCaller))
      .apply(DockerDo.of(GatherVCFs));
  mergeList = PCollectionList.of(branchOne).and(branchTwo);
  mainBranch = mergeList.apply(new MergeBranches());

  return p;
}
 
Example #10
Source File: DataflowFactory.java    From dockerflow with Apache License 2.0
/**
 * Dynamically construct a Dataflow pipeline from the workflow definition. The root PCollection
 * has one element, the root task's name.
 *
 * @param workflow the workflow definition to run
 * @param workflowArgs per-instance workflow args, keyed by workflow name
 * @param o the Dataflow pipeline options
 * @return the constructed pipeline
 * @throws IOException if the pipeline cannot be constructed
 */
public static Pipeline dataflow(
    Workflow workflow, Map<String, WorkflowArgs> workflowArgs, DataflowPipelineOptions o)
    throws IOException {

  assert (workflow != null);
  assert (o != null);
  assert (workflow.getDefn() != null);

  // Set defaults
  if (o.getAppName() == null) {
    o.setAppName(workflow.getDefn().getName());
  }
  if (o.getProject() == null && workflow.getArgs() != null) {
    o.setProject(workflow.getArgs().getProjectId());
  }
  if (o.getMaxNumWorkers() == 0) {
    o.setMaxNumWorkers(1);
  }
  if (o.getWorkerMachineType() == null) {
    o.setWorkerMachineType(DEFAULT_MACHINE_TYPE);
  }

  LOG.info("Initializing dataflow pipeline");
  Pipeline p = Pipeline.create(o);

  LOG.info("Creating input collection of workflow args");
  if (workflowArgs == null) {
    workflowArgs = new HashMap<String, WorkflowArgs>();
  }
  if (workflowArgs.isEmpty()) {
    LOG.info("No workflow args were provided. Using default values.");
    workflowArgs.put(workflow.getDefn().getName(), new WorkflowArgs());
  } else if (workflow.getArgs() != null) {
    LOG.info("Merging default workflow args with instance-specific args");

    for (String key : workflowArgs.keySet()) {
      WorkflowArgs instanceArgs = workflowArgs.get(key);
      instanceArgs.mergeDefaultArgs(workflow.getArgs());
      LOG.debug("Merged args: " + StringUtils.toJson(instanceArgs));
    }
  }

  LOG.info("Creating dataflow pipeline for workflow " + workflow.getDefn().getName());
  PCollection<KV<String, WorkflowArgs>> input = p.apply(Create.of(workflowArgs));
  input = dataflow(Workflow.Steps.graph(workflow), input);

  if (workflowArgs.values().iterator().next().getDeleteFiles()) {
    LOG.info("Intermediate files will be deleted");
    input =
        input.apply(
            ParDo.named("DeleteIntermediateFiles").of(new DeleteIntermediateFiles(workflow)));
  }

  return p;
}
 
Example #11
Source File: DataflowBuilder.java    From dockerflow with Apache License 2.0
public DataflowBuilder pipelineOptions(DataflowPipelineOptions options) {
  pipelineOptions = options;
  pipelineOptions.setAppName(workflow.getDefn().getName());
  return this;
}
 
Example #12
Source File: FlinkPipelineOptions.java    From flink-dataflow with Apache License 2.0
/**
 * The job name is used to identify jobs running on a Flink cluster.
 */
@Description("Dataflow job name, to uniquely identify active jobs. "
    + "Defaults to using the ApplicationName-UserName-Date.")
@Default.InstanceFactory(DataflowPipelineOptions.JobNameFactory.class)
String getJobName();
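 
@Default.InstanceFactory defers the default value to a factory class. A minimal sketch of the DefaultValueFactory contract such a factory implements (the naming scheme below is illustrative; the real JobNameFactory in DataflowPipelineOptions normalizes the app name, user name, and a timestamp):

// Sketch of a DefaultValueFactory; the naming scheme is illustrative only.
public static class MyJobNameFactory implements DefaultValueFactory<String> {
  @Override
  public String create(PipelineOptions options) {
    ApplicationNameOptions app = options.as(ApplicationNameOptions.class);
    return (app.getAppName() + "-" + System.getProperty("user.name")
        + "-" + System.currentTimeMillis()).toLowerCase();
  }
}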