com.google.cloud.dataflow.sdk.transforms.Create Java Examples

The following examples show how to use com.google.cloud.dataflow.sdk.transforms.Create. Each example comes from an open-source project; the source file, project, and license are noted in its header.
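Before the project examples, here is a minimal, self-contained sketch of the pattern most of them share: seeding a pipeline with an in-memory collection via Create.of, and setting the element coder explicitly when coder inference is not sufficient. The class name, pipeline options, and element values below are illustrative only, not taken from any of the projects.

import com.google.cloud.dataflow.sdk.Pipeline;
import com.google.cloud.dataflow.sdk.coders.StringUtf8Coder;
import com.google.cloud.dataflow.sdk.options.PipelineOptionsFactory;
import com.google.cloud.dataflow.sdk.transforms.Create;
import com.google.cloud.dataflow.sdk.values.PCollection;

public class CreateExample {
  public static void main(String[] args) {
    // Illustrative options; a real pipeline would configure a runner and project.
    Pipeline p = Pipeline.create(PipelineOptionsFactory.fromArgs(args).create());

    // Create.of turns an in-memory collection into a PCollection. Setting the
    // coder explicitly avoids inference failures for types the coder registry
    // cannot resolve on its own.
    PCollection<String> lines =
        p.apply(Create.of("alpha", "beta", "gamma")).setCoder(StringUtf8Coder.of());

    p.run();
  }
}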
Example #1
Source File: LoadBooksTest.java    From cloud-bigtable-examples with Apache License 2.0
@Test
public void parseBooks_returnsNgramsCounts() {
  // Arrange
  Pipeline p = TestPipeline.create();
  PCollection<String> input = p.apply(Create.of(testFile));

  // Act
  PCollection<KV<String, Integer>> output = LoadBooks.applyPipelineToParseBooks(input);

  // Assert
  DataflowAssert.that(output)
      .containsInAnyOrder(
          KV.of("despatch when art", 10),
          KV.of("despatch when came", 10),
          KV.of("despatch when published", 12),
          KV.of("despatch where was", 10),
          KV.of("despatch which made", 45),
          // There are two entries for "despatch which addressed".
          // Each entry has a different part of speech for "addressed".
          KV.of("despatch which addressed", 12 + 46),
          KV.of("despatch which admitted", 13),
          KV.of("despatch which allow", 14),
          KV.of("despatch which announced", 50),
          KV.of("despatch which answer", 32));
}
 
Example #2
Source File: RemoveDuplicatesEmptyITCase.java    From flink-dataflow with Apache License 2.0
@Override
protected void testProgram() throws Exception {

	List<String> strings = Collections.emptyList();

	Pipeline p = FlinkTestPipeline.createForBatch();

	PCollection<String> input =
			p.apply(Create.of(strings))
					.setCoder(StringUtf8Coder.of());

	PCollection<String> output =
			input.apply(RemoveDuplicates.<String>create());

	output.apply(TextIO.Write.to(resultPath));
	p.run();
}
 
Example #3
Source File: WordCountJoin2ITCase.java    From flink-dataflow with Apache License 2.0
@Override
protected void testProgram() throws Exception {
	Pipeline p = FlinkTestPipeline.createForBatch();

	/* Create two PCollections and join them */
	PCollection<KV<String,Long>> occurences1 = p.apply(Create.of(WORDS_1))
			.apply(ParDo.of(new ExtractWordsFn()))
			.apply(Count.<String>perElement());

	PCollection<KV<String,Long>> occurences2 = p.apply(Create.of(WORDS_2))
			.apply(ParDo.of(new ExtractWordsFn()))
			.apply(Count.<String>perElement());

	/* CoGroup the two collections */
	PCollection<KV<String, CoGbkResult>> mergedOccurences = KeyedPCollectionTuple
			.of(tag1, occurences1)
			.and(tag2, occurences2)
			.apply(CoGroupByKey.<String>create());

	/* Format output */
	mergedOccurences.apply(ParDo.of(new FormatCountsFn()))
			.apply(TextIO.Write.named("test").to(resultPath));

	p.run();
}
 
Example #4
Source File: TfIdfITCase.java    From flink-dataflow with Apache License 2.0
@Override
protected void testProgram() throws Exception {

	Pipeline pipeline = FlinkTestPipeline.createForBatch();

	pipeline.getCoderRegistry().registerCoder(URI.class, StringDelegateCoder.of(URI.class));

	PCollection<KV<String, KV<URI, Double>>> wordToUriAndTfIdf = pipeline
			.apply(Create.of(
					KV.of(new URI("x"), "a b c d"),
					KV.of(new URI("y"), "a b c"),
					KV.of(new URI("z"), "a m n")))
			.apply(new TfIdf.ComputeTfIdf());

	PCollection<String> words = wordToUriAndTfIdf
			.apply(Keys.<String>create())
			.apply(RemoveDuplicates.<String>create());

	words.apply(TextIO.Write.to(resultPath));

	pipeline.run();
}
 
Example #5
Source File: FXTimeSeriesPipelineSRGTests.java    From data-timeseries-java with Apache License 2.0
public PCollection<KV<String, TSProto>> setupDataInput(Pipeline pipeline,
    List<KV<String, TSProto>> data) {

  // Convert the in-memory test data into a PCollection and assign each element
  // an event timestamp taken from its TSProto value.
  PCollection<KV<String, TSProto>> tsData =
      pipeline.apply("ReadData", Create.of(data))
          .apply(ParDo.of(new DoFn<KV<String, TSProto>, KV<String, TSProto>>() {

            @Override
            public void processElement(ProcessContext c) throws Exception {
              c.outputWithTimestamp(c.element(),
                  new DateTime(c.element().getValue().getTime()).toInstant());

            }

          })).setName("Assign TimeStamps");
  return tsData;

}
 
Example #6
Source File: SideInputITCase.java    From flink-dataflow with Apache License 2.0
@Override
protected void testProgram() throws Exception {

	Pipeline p = FlinkTestPipeline.createForBatch();

	final PCollectionView<String> sidesInput = p
			.apply(Create.of(expected))
			.apply(View.<String>asSingleton());

	p.apply(Create.of("bli"))
			.apply(ParDo.of(new DoFn<String, String>() {
				@Override
				public void processElement(ProcessContext c) throws Exception {
					String s = c.sideInput(sidesInput);
					c.output(s);
				}
			}).withSideInputs(sidesInput)).apply(TextIO.Write.to(resultPath));

	p.run();
}
 
Example #7
Source File: RemoveDuplicatesITCase.java    From flink-dataflow with Apache License 2.0
@Override
protected void testProgram() throws Exception {

	List<String> strings = Arrays.asList("k1", "k5", "k5", "k2", "k1", "k2", "k3");

	Pipeline p = FlinkTestPipeline.createForBatch();

	PCollection<String> input =
			p.apply(Create.of(strings))
					.setCoder(StringUtf8Coder.of());

	PCollection<String> output =
			input.apply(RemoveDuplicates.<String>create());

	output.apply(TextIO.Write.to(resultPath));
	p.run();
}
 
Example #8
Source File: WriteSinkITCase.java    From flink-dataflow with Apache License 2.0
private static void runProgram(String resultPath) {
	Pipeline p = FlinkTestPipeline.createForBatch();

	p.apply(Create.of(EXPECTED_RESULT)).setCoder(StringUtf8Coder.of())
		.apply("CustomSink", Write.to(new MyCustomSink(resultPath)));

	p.run();
}
 
Example #9
Source File: WordCountJoin3ITCase.java    From flink-dataflow with Apache License 2.0
@Override
protected void testProgram() throws Exception {

	Pipeline p = FlinkTestPipeline.createForBatch();

	/* Create three PCollections and join them */
	PCollection<KV<String,Long>> occurences1 = p.apply(Create.of(WORDS_1))
			.apply(ParDo.of(new ExtractWordsFn()))
			.apply(Count.<String>perElement());

	PCollection<KV<String,Long>> occurences2 = p.apply(Create.of(WORDS_2))
			.apply(ParDo.of(new ExtractWordsFn()))
			.apply(Count.<String>perElement());

	PCollection<KV<String,Long>> occurences3 = p.apply(Create.of(WORDS_3))
			.apply(ParDo.of(new ExtractWordsFn()))
			.apply(Count.<String>perElement());

	/* CoGroup the three collections */
	PCollection<KV<String, CoGbkResult>> mergedOccurences = KeyedPCollectionTuple
			.of(tag1, occurences1)
			.and(tag2, occurences2)
			.and(tag3, occurences3)
			.apply(CoGroupByKey.<String>create());

	/* Format output */
	mergedOccurences.apply(ParDo.of(new FormatCountsFn()))
			.apply(TextIO.Write.named("test").to(resultPath));

	p.run();
}
 
Example #10
Source File: WordCountITCase.java    From flink-dataflow with Apache License 2.0
@Override
protected void testProgram() throws Exception {

	Pipeline p = FlinkTestPipeline.createForBatch();

	PCollection<String> input = p.apply(Create.of(WORDS)).setCoder(StringUtf8Coder.of());

	input
			.apply(new WordCount.CountWords())
			.apply(MapElements.via(new WordCount.FormatAsTextFn()))
			.apply(TextIO.Write.to(resultPath));

	p.run();
}
 
Example #11
Source File: MaybeEmptyTestITCase.java    From flink-dataflow with Apache License 2.0
@Override
protected void testProgram() throws Exception {

	Pipeline p = FlinkTestPipeline.createForBatch();

	p.apply(Create.of((Void) null)).setCoder(VoidCoder.of())
			.apply(ParDo.of(
					new DoFn<Void, String>() {
						@Override
						public void processElement(DoFn<Void, String>.ProcessContext c) {
							c.output(expected);
						}
					})).apply(TextIO.Write.to(resultPath));
	p.run();
}
 
Example #12
Source File: AvroITCase.java    From flink-dataflow with Apache License 2.0
private static void runProgram(String tmpPath, String resultPath) {
	Pipeline p = FlinkTestPipeline.createForBatch();

	p
		.apply(Create.of(
				new User("Joe", 3, "red"),
				new User("Mary", 4, "blue"),
				new User("Mark", 1, "green"),
				new User("Julia", 5, "purple"))
			.withCoder(AvroCoder.of(User.class)))

		.apply(AvroIO.Write.to(tmpPath)
			.withSchema(User.class));

	p.run();

	p = FlinkTestPipeline.createForBatch();

	p
		.apply(AvroIO.Read.from(tmpPath).withSchema(User.class).withoutValidation())

			.apply(ParDo.of(new DoFn<User, String>() {
				@Override
				public void processElement(ProcessContext c) throws Exception {
					User u = c.element();
					String result = u.getName() + " " + u.getFavoriteColor() + " " + u.getFavoriteNumber();
					c.output(result);
				}
			}))

		.apply(TextIO.Write.to(resultPath));

	p.run();
}
 
Example #13
Source File: FilterOutMatchingStateTest.java    From policyscanner with Apache License 2.0
@Test
public void testFilterStateNoMatchingResources() {
  GCPProject checkedProject = getSampleProject("_checked");
  GCPProject liveProject = getSampleProject("_live");
  GCPResourceState checkedPolicy = getSampleGCPResourcePolicy(checkedProject, 1);
  GCPResourceState livePolicy = getSampleGCPResourcePolicy(liveProject, 2);
  List<KV<GCPResource, KV<StateSource, GCPResourceState>>> sideInputList =
      Arrays.asList(KV.of((GCPResource) checkedProject, KV.of(StateSource.DESIRED, checkedPolicy)));
  List<KV<GCPResource, KV<StateSource, GCPResourceState>>> mainInputList =
      Arrays.asList(KV.of((GCPResource) liveProject, KV.of(StateSource.LIVE, livePolicy)));

  PCollection<KV<GCPResource, KV<StateSource, GCPResourceState>>> collection =
      pipeline.apply(Create.of(sideInputList)).setCoder(
          KvCoder.of(SerializableCoder.of(GCPResource.class),
              KvCoder.of(SerializableCoder.of(StateSource.class),
                  SerializableCoder.of(GCPResourceState.class))));
  PCollectionView<Map<GCPResource, KV<StateSource, GCPResourceState>>> view =
      View.<GCPResource, KV<StateSource, GCPResourceState>>asMap().apply(collection);

  FilterOutMatchingState function = new FilterOutMatchingState(view);
  DoFnTester<KV<GCPResource, KV<StateSource, GCPResourceState>>,
      KV<GCPResource, Map<StateSource, GCPResourceState>>> tester = DoFnTester.of(function);
  tester.setSideInputInGlobalWindow(view, sideInputList);

  List<KV<GCPResource, Map<StateSource, GCPResourceState>>> results =
      tester.processBatch(mainInputList);
  assertEquals(0, results.size());
}
 
Example #14
Source File: FlinkBatchTransformTranslators.java    From flink-dataflow with Apache License 2.0
@Override
public void translateNode(Create.Values<OUT> transform, FlinkBatchTranslationContext context) {
	TypeInformation<OUT> typeInformation = context.getOutputTypeInfo();
	Iterable<OUT> elements = transform.getElements();

	// we need to serialize the elements to byte arrays, since they might contain
	// elements that are not serializable by Java serialization. We deserialize them
	// in the FlatMap function using the Coder.

	List<byte[]> serializedElements = Lists.newArrayList();
	Coder<OUT> coder = context.getOutput(transform).getCoder();
	for (OUT element: elements) {
		ByteArrayOutputStream bao = new ByteArrayOutputStream();
		try {
			coder.encode(element, bao, Coder.Context.OUTER);
			serializedElements.add(bao.toByteArray());
		} catch (IOException e) {
			throw new RuntimeException("Could not serialize Create elements using Coder", e);
		}
	}

	DataSet<Integer> initDataSet = context.getExecutionEnvironment().fromElements(1);
	FlinkCreateFunction<Integer, OUT> flatMapFunction = new FlinkCreateFunction<>(serializedElements, coder);
	FlatMapOperator<Integer, OUT> outputDataSet = new FlatMapOperator<>(initDataSet, typeInformation, flatMapFunction, transform.getName());

	context.setOutputDataSet(context.getOutput(transform), outputDataSet);
}
 
Example #15
Source File: MultiLinearGraph.java    From dockerflow with Apache License 2.0
/**
 * For simple linear graphs, it's not too hard to generate the Dataflow pipeline yourself. Here's
 * the equivalent Dataflow code for this simple example.
 */
public static void manualDataflow(String[] args) throws IOException {
  LOG.info("Parsing Dataflow options");
  DataflowPipelineOptions o = DataflowFactory.pipelineOptions(args);
  o.setAppName(MultiLinearGraph.class.getSimpleName());
  Pipeline p = Pipeline.create(o);

  p.apply(Create.of(ArgsTableBuilder.fromArgs(args).build()))
      .apply(DockerDo.of(taskOne()))
      .apply(DockerDo.of(taskTwo()));
  p.run();
}
 
Example #16
Source File: JoinKnownGoodAndLiveStatesTest.java    From policyscanner with Apache License 2.0
@Test
public void testFilterStateOneMismatch() {
  GCPProject project = getSampleProject("");
  GCPResourceState checkedPolicy = getSampleGCPResourcePolicy(project, 1);
  GCPResourceState livePolicy = getSampleGCPResourcePolicy(project, 2);
  List<KV<GCPResource, KV<StateSource, GCPResourceState>>> sideInputList =
      Arrays.asList(KV.of((GCPResource) project, KV.of(StateSource.DESIRED, checkedPolicy)));
  List<KV<GCPResource, KV<StateSource, GCPResourceState>>> mainInputList =
      Arrays.asList(KV.of((GCPResource) project, KV.of(StateSource.LIVE, livePolicy)));

  PCollection<KV<GCPResource, KV<StateSource, GCPResourceState>>> collection =
      pipeline.apply(Create.of(sideInputList)).setCoder(
          KvCoder.of(SerializableCoder.of(GCPResource.class),
              KvCoder.of(SerializableCoder.of(StateSource.class),
                  SerializableCoder.of(GCPResourceState.class))));
  PCollectionView<Map<GCPResource, KV<StateSource, GCPResourceState>>> view =
      View.<GCPResource, KV<StateSource, GCPResourceState>>asMap().apply(collection);

  JoinKnownGoodAndLiveStates function = new JoinKnownGoodAndLiveStates(view);
  DoFnTester<KV<GCPResource, KV<StateSource, GCPResourceState>>,
      KV<GCPResource, Map<StateSource, GCPResourceState>>> tester = DoFnTester.of(function);
  tester.setSideInputInGlobalWindow(view, sideInputList);

  Map<StateSource, GCPResourceState> outputMap = new HashMap<>(2);
  outputMap.put(StateSource.DESIRED, checkedPolicy);
  outputMap.put(StateSource.LIVE, livePolicy);
  List<KV<GCPResource, Map<StateSource, GCPResourceState>>> expectedOutput =
      Arrays.asList(KV.of((GCPResource) project, outputMap));
  List<KV<GCPResource, Map<StateSource, GCPResourceState>>> results =
      tester.processBatch(mainInputList);

  assertEquals(expectedOutput, results);
}
 
Example #17
Source File: JoinKnownGoodAndLiveStatesTest.java    From policyscanner with Apache License 2.0
@Test
public void testFilterStateNoMismatches() {
  GCPProject project = getSampleProject("");
  GCPResourceState checkedPolicy = getSampleGCPResourcePolicy(project, 1);
  GCPResourceState livePolicy = checkedPolicy;
  List<KV<GCPResource, KV<StateSource, GCPResourceState>>> sideInputList =
      Arrays.asList(KV.of((GCPResource) project, KV.of(StateSource.DESIRED, checkedPolicy)));
  List<KV<GCPResource, KV<StateSource, GCPResourceState>>> mainInputList =
      Arrays.asList(KV.of((GCPResource) project, KV.of(StateSource.LIVE, livePolicy)));

  PCollection<KV<GCPResource, KV<StateSource, GCPResourceState>>> collection =
      pipeline.apply(Create.of(sideInputList)).setCoder(
          KvCoder.of(SerializableCoder.of(GCPResource.class),
              KvCoder.of(SerializableCoder.of(StateSource.class),
                  SerializableCoder.of(GCPResourceState.class))));
  PCollectionView<Map<GCPResource, KV<StateSource, GCPResourceState>>> view =
      View.<GCPResource, KV<StateSource, GCPResourceState>>asMap().apply(collection);

  JoinKnownGoodAndLiveStates function = new JoinKnownGoodAndLiveStates(view);
  DoFnTester<KV<GCPResource, KV<StateSource, GCPResourceState>>,
      KV<GCPResource, Map<StateSource, GCPResourceState>>> tester = DoFnTester.of(function);
  tester.setSideInputInGlobalWindow(view, sideInputList);

  List<KV<GCPResource, Map<StateSource, GCPResourceState>>> results =
      tester.processBatch(mainInputList);
  assertEquals(1, results.size());
}
 
Example #18
Source File: JoinKnownGoodAndLiveStatesTest.java    From policyscanner with Apache License 2.0
@Test
public void testFilterStateNoMatchingResources() {
  GCPProject checkedProject = getSampleProject("_checked");
  GCPProject liveProject = getSampleProject("_live");
  GCPResourceState checkedPolicy = getSampleGCPResourcePolicy(checkedProject, 1);
  GCPResourceState livePolicy = getSampleGCPResourcePolicy(liveProject, 2);
  List<KV<GCPResource, KV<StateSource, GCPResourceState>>> sideInputList =
      Arrays.asList(KV.of((GCPResource) checkedProject, KV.of(StateSource.DESIRED, checkedPolicy)));
  List<KV<GCPResource, KV<StateSource, GCPResourceState>>> mainInputList =
      Arrays.asList(KV.of((GCPResource) liveProject, KV.of(StateSource.LIVE, livePolicy)));

  PCollection<KV<GCPResource, KV<StateSource, GCPResourceState>>> collection =
      pipeline.apply(Create.of(sideInputList)).setCoder(
          KvCoder.of(SerializableCoder.of(GCPResource.class),
              KvCoder.of(SerializableCoder.of(StateSource.class),
                  SerializableCoder.of(GCPResourceState.class))));
  PCollectionView<Map<GCPResource, KV<StateSource, GCPResourceState>>> view =
      View.<GCPResource, KV<StateSource, GCPResourceState>>asMap().apply(collection);

  JoinKnownGoodAndLiveStates function = new JoinKnownGoodAndLiveStates(view);
  DoFnTester<KV<GCPResource, KV<StateSource, GCPResourceState>>,
      KV<GCPResource, Map<StateSource, GCPResourceState>>> tester = DoFnTester.of(function);
  tester.setSideInputInGlobalWindow(view, sideInputList);

  List<KV<GCPResource, Map<StateSource, GCPResourceState>>> results =
      tester.processBatch(mainInputList);
  assertEquals(0, results.size());
}
 
Example #19
Source File: FilterOutMatchingStateTest.java    From policyscanner with Apache License 2.0
@Test
public void testFilterStateOneMismatch() {
  GCPProject project = getSampleProject("");
  GCPResourceState checkedPolicy = getSampleGCPResourcePolicy(project, 1);
  GCPResourceState livePolicy = getSampleGCPResourcePolicy(project, 2);
  List<KV<GCPResource, KV<StateSource, GCPResourceState>>> sideInputList =
      Arrays.asList(KV.of((GCPResource) project, KV.of(StateSource.DESIRED, checkedPolicy)));
  List<KV<GCPResource, KV<StateSource, GCPResourceState>>> mainInputList =
      Arrays.asList(KV.of((GCPResource) project, KV.of(StateSource.LIVE, livePolicy)));

  PCollection<KV<GCPResource, KV<StateSource, GCPResourceState>>> collection =
      pipeline.apply(Create.of(sideInputList)).setCoder(
          KvCoder.of(SerializableCoder.of(GCPResource.class),
              KvCoder.of(SerializableCoder.of(StateSource.class),
                  SerializableCoder.of(GCPResourceState.class))));
  PCollectionView<Map<GCPResource, KV<StateSource, GCPResourceState>>> view =
      View.<GCPResource, KV<StateSource, GCPResourceState>>asMap().apply(collection);

  FilterOutMatchingState function = new FilterOutMatchingState(view);
  DoFnTester<KV<GCPResource, KV<StateSource, GCPResourceState>>,
      KV<GCPResource, Map<StateSource, GCPResourceState>>> tester = DoFnTester.of(function);
  tester.setSideInputInGlobalWindow(view, sideInputList);

  Map<StateSource, GCPResourceState> outputMap = new HashMap<>(2);
  outputMap.put(StateSource.DESIRED, checkedPolicy);
  outputMap.put(StateSource.LIVE, livePolicy);
  List<KV<GCPResource, Map<StateSource, GCPResourceState>>> expectedOutput =
      Arrays.asList(KV.of((GCPResource) project, outputMap));
  List<KV<GCPResource, Map<StateSource, GCPResourceState>>> results =
      tester.processBatch(mainInputList);

  assertEquals(expectedOutput, results);
}
 
Example #20
Source File: FilterOutMatchingStateTest.java    From policyscanner with Apache License 2.0
@Test
public void testFilterStateNoMismatches() {
  GCPProject project = getSampleProject("");
  GCPResourceState checkedPolicy = getSampleGCPResourcePolicy(project, 1);
  GCPResourceState livePolicy = checkedPolicy;
  List<KV<GCPResource, KV<StateSource, GCPResourceState>>> sideInputList =
      Arrays.asList(KV.of((GCPResource) project, KV.of(StateSource.DESIRED, checkedPolicy)));
  List<KV<GCPResource, KV<StateSource, GCPResourceState>>> mainInputList =
      Arrays.asList(KV.of((GCPResource) project, KV.of(StateSource.LIVE, livePolicy)));

  PCollection<KV<GCPResource, KV<StateSource, GCPResourceState>>> collection =
      pipeline.apply(Create.of(sideInputList)).setCoder(
          KvCoder.of(SerializableCoder.of(GCPResource.class),
              KvCoder.of(SerializableCoder.of(StateSource.class),
                  SerializableCoder.of(GCPResourceState.class))));
  PCollectionView<Map<GCPResource, KV<StateSource, GCPResourceState>>> view =
      View.<GCPResource, KV<StateSource, GCPResourceState>>asMap().apply(collection);

  FilterOutMatchingState function = new FilterOutMatchingState(view);
  DoFnTester<KV<GCPResource, KV<StateSource, GCPResourceState>>,
      KV<GCPResource, Map<StateSource, GCPResourceState>>> tester = DoFnTester.of(function);
  tester.setSideInputInGlobalWindow(view, sideInputList);

  List<KV<GCPResource, Map<StateSource, GCPResourceState>>> results =
      tester.processBatch(mainInputList);
  assertEquals(0, results.size());
}
 
Example #21
Source File: DataflowFactory.java    From dockerflow with Apache License 2.0
/**
 * Dynamically construct a Dataflow pipeline from the workflow definition. The root PCollection
 * has one element, the root task's name.
 *
 * @param workflow the workflow definition
 * @param workflowArgs per-instance workflow args, keyed by workflow name
 * @param o the Dataflow pipeline options
 * @return the constructed pipeline
 * @throws IOException
 */
public static Pipeline dataflow(
    Workflow workflow, Map<String, WorkflowArgs> workflowArgs, DataflowPipelineOptions o)
    throws IOException {

  assert (workflow != null);
  assert (o != null);
  assert (workflow.getDefn() != null);

  // Set defaults
  if (o.getAppName() == null) {
    o.setAppName(workflow.getDefn().getName());
  }
  if (o.getProject() == null && workflow.getArgs() != null) {
    o.setProject(workflow.getArgs().getProjectId());
  }
  if (o.getMaxNumWorkers() == 0) {
    o.setMaxNumWorkers(1);
  }
  if (o.getWorkerMachineType() == null) {
    o.setWorkerMachineType(DEFAULT_MACHINE_TYPE);
  }

  LOG.info("Initializing dataflow pipeline");
  Pipeline p = Pipeline.create(o);

  LOG.info("Creating input collection of workflow args");
  if (workflowArgs == null) {
    workflowArgs = new HashMap<String, WorkflowArgs>();
  }
  if (workflowArgs.isEmpty()) {
    LOG.info("No workflow args were provided. Using default values.");
    workflowArgs.put(workflow.getDefn().getName(), new WorkflowArgs());
  } else if (workflow.getArgs() != null) {
    LOG.info("Merging default workflow args with instance-specific args");

    for (String key : workflowArgs.keySet()) {
      WorkflowArgs instanceArgs = workflowArgs.get(key);
      instanceArgs.mergeDefaultArgs(workflow.getArgs());
      LOG.debug("Merged args: " + StringUtils.toJson(instanceArgs));
    }
  }

  LOG.info("Creating dataflow pipeline for workflow " + workflow.getDefn().getName());
  PCollection<KV<String, WorkflowArgs>> input = p.apply(Create.of(workflowArgs));
  input = dataflow(Workflow.Steps.graph(workflow), input);

  if (workflowArgs.values().iterator().next().getDeleteFiles()) {
    LOG.info("Intermediate files will be deleted");
    input =
        input.apply(
            ParDo.named("DeleteIntermediateFiles").of(new DeleteIntermediateFiles(workflow)));
  }

  return p;
}
 
Example #22
Source File: FlattenizeITCase.java    From flink-dataflow with Apache License 2.0
@Override
protected void testProgram() throws Exception {
	Pipeline p = FlinkTestPipeline.createForBatch();

	PCollection<String> p1 = p.apply(Create.of(words));
	PCollection<String> p2 = p.apply(Create.of(words2));

	PCollectionList<String> list = PCollectionList.of(p1).and(p2);

	list.apply(Flatten.<String>pCollections()).apply(TextIO.Write.to(resultPath));

	PCollection<String> p3 = p.apply(Create.of(words3));

	PCollectionList<String> list2 = list.and(p3);

	list2.apply(Flatten.<String>pCollections()).apply(TextIO.Write.to(resultPath2));

	p.run();
}
 
Example #23
Source File: GatkPairedSingleSampleAlt.java    From dockerflow with Apache License 2.0
/**
 * Only this one method is different from GatkPairedSingleSample.java.
 */
@Override
public Pipeline createDataflow(
    Map<String, WorkflowArgs> argsTable,
    DataflowPipelineOptions pipelineOptions,
    String[] args) throws IOException {

  DataflowPipelineOptions o = DataflowFactory.pipelineOptions(args);
  o.setAppName(GatkPairedSingleSampleAlt.class.getSimpleName());
  Pipeline p = Pipeline.create(o);

  // Merge sample-specific args with default workflow args
  for (String key : argsTable.keySet()) {
    WorkflowArgs instanceArgs = argsTable.get(key);
    instanceArgs.mergeDefaultArgs(workflowArgs);
  }

  // Declarations
  PCollection<KV<String, WorkflowArgs>> mainBranch, branchOne, branchTwo;
  PCollectionList<KV<String, WorkflowArgs>> mergeList;

  // Construct the workflow graph
  mainBranch  = p.apply(Create.of(argsTable));
  branchOne = mainBranch.apply(DockerDo.of(CreateSequenceGroupingTSV));
  branchTwo = mainBranch.apply(DockerDo.of(BwaVersion))
      .apply(DockerDo.of(SamToFastqAndBwaMem))
      .apply(DockerDo.of(MergeBamAlignment))
      .apply(DockerDo.of(SortAndFixReadGroupBam))
      .apply(DockerDo.of(MarkDuplicates))
      .apply(DockerDo.of(SortAndFixSampleBam));
  mergeList = PCollectionList.of(branchOne).and(branchTwo);
  mainBranch = mergeList.apply(new MergeBranches())
      .apply(DockerDo.of(BaseRecalibrator))
      .apply(DockerDo.of(ApplyBQSR))
      .apply(DockerDo.of(GatherBqsrReports))
      .apply(DockerDo.of(ApplyBQSRToUnmappedReads))
      .apply(DockerDo.of(GatherBamFiles));
  branchOne = mainBranch.apply(DockerDo.of(ConvertToCram));
  branchTwo = mainBranch.apply(DockerDo.of(HaplotypeCaller))
      .apply(DockerDo.of(GatherVCFs));
  mergeList = PCollectionList.of(branchOne).and(branchTwo);
  mainBranch = mergeList.apply(new MergeBranches());

  return p;
}
 
Example #24
Source File: ParDoMultiOutputITCase.java    From flink-dataflow with Apache License 2.0
@Override
protected void testProgram() throws Exception {
	Pipeline p = FlinkTestPipeline.createForBatch();

	PCollection<String> words = p.apply(Create.of("Hello", "Whatupmyman", "hey", "SPECIALthere", "MAAA", "MAAFOOO"));

	// Select words whose length is below a cut off,
	// plus the lengths of words that are above the cut off.
	// Also select words starting with "MARKER".
	final int wordLengthCutOff = 3;
	// Create tags to use for the main and side outputs.
	final TupleTag<String> wordsBelowCutOffTag = new TupleTag<String>(){};
	final TupleTag<Integer> wordLengthsAboveCutOffTag = new TupleTag<Integer>(){};
	final TupleTag<String> markedWordsTag = new TupleTag<String>(){};

	PCollectionTuple results =
			words.apply(ParDo
					.withOutputTags(wordsBelowCutOffTag, TupleTagList.of(wordLengthsAboveCutOffTag)
							.and(markedWordsTag))
					.of(new DoFn<String, String>() {
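						// This tag is declared inside the DoFn and never registered via
						// withOutputTags, so output sent to it is not available in the
						// returned PCollectionTuple.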
						final TupleTag<String> specialWordsTag = new TupleTag<String>() {
						};

						public void processElement(ProcessContext c) {
							String word = c.element();
							if (word.length() <= wordLengthCutOff) {
								c.output(word);
							} else {
								c.sideOutput(wordLengthsAboveCutOffTag, word.length());
							}
							if (word.startsWith("MAA")) {
								c.sideOutput(markedWordsTag, word);
							}

							if (word.startsWith("SPECIAL")) {
								c.sideOutput(specialWordsTag, word);
							}
						}
					}));

	// Extract the PCollection results, by tag.
	PCollection<String> wordsBelowCutOff = results.get(wordsBelowCutOffTag);
	PCollection<Integer> wordLengthsAboveCutOff = results.get
			(wordLengthsAboveCutOffTag);
	PCollection<String> markedWords = results.get(markedWordsTag);

	markedWords.apply(TextIO.Write.to(resultPath));

	p.run();
}
 
Example #25
Source File: JoinExamplesITCase.java    From flink-dataflow with Apache License 2.0
@Override
protected void testProgram() throws Exception {

	Pipeline p = FlinkTestPipeline.createForBatch();

	PCollection<TableRow> input1 = p.apply(Create.of(EVENT_ARRAY));
	PCollection<TableRow> input2 = p.apply(Create.of(CC_ARRAY));

	PCollection<String> output = JoinExamples.joinEvents(input1, input2);

	output.apply(TextIO.Write.to(resultPath));

	p.run();
}