com.google.cloud.dataflow.sdk.transforms.Create Java Examples

Source File: LoadBooksTest.java From cloud-bigtable-examples with Apache License 2.0

6 votes

/**
 * Checks that {@code LoadBooks.applyPipelineToParseBooks} turns the contents of
 * {@code testFile} into the expected (n-gram, count) pairs, in any order.
 */
@Test
public void parseBooks_returnsNgramsCounts() {
  // Arrange
  Pipeline p = TestPipeline.create();
  PCollection<String> input = p.apply(Create.of(testFile));

  // Act
  PCollection<KV<String, Integer>> output = LoadBooks.applyPipelineToParseBooks(input);

  // Assert
  DataflowAssert.that(output)
      .containsInAnyOrder(
          KV.of("despatch when art", 10),
          KV.of("despatch when came", 10),
          KV.of("despatch when published", 12),
          KV.of("despatch where was", 10),
          KV.of("despatch which made", 45),
          // There are two entries for "despatch which addressed".
          // Each entry has a different part of speech for "addressed".
          KV.of("despatch which addressed", 12 + 46),
          KV.of("despatch which admitted", 13),
          KV.of("despatch which allow", 14),
          KV.of("despatch which announced", 50),
          KV.of("despatch which answer", 32));

  // DataflowAssert only adds verification steps to the pipeline; they are evaluated
  // when the pipeline executes, so the pipeline has to be run for the test to check anything.
  p.run();
}

Source File: RemoveDuplicatesEmptyITCase.java From flink-dataflow with Apache License 2.0

6 votes

/**
 * Applies {@code RemoveDuplicates} to a collection created from an empty list of strings
 * and writes the outcome as text to {@code resultPath}.
 */
@Override
protected void testProgram() throws Exception {
	Pipeline pipeline = FlinkTestPipeline.createForBatch();

	// The source list is deliberately empty; the String coder is set explicitly on the result.
	List<String> noElements = Collections.emptyList();
	PCollection<String> emptyInput = pipeline.apply(Create.of(noElements));
	emptyInput.setCoder(StringUtf8Coder.of());

	emptyInput
			.apply(RemoveDuplicates.<String>create())
			.apply(TextIO.Write.to(resultPath));

	pipeline.run();
}

Source File: WordCountJoin2ITCase.java From flink-dataflow with Apache License 2.0

6 votes

/**
 * Counts words in two inputs, co-groups the two count collections by key and writes the
 * formatted result as text to {@code resultPath}.
 */
@Override
protected void testProgram() throws Exception {
	Pipeline pipeline = FlinkTestPipeline.createForBatch();

	// Per-element counts for each of the two inputs, built independently.
	PCollection<KV<String,Long>> countsFromFirst = pipeline
			.apply(Create.of(WORDS_1))
			.apply(ParDo.of(new ExtractWordsFn()))
			.apply(Count.<String>perElement());

	PCollection<KV<String,Long>> countsFromSecond = pipeline
			.apply(Create.of(WORDS_2))
			.apply(ParDo.of(new ExtractWordsFn()))
			.apply(Count.<String>perElement());

	// Attach each count collection to its tag, then co-group both by key.
	KeyedPCollectionTuple<String> taggedCounts = KeyedPCollectionTuple
			.of(tag1, countsFromFirst)
			.and(tag2, countsFromSecond);
	PCollection<KV<String, CoGbkResult>> joinedCounts =
			taggedCounts.apply(CoGroupByKey.<String>create());

	// Format the joined rows and write them out.
	joinedCounts
			.apply(ParDo.of(new FormatCountsFn()))
			.apply(TextIO.Write.named("test").to(resultPath));

	pipeline.run();
}

Source File: TfIdfITCase.java From flink-dataflow with Apache License 2.0

6 votes

/**
 * Runs {@code TfIdf.ComputeTfIdf} over three small in-memory documents and writes the
 * distinct keys of its output as text to {@code resultPath}.
 */
@Override
protected void testProgram() throws Exception {
	Pipeline p = FlinkTestPipeline.createForBatch();

	// URI values are encoded through a string-delegating coder.
	p.getCoderRegistry().registerCoder(URI.class, StringDelegateCoder.of(URI.class));

	// Three (document URI, document text) pairs used as the input.
	PCollection<KV<URI, String>> documents = p.apply(Create.of(
			KV.of(new URI("x"), "a b c d"),
			KV.of(new URI("y"), "a b c"),
			KV.of(new URI("z"), "a m n")));

	PCollection<KV<String, KV<URI, Double>>> tfIdfByWord =
			documents.apply(new TfIdf.ComputeTfIdf());

	// Keep only the keys, drop duplicates, and write them out.
	tfIdfByWord
			.apply(Keys.<String>create())
			.apply(RemoveDuplicates.<String>create())
			.apply(TextIO.Write.to(resultPath));

	p.run();
}

Source File: FXTimeSeriesPipelineSRGTests.java From data-timeseries-java with Apache License 2.0

6 votes

/**
 * Creates a PCollection from the given in-memory data and re-emits every element with a
 * timestamp taken from the element's own {@code TSProto} time.
 *
 * @param pipeline the pipeline the input is attached to
 * @param data the key/value pairs to load into the pipeline
 * @return the timestamped collection, named "Assign TimeStamps"
 */
public PCollection<KV<String, TSProto>> setupDataInput(Pipeline pipeline,
    List<KV<String, TSProto>> data) {

  // Emits each element unchanged, stamped with the instant derived from its value's getTime().
  DoFn<KV<String, TSProto>, KV<String, TSProto>> stampWithOwnTime =
      new DoFn<KV<String, TSProto>, KV<String, TSProto>>() {

        @Override
        public void processElement(ProcessContext c) throws Exception {
          KV<String, TSProto> element = c.element();
          c.outputWithTimestamp(element,
              new DateTime(element.getValue().getTime()).toInstant());
        }

      };

  return pipeline.apply("ReadData", Create.of(data))
      .apply(ParDo.of(stampWithOwnTime))
      .setName("Assign TimeStamps");
}

Source File: SideInputITCase.java From flink-dataflow with Apache License 2.0

6 votes

/**
 * Exposes {@code expected} as a singleton side input and writes that side input value,
 * once per main-input element, as text to {@code resultPath}.
 */
@Override
protected void testProgram() throws Exception {
	Pipeline pipeline = FlinkTestPipeline.createForBatch();

	// Singleton view over the collection created from "expected".
	final PCollectionView<String> sideView = pipeline
			.apply(Create.of(expected))
			.apply(View.<String>asSingleton());

	// Ignores the main-input element and outputs the side input value instead.
	DoFn<String, String> emitSideInput = new DoFn<String, String>() {
		@Override
		public void processElement(ProcessContext c) throws Exception {
			c.output(c.sideInput(sideView));
		}
	};

	pipeline
			.apply(Create.of("bli"))
			.apply(ParDo.of(emitSideInput).withSideInputs(sideView))
			.apply(TextIO.Write.to(resultPath));

	pipeline.run();
}

Source File: RemoveDuplicatesITCase.java From flink-dataflow with Apache License 2.0

6 votes

/**
 * Applies {@code RemoveDuplicates} to a list of keys that contains repeats and writes the
 * outcome as text to {@code resultPath}.
 */
@Override
protected void testProgram() throws Exception {
	Pipeline pipeline = FlinkTestPipeline.createForBatch();

	// k1, k2 and k5 each appear twice; k3 appears once.
	List<String> keysWithRepeats = Arrays.asList("k1", "k5", "k5", "k2", "k1", "k2", "k3");
	PCollection<String> keys = pipeline.apply(Create.of(keysWithRepeats));
	keys.setCoder(StringUtf8Coder.of());

	keys
			.apply(RemoveDuplicates.<String>create())
			.apply(TextIO.Write.to(resultPath));

	pipeline.run();
}

Source File: WriteSinkITCase.java From flink-dataflow with Apache License 2.0

5 votes

/**
 * Writes {@code EXPECTED_RESULT} through a {@code MyCustomSink} that targets the given path.
 *
 * @param resultPath location handed to the custom sink
 */
private static void runProgram(String resultPath) {
	Pipeline pipeline = FlinkTestPipeline.createForBatch();

	pipeline
		.apply(Create.of(EXPECTED_RESULT))
		.setCoder(StringUtf8Coder.of())
		// The write step is registered under the name "CustomSink".
		.apply("CustomSink", Write.to(new MyCustomSink(resultPath)));

	pipeline.run();
}

Source File: WordCountJoin3ITCase.java From flink-dataflow with Apache License 2.0

5 votes

/**
 * Counts words in three inputs, co-groups the three count collections by key and writes
 * the formatted result as text to {@code resultPath}.
 */
@Override
protected void testProgram() throws Exception {
	Pipeline pipeline = FlinkTestPipeline.createForBatch();

	// Per-element counts for each of the three inputs, built independently.
	PCollection<KV<String,Long>> countsFromFirst = pipeline
			.apply(Create.of(WORDS_1))
			.apply(ParDo.of(new ExtractWordsFn()))
			.apply(Count.<String>perElement());

	PCollection<KV<String,Long>> countsFromSecond = pipeline
			.apply(Create.of(WORDS_2))
			.apply(ParDo.of(new ExtractWordsFn()))
			.apply(Count.<String>perElement());

	PCollection<KV<String,Long>> countsFromThird = pipeline
			.apply(Create.of(WORDS_3))
			.apply(ParDo.of(new ExtractWordsFn()))
			.apply(Count.<String>perElement());

	// Attach each count collection to its tag, then co-group all three by key.
	KeyedPCollectionTuple<String> taggedCounts = KeyedPCollectionTuple
			.of(tag1, countsFromFirst)
			.and(tag2, countsFromSecond)
			.and(tag3, countsFromThird);
	PCollection<KV<String, CoGbkResult>> joinedCounts =
			taggedCounts.apply(CoGroupByKey.<String>create());

	// Format the joined rows and write them out.
	joinedCounts
			.apply(ParDo.of(new FormatCountsFn()))
			.apply(TextIO.Write.named("test").to(resultPath));

	pipeline.run();
}

Source File: WordCountITCase.java From flink-dataflow with Apache License 2.0

5 votes

/**
 * Runs the {@code WordCount} counting and formatting steps over {@code WORDS} and writes
 * the formatted lines as text to {@code resultPath}.
 */
@Override
protected void testProgram() throws Exception {
	Pipeline pipeline = FlinkTestPipeline.createForBatch();

	// In-memory input with an explicitly assigned String coder.
	PCollection<String> lines = pipeline.apply(Create.of(WORDS));
	lines.setCoder(StringUtf8Coder.of());

	lines
			.apply(new WordCount.CountWords())
			.apply(MapElements.via(new WordCount.FormatAsTextFn()))
			.apply(TextIO.Write.to(resultPath));

	pipeline.run();
}

Source File: MaybeEmptyTestITCase.java From flink-dataflow with Apache License 2.0

5 votes

/**
 * Feeds a single {@code null} Void element through a DoFn that outputs {@code expected}
 * and writes the output as text to {@code resultPath}.
 */
@Override
protected void testProgram() throws Exception {
	Pipeline pipeline = FlinkTestPipeline.createForBatch();

	// Outputs "expected" for every incoming element, regardless of the element itself.
	DoFn<Void, String> emitExpected = new DoFn<Void, String>() {
		@Override
		public void processElement(DoFn<Void, String>.ProcessContext c) {
			c.output(expected);
		}
	};

	pipeline
			.apply(Create.of((Void) null)).setCoder(VoidCoder.of())
			.apply(ParDo.of(emitExpected))
			.apply(TextIO.Write.to(resultPath));

	pipeline.run();
}

Source File: AvroITCase.java From flink-dataflow with Apache License 2.0

5 votes

/**
 * Runs two pipelines back to back: the first writes four {@code User} records as Avro to
 * {@code tmpPath}; the second reads them back and writes one text line per user to
 * {@code resultPath}.
 *
 * @param tmpPath location of the intermediate Avro output
 * @param resultPath location of the final text output
 */
private static void runProgram(String tmpPath, String resultPath) {
	// First pipeline: in-memory users -> Avro files.
	Pipeline writePipeline = FlinkTestPipeline.createForBatch();

	writePipeline
		.apply(Create.of(
				new User("Joe", 3, "red"),
				new User("Mary", 4, "blue"),
				new User("Mark", 1, "green"),
				new User("Julia", 5, "purple"))
			.withCoder(AvroCoder.of(User.class)))
		.apply(AvroIO.Write.to(tmpPath).withSchema(User.class));

	writePipeline.run();

	// Second pipeline: Avro files -> "<name> <favorite color> <favorite number>" lines.
	Pipeline readPipeline = FlinkTestPipeline.createForBatch();

	DoFn<User, String> formatUser = new DoFn<User, String>() {
		@Override
		public void processElement(ProcessContext c) throws Exception {
			User user = c.element();
			c.output(user.getName() + " " + user.getFavoriteColor() + " " + user.getFavoriteNumber());
		}
	};

	readPipeline
		.apply(AvroIO.Read.from(tmpPath).withSchema(User.class).withoutValidation())
		.apply(ParDo.of(formatUser))
		.apply(TextIO.Write.to(resultPath));

	readPipeline.run();
}

Source File: FilterOutMatchingStateTest.java From policyscanner with Apache License 2.0

5 votes

/**
 * Runs {@code FilterOutMatchingState} with a side input built from one sample project and
 * a main input built from a different sample project, and expects no output.
 */
@Test
public void testFilterStateNoMatchingResources() {
  // Sample projects built from different suffixes, each with its own sample policy.
  GCPProject desiredProject = getSampleProject("_checked");
  GCPProject observedProject = getSampleProject("_live");
  GCPResourceState desiredState = getSampleGCPResourcePolicy(desiredProject, 1);
  GCPResourceState observedState = getSampleGCPResourcePolicy(observedProject, 2);

  List<KV<GCPResource, KV<StateSource, GCPResourceState>>> desiredEntries =
      Arrays.asList(
          KV.of((GCPResource) desiredProject, KV.of(StateSource.DESIRED, desiredState)));
  List<KV<GCPResource, KV<StateSource, GCPResourceState>>> liveEntries =
      Arrays.asList(
          KV.of((GCPResource) observedProject, KV.of(StateSource.LIVE, observedState)));

  // Materialize the desired entries as a map-shaped side input view.
  KvCoder<GCPResource, KV<StateSource, GCPResourceState>> entryCoder =
      KvCoder.of(
          SerializableCoder.of(GCPResource.class),
          KvCoder.of(
              SerializableCoder.of(StateSource.class),
              SerializableCoder.of(GCPResourceState.class)));
  PCollection<KV<GCPResource, KV<StateSource, GCPResourceState>>> desiredCollection =
      pipeline.apply(Create.of(desiredEntries)).setCoder(entryCoder);
  PCollectionView<Map<GCPResource, KV<StateSource, GCPResourceState>>> desiredView =
      View.<GCPResource, KV<StateSource, GCPResourceState>>asMap().apply(desiredCollection);

  DoFnTester<KV<GCPResource, KV<StateSource, GCPResourceState>>,
      KV<GCPResource, Map<StateSource, GCPResourceState>>> tester =
          DoFnTester.of(new FilterOutMatchingState(desiredView));
  tester.setSideInputInGlobalWindow(desiredView, desiredEntries);

  assertEquals(0, tester.processBatch(liveEntries).size());
}

Source File: FlinkBatchTransformTranslators.java From flink-dataflow with Apache License 2.0

5 votes

/**
 * Translates a {@code Create.Values} transform into a Flink data set.
 *
 * <p>Every element is encoded to a byte array with the output collection's coder. A
 * one-element data set is then flat-mapped through a {@code FlinkCreateFunction} that is
 * given the encoded elements and the coder, and the resulting operator is registered as
 * the output data set of the transform.
 *
 * @param transform the Create transform whose elements are to be emitted
 * @param context the batch translation context providing output type info, coder and
 *     execution environment
 * @throws RuntimeException if the coder fails to encode an element; the underlying
 *     {@code IOException} is attached as the cause
 */
@Override
public void translateNode(Create.Values<OUT> transform, FlinkBatchTranslationContext context) {
	TypeInformation<OUT> typeInformation = context.getOutputTypeInfo();
	Iterable<OUT> elements = transform.getElements();

	// we need to serialize the elements to byte arrays, since they might contain
	// elements that are not serializable by Java serialization. We deserialize them
	// in the FlatMap function using the Coder.

	List<byte[]> serializedElements = Lists.newArrayList();
	Coder<OUT> coder = context.getOutput(transform).getCoder();
	for (OUT element: elements) {
		ByteArrayOutputStream bao = new ByteArrayOutputStream();
		try {
			coder.encode(element, bao, Coder.Context.OUTER);
			serializedElements.add(bao.toByteArray());
		} catch (IOException e) {
			// Pass the IOException along as the cause so its stack trace is not lost;
			// the message text itself is unchanged.
			throw new RuntimeException("Could not serialize Create elements using Coder: " + e, e);
		}
	}

	// Single dummy element that triggers the flat map exactly once.
	DataSet<Integer> initDataSet = context.getExecutionEnvironment().fromElements(1);
	FlinkCreateFunction<Integer, OUT> flatMapFunction = new FlinkCreateFunction<>(serializedElements, coder);
	FlatMapOperator<Integer, OUT> outputDataSet = new FlatMapOperator<>(initDataSet, typeInformation, flatMapFunction, transform.getName());

	context.setOutputDataSet(context.getOutput(transform), outputDataSet);
}

Source File: MultiLinearGraph.java From dockerflow with Apache License 2.0

5 votes

/**
 * Hand-written Dataflow equivalent of this example: for a simple linear graph the pipeline
 * can be assembled directly, here by chaining the two tasks one after the other.
 *
 * @param args command-line arguments, used both for the pipeline options and for the
 *     args table that seeds the pipeline
 * @throws IOException declared by this method; the visible code does not show which call
 *     raises it
 */
public static void manualDataflow(String[] args) throws IOException {
  LOG.info("Parsing Dataflow options");
  DataflowPipelineOptions options = DataflowFactory.pipelineOptions(args);
  options.setAppName(MultiLinearGraph.class.getSimpleName());

  Pipeline pipeline = Pipeline.create(options);
  pipeline
      .apply(Create.of(ArgsTableBuilder.fromArgs(args).build()))
      .apply(DockerDo.of(taskOne()))
      .apply(DockerDo.of(taskTwo()));

  pipeline.run();
}

Source File: JoinKnownGoodAndLiveStatesTest.java From policyscanner with Apache License 2.0

5 votes

/**
 * Runs {@code JoinKnownGoodAndLiveStates} with a desired and a live policy for the same
 * project that were built from different sample arguments, and expects a single output
 * carrying both states keyed by their source.
 */
@Test
public void testFilterStateOneMismatch() {
  GCPProject sampleProject = getSampleProject("");
  GCPResourceState desiredState = getSampleGCPResourcePolicy(sampleProject, 1);
  GCPResourceState observedState = getSampleGCPResourcePolicy(sampleProject, 2);

  List<KV<GCPResource, KV<StateSource, GCPResourceState>>> desiredEntries =
      Arrays.asList(
          KV.of((GCPResource) sampleProject, KV.of(StateSource.DESIRED, desiredState)));
  List<KV<GCPResource, KV<StateSource, GCPResourceState>>> liveEntries =
      Arrays.asList(
          KV.of((GCPResource) sampleProject, KV.of(StateSource.LIVE, observedState)));

  // Materialize the desired entries as a map-shaped side input view.
  KvCoder<GCPResource, KV<StateSource, GCPResourceState>> entryCoder =
      KvCoder.of(
          SerializableCoder.of(GCPResource.class),
          KvCoder.of(
              SerializableCoder.of(StateSource.class),
              SerializableCoder.of(GCPResourceState.class)));
  PCollection<KV<GCPResource, KV<StateSource, GCPResourceState>>> desiredCollection =
      pipeline.apply(Create.of(desiredEntries)).setCoder(entryCoder);
  PCollectionView<Map<GCPResource, KV<StateSource, GCPResourceState>>> desiredView =
      View.<GCPResource, KV<StateSource, GCPResourceState>>asMap().apply(desiredCollection);

  DoFnTester<KV<GCPResource, KV<StateSource, GCPResourceState>>,
      KV<GCPResource, Map<StateSource, GCPResourceState>>> tester =
          DoFnTester.of(new JoinKnownGoodAndLiveStates(desiredView));
  tester.setSideInputInGlobalWindow(desiredView, desiredEntries);

  // Expected: one record for the project holding both the desired and the live state.
  Map<StateSource, GCPResourceState> statesBySource = new HashMap<>(2);
  statesBySource.put(StateSource.DESIRED, desiredState);
  statesBySource.put(StateSource.LIVE, observedState);
  List<KV<GCPResource, Map<StateSource, GCPResourceState>>> expectedOutput =
      Arrays.asList(KV.of((GCPResource) sampleProject, statesBySource));

  List<KV<GCPResource, Map<StateSource, GCPResourceState>>> actualOutput =
      tester.processBatch(liveEntries);

  assertEquals(expectedOutput, actualOutput);
}

Source File: JoinKnownGoodAndLiveStatesTest.java From policyscanner with Apache License 2.0

5 votes

/**
 * Runs {@code JoinKnownGoodAndLiveStates} with the very same policy object used as both
 * the desired and the live state of one project, and expects exactly one output element.
 */
@Test
public void testFilterStateNoMismatches() {
  GCPProject sampleProject = getSampleProject("");
  // A single policy instance serves as both the desired and the live state.
  GCPResourceState sharedState = getSampleGCPResourcePolicy(sampleProject, 1);

  List<KV<GCPResource, KV<StateSource, GCPResourceState>>> desiredEntries =
      Arrays.asList(
          KV.of((GCPResource) sampleProject, KV.of(StateSource.DESIRED, sharedState)));
  List<KV<GCPResource, KV<StateSource, GCPResourceState>>> liveEntries =
      Arrays.asList(
          KV.of((GCPResource) sampleProject, KV.of(StateSource.LIVE, sharedState)));

  // Materialize the desired entries as a map-shaped side input view.
  KvCoder<GCPResource, KV<StateSource, GCPResourceState>> entryCoder =
      KvCoder.of(
          SerializableCoder.of(GCPResource.class),
          KvCoder.of(
              SerializableCoder.of(StateSource.class),
              SerializableCoder.of(GCPResourceState.class)));
  PCollection<KV<GCPResource, KV<StateSource, GCPResourceState>>> desiredCollection =
      pipeline.apply(Create.of(desiredEntries)).setCoder(entryCoder);
  PCollectionView<Map<GCPResource, KV<StateSource, GCPResourceState>>> desiredView =
      View.<GCPResource, KV<StateSource, GCPResourceState>>asMap().apply(desiredCollection);

  DoFnTester<KV<GCPResource, KV<StateSource, GCPResourceState>>,
      KV<GCPResource, Map<StateSource, GCPResourceState>>> tester =
          DoFnTester.of(new JoinKnownGoodAndLiveStates(desiredView));
  tester.setSideInputInGlobalWindow(desiredView, desiredEntries);

  assertEquals(1, tester.processBatch(liveEntries).size());
}

Source File: JoinKnownGoodAndLiveStatesTest.java From policyscanner with Apache License 2.0

5 votes

/**
 * Runs {@code JoinKnownGoodAndLiveStates} with a side input built from one sample project
 * and a main input built from a different sample project, and expects no output.
 */
@Test
public void testFilterStateNoMatchingResources() {
  // Sample projects built from different suffixes, each with its own sample policy.
  GCPProject desiredProject = getSampleProject("_checked");
  GCPProject observedProject = getSampleProject("_live");
  GCPResourceState desiredState = getSampleGCPResourcePolicy(desiredProject, 1);
  GCPResourceState observedState = getSampleGCPResourcePolicy(observedProject, 2);

  List<KV<GCPResource, KV<StateSource, GCPResourceState>>> desiredEntries =
      Arrays.asList(
          KV.of((GCPResource) desiredProject, KV.of(StateSource.DESIRED, desiredState)));
  List<KV<GCPResource, KV<StateSource, GCPResourceState>>> liveEntries =
      Arrays.asList(
          KV.of((GCPResource) observedProject, KV.of(StateSource.LIVE, observedState)));

  // Materialize the desired entries as a map-shaped side input view.
  KvCoder<GCPResource, KV<StateSource, GCPResourceState>> entryCoder =
      KvCoder.of(
          SerializableCoder.of(GCPResource.class),
          KvCoder.of(
              SerializableCoder.of(StateSource.class),
              SerializableCoder.of(GCPResourceState.class)));
  PCollection<KV<GCPResource, KV<StateSource, GCPResourceState>>> desiredCollection =
      pipeline.apply(Create.of(desiredEntries)).setCoder(entryCoder);
  PCollectionView<Map<GCPResource, KV<StateSource, GCPResourceState>>> desiredView =
      View.<GCPResource, KV<StateSource, GCPResourceState>>asMap().apply(desiredCollection);

  DoFnTester<KV<GCPResource, KV<StateSource, GCPResourceState>>,
      KV<GCPResource, Map<StateSource, GCPResourceState>>> tester =
          DoFnTester.of(new JoinKnownGoodAndLiveStates(desiredView));
  tester.setSideInputInGlobalWindow(desiredView, desiredEntries);

  assertEquals(0, tester.processBatch(liveEntries).size());
}

Source File: FilterOutMatchingStateTest.java From policyscanner with Apache License 2.0

5 votes

/**
 * Runs {@code FilterOutMatchingState} with a desired and a live policy for the same
 * project that were built from different sample arguments, and expects a single output
 * carrying both states keyed by their source.
 */
@Test
public void testFilterStateOneMismatch() {
  GCPProject sampleProject = getSampleProject("");
  GCPResourceState desiredState = getSampleGCPResourcePolicy(sampleProject, 1);
  GCPResourceState observedState = getSampleGCPResourcePolicy(sampleProject, 2);

  List<KV<GCPResource, KV<StateSource, GCPResourceState>>> desiredEntries =
      Arrays.asList(
          KV.of((GCPResource) sampleProject, KV.of(StateSource.DESIRED, desiredState)));
  List<KV<GCPResource, KV<StateSource, GCPResourceState>>> liveEntries =
      Arrays.asList(
          KV.of((GCPResource) sampleProject, KV.of(StateSource.LIVE, observedState)));

  // Materialize the desired entries as a map-shaped side input view.
  KvCoder<GCPResource, KV<StateSource, GCPResourceState>> entryCoder =
      KvCoder.of(
          SerializableCoder.of(GCPResource.class),
          KvCoder.of(
              SerializableCoder.of(StateSource.class),
              SerializableCoder.of(GCPResourceState.class)));
  PCollection<KV<GCPResource, KV<StateSource, GCPResourceState>>> desiredCollection =
      pipeline.apply(Create.of(desiredEntries)).setCoder(entryCoder);
  PCollectionView<Map<GCPResource, KV<StateSource, GCPResourceState>>> desiredView =
      View.<GCPResource, KV<StateSource, GCPResourceState>>asMap().apply(desiredCollection);

  DoFnTester<KV<GCPResource, KV<StateSource, GCPResourceState>>,
      KV<GCPResource, Map<StateSource, GCPResourceState>>> tester =
          DoFnTester.of(new FilterOutMatchingState(desiredView));
  tester.setSideInputInGlobalWindow(desiredView, desiredEntries);

  // Expected: one record for the project holding both the desired and the live state.
  Map<StateSource, GCPResourceState> statesBySource = new HashMap<>(2);
  statesBySource.put(StateSource.DESIRED, desiredState);
  statesBySource.put(StateSource.LIVE, observedState);
  List<KV<GCPResource, Map<StateSource, GCPResourceState>>> expectedOutput =
      Arrays.asList(KV.of((GCPResource) sampleProject, statesBySource));

  List<KV<GCPResource, Map<StateSource, GCPResourceState>>> actualOutput =
      tester.processBatch(liveEntries);

  assertEquals(expectedOutput, actualOutput);
}

Source File: FilterOutMatchingStateTest.java From policyscanner with Apache License 2.0

5 votes

/**
 * Runs {@code FilterOutMatchingState} with the very same policy object used as both the
 * desired and the live state of one project, and expects no output.
 */
@Test
public void testFilterStateNoMismatches() {
  GCPProject sampleProject = getSampleProject("");
  // A single policy instance serves as both the desired and the live state.
  GCPResourceState sharedState = getSampleGCPResourcePolicy(sampleProject, 1);

  List<KV<GCPResource, KV<StateSource, GCPResourceState>>> desiredEntries =
      Arrays.asList(
          KV.of((GCPResource) sampleProject, KV.of(StateSource.DESIRED, sharedState)));
  List<KV<GCPResource, KV<StateSource, GCPResourceState>>> liveEntries =
      Arrays.asList(
          KV.of((GCPResource) sampleProject, KV.of(StateSource.LIVE, sharedState)));

  // Materialize the desired entries as a map-shaped side input view.
  KvCoder<GCPResource, KV<StateSource, GCPResourceState>> entryCoder =
      KvCoder.of(
          SerializableCoder.of(GCPResource.class),
          KvCoder.of(
              SerializableCoder.of(StateSource.class),
              SerializableCoder.of(GCPResourceState.class)));
  PCollection<KV<GCPResource, KV<StateSource, GCPResourceState>>> desiredCollection =
      pipeline.apply(Create.of(desiredEntries)).setCoder(entryCoder);
  PCollectionView<Map<GCPResource, KV<StateSource, GCPResourceState>>> desiredView =
      View.<GCPResource, KV<StateSource, GCPResourceState>>asMap().apply(desiredCollection);

  DoFnTester<KV<GCPResource, KV<StateSource, GCPResourceState>>,
      KV<GCPResource, Map<StateSource, GCPResourceState>>> tester =
          DoFnTester.of(new FilterOutMatchingState(desiredView));
  tester.setSideInputInGlobalWindow(desiredView, desiredEntries);

  assertEquals(0, tester.processBatch(liveEntries).size());
}

Source File: DataflowFactory.java From dockerflow with Apache License 2.0

4 votes

/**
 * Dynamically construct a Dataflow pipeline from the workflow definition. The root
 * PCollection is created from the entries of {@code workflowArgs}; the workflow's step
 * graph is then applied on top of it. The pipeline is built but not run.
 *
 * <p>Side effects on the arguments: missing options on {@code o} are filled with defaults,
 * and {@code workflowArgs} (when non-null) is modified in place, either by adding a default
 * entry when it is empty or by merging the workflow's default args into every entry.
 *
 * @param workflow the workflow to translate; its definition must not be null
 * @param workflowArgs args per workflow instance, keyed by name; may be null or empty, in
 *     which case a single default {@code WorkflowArgs} is used, keyed by the workflow
 *     definition's name
 * @param o the pipeline options; app name, project, max workers and worker machine type are
 *     defaulted when unset
 * @return the constructed pipeline
 * @throws IOException declared by this method; presumably raised while building the step
 *     graph (the visible code does not show which call raises it)
 */
public static Pipeline dataflow(
    Workflow workflow, Map<String, WorkflowArgs> workflowArgs, DataflowPipelineOptions o)
    throws IOException {

  assert (workflow != null);
  assert (o != null);
  assert (workflow.getDefn() != null);

  // Set defaults
  if (o.getAppName() == null) {
    o.setAppName(workflow.getDefn().getName());
  }
  if (o.getProject() == null && workflow.getArgs() != null) {
    o.setProject(workflow.getArgs().getProjectId());
  }
  if (o.getMaxNumWorkers() == 0) {
    o.setMaxNumWorkers(1);
  }
  if (o.getWorkerMachineType() == null) {
    o.setWorkerMachineType(DEFAULT_MACHINE_TYPE);
  }

  LOG.info("Initializing dataflow pipeline");
  Pipeline p = Pipeline.create(o);

  LOG.info("Creating input collection of workflow args");
  if (workflowArgs == null) {
    workflowArgs = new HashMap<String, WorkflowArgs>();
  }
  if (workflowArgs.isEmpty()) {
    LOG.info("No workflow args were provided. Using default values.");
    workflowArgs.put(workflow.getDefn().getName(), new WorkflowArgs());
  } else if (workflow.getArgs() != null) {
    LOG.info("Merging default workflow args with instance-specific args");

    // Only the values are needed, so iterate them directly instead of looking each key up.
    for (WorkflowArgs instanceArgs : workflowArgs.values()) {
      instanceArgs.mergeDefaultArgs(workflow.getArgs());
      LOG.debug("Merged args: " + StringUtils.toJson(instanceArgs));
    }
  }

  LOG.info("Creating dataflow pipeline for workflow " + workflow.getDefn().getName());
  PCollection<KV<String, WorkflowArgs>> input = p.apply(Create.of(workflowArgs));
  input = dataflow(Workflow.Steps.graph(workflow), input);

  // The map is guaranteed non-empty at this point; the first entry's flag decides whether
  // the cleanup step is appended for the whole pipeline.
  if (workflowArgs.values().iterator().next().getDeleteFiles()) {
    LOG.info("Intermediate files will be deleted");
    input.apply(
        ParDo.named("DeleteIntermediateFiles").of(new DeleteIntermediateFiles(workflow)));
  }

  return p;
}

Source File: FlattenizeITCase.java From flink-dataflow with Apache License 2.0

4 votes

/**
 * Flattens two collections into {@code resultPath}, then extends the same list with a
 * third collection and flattens all three into {@code resultPath2}.
 */
@Override
protected void testProgram() throws Exception {
	Pipeline pipeline = FlinkTestPipeline.createForBatch();

	PCollection<String> first = pipeline.apply(Create.of(words));
	PCollection<String> second = pipeline.apply(Create.of(words2));

	// Flatten the first two collections and write them out.
	PCollectionList<String> firstTwo = PCollectionList.of(first).and(second);
	firstTwo
			.apply(Flatten.<String>pCollections())
			.apply(TextIO.Write.to(resultPath));

	// Extend the existing list with a third collection and flatten again.
	PCollection<String> third = pipeline.apply(Create.of(words3));
	PCollectionList<String> allThree = firstTwo.and(third);
	allThree
			.apply(Flatten.<String>pCollections())
			.apply(TextIO.Write.to(resultPath2));

	pipeline.run();
}

Source File: GatkPairedSingleSampleAlt.java From dockerflow with Apache License 2.0

4 votes

/**
 * Only this one method is different from GatkPairedSingleSample.java.
 *
 * <p>Builds the workflow graph by hand: the root collection fans out into two branches that
 * are merged, continues through a linear chain of tasks, fans out again into two branches
 * and is merged once more. The pipeline is returned without being run.
 *
 * <p>NOTE: {@code pipelineOptions} is not read here; the options are parsed from
 * {@code args} instead.
 *
 * @param argsTable per-sample args; the default workflow args are merged into every value
 * @param pipelineOptions not used by this implementation
 * @param args command-line arguments the pipeline options are parsed from
 * @return the constructed pipeline
 * @throws IOException declared by this method; the visible code does not show which call
 *     raises it
 */
@Override
public Pipeline createDataflow(
    Map<String, WorkflowArgs> argsTable,
    DataflowPipelineOptions pipelineOptions,
    String[] args) throws IOException {

  DataflowPipelineOptions options = DataflowFactory.pipelineOptions(args);
  options.setAppName(GatkPairedSingleSampleAlt.class.getSimpleName());
  Pipeline p = Pipeline.create(options);

  // Merge sample-specific args with default workflow args
  for (WorkflowArgs sampleArgs : argsTable.values()) {
    sampleArgs.mergeDefaultArgs(workflowArgs);
  }

  // Root of the graph: one element per entry of the args table.
  PCollection<KV<String, WorkflowArgs>> root = p.apply(Create.of(argsTable));

  // First fan-out: sequence grouping on one side, the alignment chain on the other.
  PCollection<KV<String, WorkflowArgs>> groupingBranch =
      root.apply(DockerDo.of(CreateSequenceGroupingTSV));
  PCollection<KV<String, WorkflowArgs>> alignmentBranch =
      root.apply(DockerDo.of(BwaVersion))
          .apply(DockerDo.of(SamToFastqAndBwaMem))
          .apply(DockerDo.of(MergeBamAlignment))
          .apply(DockerDo.of(SortAndFixReadGroupBam))
          .apply(DockerDo.of(MarkDuplicates))
          .apply(DockerDo.of(SortAndFixSampleBam));

  // Merge both branches and continue with the linear chain of tasks.
  PCollection<KV<String, WorkflowArgs>> mergedChain =
      PCollectionList.of(groupingBranch).and(alignmentBranch)
          .apply(new MergeBranches())
          .apply(DockerDo.of(BaseRecalibrator))
          .apply(DockerDo.of(ApplyBQSR))
          .apply(DockerDo.of(GatherBqsrReports))
          .apply(DockerDo.of(ApplyBQSRToUnmappedReads))
          .apply(DockerDo.of(GatherBamFiles));

  // Second fan-out, merged again at the end of the graph.
  PCollection<KV<String, WorkflowArgs>> cramBranch =
      mergedChain.apply(DockerDo.of(ConvertToCram));
  PCollection<KV<String, WorkflowArgs>> variantBranch =
      mergedChain.apply(DockerDo.of(HaplotypeCaller))
          .apply(DockerDo.of(GatherVCFs));
  PCollectionList.of(cramBranch).and(variantBranch).apply(new MergeBranches());

  return p;
}

Source File: ParDoMultiOutputITCase.java From flink-dataflow with Apache License 2.0

4 votes

/**
 * Splits a handful of words into a main output and several tagged side outputs with a
 * single ParDo, then writes only the "marked words" side output as text to
 * {@code resultPath}.
 */
@Override
protected void testProgram() throws Exception {
	Pipeline pipeline = FlinkTestPipeline.createForBatch();

	PCollection<String> words = pipeline.apply(
			Create.of("Hello", "Whatupmyman", "hey", "SPECIALthere", "MAAA", "MAAFOOO"));

	// Words no longer than the cut off go to the main output; for longer words only the
	// length is emitted. Words starting with "MAA" are additionally emitted as marked words.
	final int wordLengthCutOff = 3;

	// Tags for the main output and the declared side outputs.
	final TupleTag<String> wordsBelowCutOffTag = new TupleTag<String>(){};
	final TupleTag<Integer> wordLengthsAboveCutOffTag = new TupleTag<Integer>(){};
	final TupleTag<String> markedWordsTag = new TupleTag<String>(){};
	final TupleTagList sideOutputTags =
			TupleTagList.of(wordLengthsAboveCutOffTag).and(markedWordsTag);

	DoFn<String, String> splitWords = new DoFn<String, String>() {
		// Tag that is not part of the side output tags declared on the ParDo.
		final TupleTag<String> specialWordsTag = new TupleTag<String>() {
		};

		public void processElement(ProcessContext c) {
			String word = c.element();
			int length = word.length();
			if (length > wordLengthCutOff) {
				c.sideOutput(wordLengthsAboveCutOffTag, length);
			} else {
				c.output(word);
			}

			if (word.startsWith("MAA")) {
				c.sideOutput(markedWordsTag, word);
			}

			if (word.startsWith("SPECIAL")) {
				c.sideOutput(specialWordsTag, word);
			}
		}
	};

	PCollectionTuple results = words.apply(
			ParDo.withOutputTags(wordsBelowCutOffTag, sideOutputTags).of(splitWords));

	// Look up every declared output by its tag; only the marked words are written.
	PCollection<String> wordsBelowCutOff = results.get(wordsBelowCutOffTag);
	PCollection<Integer> wordLengthsAboveCutOff = results.get(wordLengthsAboveCutOffTag);
	PCollection<String> markedWords = results.get(markedWordsTag);

	markedWords.apply(TextIO.Write.to(resultPath));

	pipeline.run();
}

Source File: JoinExamplesITCase.java From flink-dataflow with Apache License 2.0

3 votes

/**
 * Joins the rows of {@code EVENT_ARRAY} and {@code CC_ARRAY} with
 * {@code JoinExamples.joinEvents} and writes the joined lines as text to
 * {@code resultPath}.
 */
@Override
protected void testProgram() throws Exception {
	Pipeline pipeline = FlinkTestPipeline.createForBatch();

	// The two in-memory inputs of the join.
	PCollection<TableRow> eventRows = pipeline.apply(Create.of(EVENT_ARRAY));
	PCollection<TableRow> ccRows = pipeline.apply(Create.of(CC_ARRAY));

	JoinExamples.joinEvents(eventRows, ccRows)
			.apply(TextIO.Write.to(resultPath));

	pipeline.run();
}

com.google.cloud.dataflow.sdk.transforms.Create Java Examples