Java Code Examples for org.apache.beam.runners.dataflow.options.DataflowPipelineOptions#setStreaming()

The following examples show how to use org.apache.beam.runners.dataflow.options.DataflowPipelineOptions#setStreaming(). Each example is taken from an open-source project; the source file and project are noted above each example so you can view the code in its original context.
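Before the project examples, here is a minimal, hypothetical sketch (not taken from any of the projects below) showing where setStreaming() fits when configuring a Dataflow pipeline. The project ID, region, and temp location values are placeholders.

// Minimal sketch: configure DataflowPipelineOptions for a streaming job.
// All string values below are placeholders, not from the examples on this page.
DataflowPipelineOptions options = PipelineOptionsFactory.as(DataflowPipelineOptions.class);
options.setRunner(DataflowRunner.class);
options.setProject("my-project");                    // placeholder GCP project ID
options.setRegion("us-central1");                    // placeholder Dataflow region
options.setGcpTempLocation("gs://my-bucket/temp");   // placeholder GCS temp path
options.setStreaming(true);                          // request streaming (rather than batch) execution
Pipeline pipeline = Pipeline.create(options);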
Example 1
Source File: DataflowRunnerTest.java    From beam with Apache License 2.0
@Test
public void testGcsUploadBufferSizeIsSetForStreamingWhenDefault() throws IOException {
  DataflowPipelineOptions streamingOptions = buildPipelineOptions();
  streamingOptions.setStreaming(true);
  streamingOptions.setRunner(DataflowRunner.class);
  Pipeline p = Pipeline.create(streamingOptions);

  // Instantiation of a runner prior to run() currently has a side effect of mutating the options.
  // This could be tested by DataflowRunner.fromOptions(streamingOptions) but would not ensure
  // that the pipeline itself had the expected options set.
  p.run();

  assertEquals(
      DataflowRunner.GCS_UPLOAD_BUFFER_SIZE_BYTES_DEFAULT,
      streamingOptions.getGcsUploadBufferSizeBytes().intValue());
}
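Note: several of the Beam test examples on this page call a buildPipelineOptions() helper that is not shown here; it is defined in DataflowRunnerTest.java in the Beam repository. As a rough, hypothetical approximation (the real helper also wires up test doubles such as a mocked Dataflow client), it resembles the setup in Example 3:

// Hypothetical sketch of a buildPipelineOptions()-style helper; not the actual
// Beam implementation, which configures additional test-only dependencies.
private static DataflowPipelineOptions buildPipelineOptions() {
  DataflowPipelineOptions options = PipelineOptionsFactory.as(DataflowPipelineOptions.class);
  options.setProject("some-project");                       // placeholder project ID
  options.setRegion("some-region1");                        // placeholder region
  options.setGcpTempLocation("gs://somebucket/some/path");  // placeholder GCS path
  options.setPathValidatorClass(NoopPathValidator.class);   // skip GCS path validation in tests
  return options;
}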
 
Example 2
Source File: DataflowRunnerTest.java    From beam with Apache License 2.0
@Test
public void testGcsUploadBufferSizeUnchangedWhenNotDefault() throws IOException {
  int gcsUploadBufferSizeBytes = 12345678;
  DataflowPipelineOptions batchOptions = buildPipelineOptions();
  batchOptions.setGcsUploadBufferSizeBytes(gcsUploadBufferSizeBytes);
  batchOptions.setRunner(DataflowRunner.class);
  Pipeline.create(batchOptions);
  assertEquals(gcsUploadBufferSizeBytes, batchOptions.getGcsUploadBufferSizeBytes().intValue());

  DataflowPipelineOptions streamingOptions = buildPipelineOptions();
  streamingOptions.setStreaming(true);
  streamingOptions.setGcsUploadBufferSizeBytes(gcsUploadBufferSizeBytes);
  streamingOptions.setRunner(DataflowRunner.class);
  Pipeline.create(streamingOptions);
  assertEquals(
      gcsUploadBufferSizeBytes, streamingOptions.getGcsUploadBufferSizeBytes().intValue());
}
 
Example 3
Source File: DataflowViewTest.java    From beam with Apache License 2.0
private Pipeline createTestStreamingRunner() {
  DataflowPipelineOptions options = PipelineOptionsFactory.as(DataflowPipelineOptions.class);
  options.setRunner(DataflowRunner.class);
  options.setStreaming(true);
  options.setProject("someproject");
  options.setRegion("some-region1");
  options.setGcpTempLocation("gs://staging");
  options.setPathValidatorClass(NoopPathValidator.class);
  options.setDataflowClient(dataflow);
  return Pipeline.create(options);
}
 
Example 4
Source File: DataflowPipelineTranslatorTest.java    From beam with Apache License 2.0
/**
 * Test that in translation the name for a collection (in this case just a Create output) is
 * overridden to be what the Dataflow service expects.
 */
@Test
public void testNamesOverridden() throws Exception {
  DataflowPipelineOptions options = buildPipelineOptions();
  DataflowRunner runner = DataflowRunner.fromOptions(options);
  options.setStreaming(false);
  DataflowPipelineTranslator translator = DataflowPipelineTranslator.fromOptions(options);

  Pipeline pipeline = Pipeline.create(options);

  pipeline.apply("Jazzy", Create.of(3)).setName("foobizzle");

  runner.replaceTransforms(pipeline);

  SdkComponents sdkComponents = createSdkComponents(options);
  RunnerApi.Pipeline pipelineProto = PipelineTranslation.toProto(pipeline, sdkComponents, true);
  Job job =
      translator
          .translate(pipeline, pipelineProto, sdkComponents, runner, Collections.emptyList())
          .getJob();

  // The Create step
  Step step = job.getSteps().get(0);

  // This is the name that is "set by the user" that the Dataflow translator must override
  String userSpecifiedName =
      getString(
          Structs.getListOfMaps(step.getProperties(), PropertyNames.OUTPUT_INFO, null).get(0),
          PropertyNames.USER_NAME);

  // This is the calculated name that must actually be used
  String calculatedName = getString(step.getProperties(), PropertyNames.USER_NAME) + ".out0";

  assertThat(userSpecifiedName, equalTo(calculatedName));
}
 
Example 5
Source File: DataflowRunnerTest.java    From beam with Apache License 2.0
@Test
public void testWorkerHarnessContainerImage() {
  DataflowPipelineOptions options = PipelineOptionsFactory.as(DataflowPipelineOptions.class);

  // default image set
  options.setWorkerHarnessContainerImage("some-container");
  assertThat(getContainerImageForJob(options), equalTo("some-container"));

  // batch, legacy
  options.setWorkerHarnessContainerImage("gcr.io/IMAGE/foo");
  options.setExperiments(null);
  options.setStreaming(false);
  System.setProperty("java.specification.version", "1.8");
  assertThat(getContainerImageForJob(options), equalTo("gcr.io/beam-java-batch/foo"));
  // batch, legacy, jdk11
  options.setStreaming(false);
  System.setProperty("java.specification.version", "11");
  assertThat(getContainerImageForJob(options), equalTo("gcr.io/beam-java11-batch/foo"));
  // streaming, legacy
  System.setProperty("java.specification.version", "1.8");
  options.setStreaming(true);
  assertThat(getContainerImageForJob(options), equalTo("gcr.io/beam-java-streaming/foo"));
  // streaming, legacy, jdk11
  System.setProperty("java.specification.version", "11");
  assertThat(getContainerImageForJob(options), equalTo("gcr.io/beam-java11-streaming/foo"));
  // streaming, fnapi
  options.setExperiments(ImmutableList.of("experiment1", "beam_fn_api"));
  assertThat(getContainerImageForJob(options), equalTo("gcr.io/java/foo"));
}
 
Example 6
Source File: PubsubWordCount.java    From cloud-bigtable-examples with Apache License 2.0
private static void injectMessages(BigtablePubsubOptions options) {
  String inputFile = options.getInputFile();
  String topic = options.getPubsubTopic();
  DataflowPipelineOptions copiedOptions = options.as(DataflowPipelineOptions.class);
  copiedOptions.setStreaming(false);
  copiedOptions.setNumWorkers(INJECTORNUMWORKERS);
  copiedOptions.setJobName(copiedOptions.getJobName() + "-injector");
  Pipeline injectorPipeline = Pipeline.create(copiedOptions);
  injectorPipeline.apply(TextIO.read().from(inputFile))
      .apply(ParDo.of(new FilterEmptyStringsFn()))
      .apply(PubsubIO.writeStrings().to(topic));
  injectorPipeline.run().waitUntilFinish();
}
 
Example 7
Source File: PubSubToBQPipeline.java    From pubsub-to-bigquery with Apache License 2.0
public static void main(String[] args) throws GeneralSecurityException, IOException, ParseException, ParserConfigurationException, SAXException {
	String params = null;
	for (int i = 0; i < args.length; i++) {
		if (args[i].startsWith("--params="))
			params = args[i].replaceFirst("--params=", "");
	}

	System.out.println(params);
	init(params);

	GoogleCredentials credentials = ServiceAccountCredentials.fromStream(new FileInputStream(keyFile))
	        .createScoped(Arrays.asList(new String[] { "https://www.googleapis.com/auth/cloud-platform" }));

	DataflowPipelineOptions options = PipelineOptionsFactory.create().as(DataflowPipelineOptions.class);
	
	options.setRunner(DataflowRunner.class);
	// Your project ID is required in order to run your pipeline on the Google Cloud.
	options.setProject(projectId);
	// Your Google Cloud Storage path is required for staging local files.
	options.setStagingLocation(workingBucket);
	options.setTempLocation(workingBucket + "/temp");
	options.setGcpCredential(credentials);
	options.setServiceAccount(accountEmail);
	options.setMaxNumWorkers(maxNumWorkers);
	options.setDiskSizeGb(diskSizeGb);
	options.setWorkerMachineType(machineType);
	options.setAutoscalingAlgorithm(AutoscalingAlgorithmType.THROUGHPUT_BASED);
	options.setZone(zone);
	options.setStreaming(isStreaming);
	options.setJobName(pipelineName);
	Pipeline pipeline = Pipeline.create(options);
	
	Gson gson = new Gson();
	TableSchema schema = gson.fromJson(schemaStr, TableSchema.class);
	
	PCollection<String> streamData = null;
	if(pubSubTopicSub != null && !StringUtils.isEmpty(pubSubTopicSub)){
		streamData = pipeline.apply("ReadPubSub",PubsubIO.readStrings().fromSubscription(String.format("projects/%1$s/subscriptions/%2$s",projectId,pubSubTopicSub)));
	}
	else if(pubSubTopic != null && !StringUtils.isEmpty(pubSubTopic)){
		streamData = pipeline.apply("ReadPubSub",PubsubIO.readStrings().fromTopic(String.format("projects/%1$s/topics/%2$s",projectId,pubSubTopic)));
	}
	
	PCollection<TableRow> tableRow = streamData.apply("ToTableRow",ParDo.of(new PrepData.ToTableRow(owTimestamp, debugMode)));
	
	
	tableRow.apply("WriteToBQ",
			BigQueryIO.writeTableRows()
			.to(String.format("%1$s.%2$s",bqDataSet, bqTable))
			.withSchema(schema)
			.withWriteDisposition(BigQueryIO.Write.WriteDisposition.WRITE_APPEND));

	System.out.println("Starting pipeline " + pipelineName);
	pipeline.run();
}
 
Example 8
Source File: DataflowPipelineTranslatorTest.java    From beam with Apache License 2.0
/**
 * Test that in translation the name for collections of a multi-output ParDo - a special case
 * because the user can name tags - are overridden to be what the Dataflow service expects.
 */
@Test
public void testTaggedNamesOverridden() throws Exception {
  DataflowPipelineOptions options = buildPipelineOptions();
  DataflowRunner runner = DataflowRunner.fromOptions(options);
  options.setStreaming(false);
  DataflowPipelineTranslator translator = DataflowPipelineTranslator.fromOptions(options);

  Pipeline pipeline = Pipeline.create(options);

  TupleTag<Integer> tag1 = new TupleTag<Integer>("frazzle") {};
  TupleTag<Integer> tag2 = new TupleTag<Integer>("bazzle") {};
  TupleTag<Integer> tag3 = new TupleTag<Integer>() {};

  PCollectionTuple outputs =
      pipeline
          .apply(Create.of(3))
          .apply(
              ParDo.of(
                      new DoFn<Integer, Integer>() {
                        @ProcessElement
                        public void drop() {}
                      })
                  .withOutputTags(tag1, TupleTagList.of(tag2).and(tag3)));

  outputs.get(tag1).setName("bizbazzle");
  outputs.get(tag2).setName("gonzaggle");
  outputs.get(tag3).setName("froonazzle");

  runner.replaceTransforms(pipeline);

  SdkComponents sdkComponents = createSdkComponents(options);
  RunnerApi.Pipeline pipelineProto = PipelineTranslation.toProto(pipeline, sdkComponents, true);
  Job job =
      translator
          .translate(pipeline, pipelineProto, sdkComponents, runner, Collections.emptyList())
          .getJob();

  // The ParDo step
  Step step = job.getSteps().get(1);
  String stepName = getString(step.getProperties(), PropertyNames.USER_NAME);

  List<Map<String, Object>> outputInfos =
      Structs.getListOfMaps(step.getProperties(), PropertyNames.OUTPUT_INFO, null);

  assertThat(outputInfos.size(), equalTo(3));

  // The names set by the user _and_ the tags _must_ be ignored, or metrics will not show up.
  for (int i = 0; i < outputInfos.size(); ++i) {
    assertThat(
        getString(outputInfos.get(i), PropertyNames.USER_NAME),
        equalTo(String.format("%s.out%s", stepName, i)));
  }
}
 
Example 9
Source File: DataflowPipelineTranslatorTest.java    From beam with Apache License 2.0
/** Smoke test to fail fast if translation of a stateful ParDo in batch breaks. */
@Test
public void testBatchStatefulParDoTranslation() throws Exception {
  DataflowPipelineOptions options = buildPipelineOptions();
  DataflowRunner runner = DataflowRunner.fromOptions(options);
  options.setStreaming(false);
  DataflowPipelineTranslator translator = DataflowPipelineTranslator.fromOptions(options);

  Pipeline pipeline = Pipeline.create(options);

  TupleTag<Integer> mainOutputTag = new TupleTag<Integer>() {};

  pipeline
      .apply(Create.of(KV.of(1, 1)))
      .apply(
          ParDo.of(
                  new DoFn<KV<Integer, Integer>, Integer>() {
                    @StateId("unused")
                    final StateSpec<ValueState<Integer>> stateSpec =
                        StateSpecs.value(VarIntCoder.of());

                    @ProcessElement
                    public void process(ProcessContext c) {
                      // noop
                    }
                  })
              .withOutputTags(mainOutputTag, TupleTagList.empty()));

  runner.replaceTransforms(pipeline);

  SdkComponents sdkComponents = createSdkComponents(options);
  RunnerApi.Pipeline pipelineProto = PipelineTranslation.toProto(pipeline, sdkComponents, true);
  Job job =
      translator
          .translate(pipeline, pipelineProto, sdkComponents, runner, Collections.emptyList())
          .getJob();

  // The job should look like:
  // 0. ParallelRead (Create)
  // 1. ParDo(ReifyWVs)
  // 2. GroupByKeyAndSortValuesOnly
  // 3. A ParDo over grouped and sorted KVs that is executed via ungrouping service-side

  List<Step> steps = job.getSteps();
  assertEquals(4, steps.size());

  Step createStep = steps.get(0);
  assertEquals("ParallelRead", createStep.getKind());

  Step reifyWindowedValueStep = steps.get(1);
  assertEquals("ParallelDo", reifyWindowedValueStep.getKind());

  Step gbkStep = steps.get(2);
  assertEquals("GroupByKey", gbkStep.getKind());

  Step statefulParDoStep = steps.get(3);
  assertEquals("ParallelDo", statefulParDoStep.getKind());
  assertThat(
      (String) statefulParDoStep.getProperties().get(PropertyNames.USES_KEYED_STATE),
      not(equalTo("true")));
}
 
Example 10
Source File: DataflowPipelineTranslatorTest.java    From beam with Apache License 2.0
/** Smoke test to fail fast if translation of a splittable ParDo in streaming breaks. */
@Test
public void testStreamingSplittableParDoTranslation() throws Exception {
  DataflowPipelineOptions options = buildPipelineOptions();
  DataflowRunner runner = DataflowRunner.fromOptions(options);
  options.setStreaming(true);
  DataflowPipelineTranslator translator = DataflowPipelineTranslator.fromOptions(options);

  Pipeline pipeline = Pipeline.create(options);

  PCollection<String> windowedInput =
      pipeline
          .apply(Create.of("a"))
          .apply(Window.into(FixedWindows.of(Duration.standardMinutes(1))));
  windowedInput.apply(ParDo.of(new TestSplittableFn()));

  runner.replaceTransforms(pipeline);

  SdkComponents sdkComponents = createSdkComponents(options);
  RunnerApi.Pipeline pipelineProto = PipelineTranslation.toProto(pipeline, sdkComponents, true);
  Job job =
      translator
          .translate(pipeline, pipelineProto, sdkComponents, runner, Collections.emptyList())
          .getJob();

  // The job should contain a SplittableParDo.ProcessKeyedElements step, translated as
  // "SplittableProcessKeyed".

  List<Step> steps = job.getSteps();
  Step processKeyedStep = null;
  for (Step step : steps) {
    if ("SplittableProcessKeyed".equals(step.getKind())) {
      assertNull(processKeyedStep);
      processKeyedStep = step;
    }
  }
  assertNotNull(processKeyedStep);

  @SuppressWarnings({"unchecked", "rawtypes"})
  DoFnInfo<String, Integer> fnInfo =
      (DoFnInfo<String, Integer>)
          SerializableUtils.deserializeFromByteArray(
              jsonStringToByteArray(
                  getString(processKeyedStep.getProperties(), PropertyNames.SERIALIZED_FN)),
              "DoFnInfo");
  assertThat(fnInfo.getDoFn(), instanceOf(TestSplittableFn.class));
  assertThat(
      fnInfo.getWindowingStrategy().getWindowFn(),
      Matchers.<WindowFn>equalTo(FixedWindows.of(Duration.standardMinutes(1))));
  assertThat(fnInfo.getInputCoder(), instanceOf(StringUtf8Coder.class));
  Coder<?> restrictionCoder =
      CloudObjects.coderFromCloudObject(
          (CloudObject)
              Structs.getObject(
                  processKeyedStep.getProperties(), PropertyNames.RESTRICTION_CODER));

  assertEquals(
      KvCoder.of(SerializableCoder.of(OffsetRange.class), VoidCoder.of()), restrictionCoder);
}