Java Code Examples for org.apache.beam.runners.dataflow.options.DataflowPipelineOptions#setRunner()

The following examples show how to use org.apache.beam.runners.dataflow.options.DataflowPipelineOptions#setRunner(). You can vote up the examples you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also check out the related API usage on the sidebar.
Example 1
Source File: BatchStatefulParDoOverridesTest.java    From beam with Apache License 2.0 6 votes vote down vote up
/**
 * Creates the {@link DataflowPipelineOptions} used by these tests, parsed from {@code args} and
 * then filled in with fixed test values.
 *
 * <p>The options carry a mocked {@link GcsUtil} and are also registered as the default options of
 * {@link FileSystems} before being returned.
 *
 * @param args command-line style arguments handed to {@link PipelineOptionsFactory#fromArgs}
 * @return the populated options
 */
private static DataflowPipelineOptions buildPipelineOptions(String... args) throws IOException {
  // GCS stub: expand() echoes back the path it was given, and every bucket is "accessible".
  GcsUtil gcsUtilStub = mock(GcsUtil.class);
  when(gcsUtilStub.expand(any(GcsPath.class)))
      .then(call -> ImmutableList.of((GcsPath) call.getArguments()[0]));
  when(gcsUtilStub.bucketAccessible(any(GcsPath.class))).thenReturn(true);

  DataflowPipelineOptions opts =
      PipelineOptionsFactory.fromArgs(args).as(DataflowPipelineOptions.class);
  opts.setRunner(DataflowRunner.class);
  opts.setGcpCredential(new TestCredential());
  opts.setJobName("some-job-name");
  opts.setProject("some-project");
  opts.setRegion("some-region");
  String tempLocation = GcsPath.fromComponents("somebucket", "some/path").toString();
  opts.setTempLocation(tempLocation);
  opts.setFilesToStage(new ArrayList<>());
  opts.setGcsUtil(gcsUtilStub);

  // Let the FileSystems API resolve gs:// URIs for the duration of the test.
  FileSystems.setDefaultPipelineOptions(opts);

  return opts;
}
 
Example 2
Source File: DataflowRunnerTest.java    From beam with Apache License 2.0 6 votes vote down vote up
/**
 * Checks that an explicitly configured GCS upload buffer size is still reported unchanged after
 * {@code Pipeline.create}, for both a batch and a streaming configuration.
 */
@Test
public void testGcsUploadBufferSizeUnchangedWhenNotDefault() throws IOException {
  final int explicitBufferSize = 12345678;

  // Batch configuration.
  DataflowPipelineOptions batch = buildPipelineOptions();
  batch.setGcsUploadBufferSizeBytes(explicitBufferSize);
  batch.setRunner(DataflowRunner.class);
  Pipeline.create(batch);
  int batchBufferSize = batch.getGcsUploadBufferSizeBytes().intValue();
  assertEquals(explicitBufferSize, batchBufferSize);

  // Streaming configuration.
  DataflowPipelineOptions streaming = buildPipelineOptions();
  streaming.setStreaming(true);
  streaming.setGcsUploadBufferSizeBytes(explicitBufferSize);
  streaming.setRunner(DataflowRunner.class);
  Pipeline.create(streaming);
  int streamingBufferSize = streaming.getGcsUploadBufferSizeBytes().intValue();
  assertEquals(explicitBufferSize, streamingBufferSize);
}
 
Example 3
Source File: DataflowRunnerTest.java    From beam with Apache License 2.0 6 votes vote down vote up
/**
 * Checks that, for a streaming pipeline with no explicit buffer size, the options report
 * {@code DataflowRunner.GCS_UPLOAD_BUFFER_SIZE_BYTES_DEFAULT} once the pipeline has been run.
 */
@Test
public void testGcsUploadBufferSizeIsSetForStreamingWhenDefault() throws IOException {
  DataflowPipelineOptions opts = buildPipelineOptions();
  opts.setStreaming(true);
  opts.setRunner(DataflowRunner.class);
  Pipeline pipeline = Pipeline.create(opts);

  // Constructing a runner ahead of run() currently mutates the options as a side effect.
  // DataflowRunner.fromOptions(opts) would exercise that too, but it would not prove that the
  // pipeline itself ended up with the expected options, so the pipeline is run instead.
  pipeline.run();

  int actualBufferSize = opts.getGcsUploadBufferSizeBytes().intValue();
  assertEquals(DataflowRunner.GCS_UPLOAD_BUFFER_SIZE_BYTES_DEFAULT, actualBufferSize);
}
 
Example 4
Source File: DataflowRunnerTest.java    From beam with Apache License 2.0 6 votes vote down vote up
/**
 * Creates {@link DataflowPipelineOptions} wired to this test's mocks and constants, and registers
 * them as the default options of {@link FileSystems}.
 *
 * @return the populated options
 */
private DataflowPipelineOptions buildPipelineOptions() throws IOException {
  DataflowPipelineOptions opts = PipelineOptionsFactory.as(DataflowPipelineOptions.class);
  opts.setRunner(DataflowRunner.class);
  opts.setProject(PROJECT_ID);
  opts.setTempLocation(VALID_TEMP_BUCKET);
  opts.setRegion(REGION_ID);
  // An explicit empty list keeps a default from being computed from the classpath.
  opts.setFilesToStage(new ArrayList<>());
  opts.setDataflowClient(buildMockDataflow());
  opts.setGcsUtil(mockGcsUtil);
  opts.setGcpCredential(new TestCredential());

  // Point the FileSystem registrar at these options.
  FileSystems.setDefaultPipelineOptions(opts);

  return opts;
}
 
Example 5
Source File: DataflowRunnerTest.java    From beam with Apache License 2.0 6 votes vote down vote up
/**
 * Runs the {@link DataflowRunner} with a {@code templateLocation} and the {@code upload_graph}
 * experiment enabled, and expects it to return normally. The template file written must contain
 * no raw steps and must have a {@code stepsLocation} entry.
 */
@Test
public void testTemplateRunnerWithUploadGraph() throws Exception {
  File templateFile = tmpFolder.newFile();

  DataflowPipelineOptions opts = PipelineOptionsFactory.as(DataflowPipelineOptions.class);
  opts.setExperiments(Arrays.asList("upload_graph"));
  opts.setJobName("TestJobName");
  opts.setGcpCredential(new TestCredential());
  opts.setPathValidatorClass(NoopPathValidator.class);
  opts.setProject("test-project");
  opts.setRegion(REGION_ID);
  opts.setRunner(DataflowRunner.class);
  opts.setTemplateLocation(templateFile.getPath());
  opts.setTempLocation(tmpFolder.getRoot().getPath());

  Pipeline pipeline = Pipeline.create(opts);
  pipeline.apply(Create.of(ImmutableList.of(1)));
  pipeline.run();
  expectedLogs.verifyInfo("Template successfully created");

  // Inspect the generated template: steps are empty and a stepsLocation is recorded.
  JsonNode template = new ObjectMapper().readTree(templateFile);
  assertEquals(0, template.get("steps").size());
  assertNotNull(template.get("stepsLocation"));
}
 
Example 6
Source File: BatchStatefulParDoOverridesTest.java    From beam with Apache License 2.0 6 votes vote down vote up
/**
 * Currently ignored (BEAM-2902). With the {@code beam_fn_api} experiment enabled, applies a
 * stateful multi-output {@code ParDo}, lets the runner replace transforms, and expects the
 * original {@code DoFn} to be found in the resulting pipeline.
 */
@Test
@Ignore(
    "TODO: BEAM-2902 Add support for user state in a ParDo.Multi once PTransformMatcher "
        + "exposes a way to know when the replacement is not required by checking that the "
        + "preceding ParDos to a GBK are key preserving.")
public void testFnApiMultiOutputOverrideNonCrashing() throws Exception {
  DataflowPipelineOptions opts = buildPipelineOptions("--experiments=beam_fn_api");
  opts.setRunner(DataflowRunner.class);
  Pipeline p = Pipeline.create(opts);

  TupleTag<Integer> mainTag = new TupleTag<Integer>() {};
  TupleTag<Integer> sideTag = new TupleTag<Integer>() {};
  TupleTagList additionalTags = TupleTagList.of(sideTag);

  DummyStatefulDoFn statefulFn = new DummyStatefulDoFn();
  p.apply(Create.of(KV.of(1, 2)))
      .apply(ParDo.of(statefulFn).withOutputTags(mainTag, additionalTags));

  DataflowRunner.fromOptions(opts).replaceTransforms(p);
  assertThat(findBatchStatefulDoFn(p), equalTo((DoFn) statefulFn));
}
 
Example 7
Source File: BatchStatefulParDoOverridesTest.java    From beam with Apache License 2.0 6 votes vote down vote up
/**
 * Applies a stateful multi-output {@code ParDo}, lets the {@link DataflowRunner} replace
 * transforms, and expects the original {@code DoFn} to be found in the resulting pipeline.
 */
@Test
public void testMultiOutputOverrideNonCrashing() throws Exception {
  DataflowPipelineOptions opts = buildPipelineOptions();
  opts.setRunner(DataflowRunner.class);
  Pipeline p = Pipeline.create(opts);

  TupleTag<Integer> mainTag = new TupleTag<Integer>() {};
  TupleTag<Integer> sideTag = new TupleTag<Integer>() {};
  TupleTagList additionalTags = TupleTagList.of(sideTag);

  DummyStatefulDoFn statefulFn = new DummyStatefulDoFn();
  p.apply(Create.of(KV.of(1, 2)))
      .apply(ParDo.of(statefulFn).withOutputTags(mainTag, additionalTags));

  DataflowRunner.fromOptions(opts).replaceTransforms(p);
  assertThat(findBatchStatefulDoFn(p), equalTo((DoFn) statefulFn));
}
 
Example 8
Source File: DataflowRunnerTest.java    From beam with Apache License 2.0 6 votes vote down vote up
/**
 * Runs the {@link DataflowRunner} with a {@code templateLocation} of {@code //bad/path} and
 * expects a {@link RuntimeException} whose message mentions that the output file cannot be
 * created.
 */
@Test
public void testTemplateRunnerLoggedErrorForFile() throws Exception {
  DataflowPipelineOptions opts = PipelineOptionsFactory.as(DataflowPipelineOptions.class);
  opts.setJobName("TestJobName");
  opts.setRunner(DataflowRunner.class);
  opts.setTemplateLocation("//bad/path");
  opts.setProject("test-project");
  opts.setRegion(REGION_ID);
  opts.setTempLocation(tmpFolder.getRoot().getPath());
  opts.setGcpCredential(new TestCredential());
  opts.setPathValidatorClass(NoopPathValidator.class);
  Pipeline pipeline = Pipeline.create(opts);

  // Expectations must be registered before the call that is supposed to throw.
  thrown.expectMessage("Cannot create output file at");
  thrown.expect(RuntimeException.class);
  pipeline.run();
}
 
Example 9
Source File: DataflowPipelineTranslatorTest.java    From beam with Apache License 2.0 6 votes vote down vote up
/**
 * Creates the {@link DataflowPipelineOptions} used by these tests, filled in with fixed test
 * values, a mocked {@link GcsUtil}, and a mock Dataflow client.
 *
 * <p>The options are also registered as the default options of {@link FileSystems} before being
 * returned.
 *
 * @return the populated options
 */
private static DataflowPipelineOptions buildPipelineOptions() throws IOException {
  // GCS stub: expand() echoes back the path it was given, and every bucket is "accessible".
  GcsUtil gcsUtilStub = mock(GcsUtil.class);
  when(gcsUtilStub.expand(any(GcsPath.class)))
      .then(call -> ImmutableList.of((GcsPath) call.getArguments()[0]));
  when(gcsUtilStub.bucketAccessible(any(GcsPath.class))).thenReturn(true);

  DataflowPipelineOptions opts = PipelineOptionsFactory.as(DataflowPipelineOptions.class);
  opts.setRunner(DataflowRunner.class);
  opts.setGcpCredential(new TestCredential());
  opts.setJobName("some-job-name");
  opts.setProject("some-project");
  opts.setRegion("some-region");
  String tempLocation = GcsPath.fromComponents("somebucket", "some/path").toString();
  opts.setTempLocation(tempLocation);
  opts.setFilesToStage(new ArrayList<>());
  opts.setDataflowClient(buildMockDataflow(new IsValidCreateRequest()));
  opts.setGcsUtil(gcsUtilStub);

  // Let the FileSystems API resolve gs:// URIs for the duration of the test.
  FileSystems.setDefaultPipelineOptions(opts);

  return opts;
}
 
Example 10
Source File: BatchStatefulParDoOverridesTest.java    From beam with Apache License 2.0 5 votes vote down vote up
/**
 * Applies a stateful single-output {@code ParDo}, lets the {@link DataflowRunner} replace
 * transforms, and expects the original {@code DoFn} to be found in the resulting pipeline.
 */
@Test
public void testSingleOutputOverrideNonCrashing() throws Exception {
  DataflowPipelineOptions opts = buildPipelineOptions();
  opts.setRunner(DataflowRunner.class);
  Pipeline p = Pipeline.create(opts);

  DummyStatefulDoFn statefulFn = new DummyStatefulDoFn();
  p.apply(Create.of(KV.of(1, 2)))
      .apply(ParDo.of(statefulFn));

  DataflowRunner.fromOptions(opts).replaceTransforms(p);
  assertThat(findBatchStatefulDoFn(p), equalTo((DoFn) statefulFn));
}
 
Example 11
Source File: DataflowPipelineTranslatorTest.java    From beam with Apache License 2.0 5 votes vote down vote up
/**
 * Builds a small read-then-write text pipeline on the {@link DataflowRunner} and applies the
 * runner's transform replacements to it.
 *
 * @param options pipeline options; their runner is set to {@link DataflowRunner} by this method
 * @return the pipeline after {@code replaceTransforms} has been applied
 */
private Pipeline buildPipeline(DataflowPipelineOptions options) {
  options.setRunner(DataflowRunner.class);
  Pipeline pipeline = Pipeline.create(options);

  pipeline
      .apply("ReadMyFile", TextIO.read().from("gs://bucket/object"))
      .apply("WriteMyFile", TextIO.write().to("gs://bucket/object"));

  DataflowRunner.fromOptions(options).replaceTransforms(pipeline);

  return pipeline;
}
 
Example 12
Source File: DataflowViewTest.java    From beam with Apache License 2.0 5 votes vote down vote up
/**
 * Creates a pipeline configured for the {@link DataflowRunner} with streaming enabled, using
 * placeholder project settings and this test's {@code dataflow} client.
 *
 * @return a new pipeline built from those options
 */
private Pipeline createTestStreamingRunner() {
  DataflowPipelineOptions streamingOpts =
      PipelineOptionsFactory.as(DataflowPipelineOptions.class);
  streamingOpts.setRunner(DataflowRunner.class);
  streamingOpts.setStreaming(true);
  streamingOpts.setProject("someproject");
  streamingOpts.setRegion("some-region1");
  streamingOpts.setGcpTempLocation("gs://staging");
  streamingOpts.setPathValidatorClass(NoopPathValidator.class);
  streamingOpts.setDataflowClient(dataflow);

  Pipeline pipeline = Pipeline.create(streamingOpts);
  return pipeline;
}
 
Example 13
Source File: DataflowRunnerTest.java    From beam with Apache License 2.0 5 votes vote down vote up
/**
 * Checks that the staging location is non-null after {@code DataflowRunner.fromOptions} when
 * only the temp location was set explicitly.
 */
@Test
public void testGcsStagingLocationInitialization() throws Exception {
  DataflowPipelineOptions opts = PipelineOptionsFactory.as(DataflowPipelineOptions.class);
  // Only the (required) temp location is provided; no staging location is set here.
  opts.setTempLocation(VALID_TEMP_BUCKET);
  opts.setProject(PROJECT_ID);
  opts.setRegion(REGION_ID);
  opts.setGcpCredential(new TestCredential());
  opts.setGcsUtil(mockGcsUtil);
  opts.setRunner(DataflowRunner.class);

  DataflowRunner.fromOptions(opts);

  // The staging location must be set once the runner has been created.
  assertNotNull(opts.getStagingLocation());
}
 
Example 14
Source File: DataflowRunnerTest.java    From beam with Apache License 2.0 5 votes vote down vote up
/**
 * Checks that {@code DataflowRunner.fromOptions} throws an {@link IllegalArgumentException}
 * about the missing {@code gcpTempLocation} when neither a staging nor a temp location is set.
 */
@Test
public void testNoStagingLocationAndNoTempLocationFails() {
  DataflowPipelineOptions opts = PipelineOptionsFactory.as(DataflowPipelineOptions.class);
  opts.setRunner(DataflowRunner.class);
  opts.setProject("foo-project");
  opts.setRegion(REGION_ID);

  String expectedMessage =
      "DataflowRunner requires gcpTempLocation, "
          + "but failed to retrieve a value from PipelineOption";
  thrown.expect(IllegalArgumentException.class);
  thrown.expectMessage(expectedMessage);

  DataflowRunner.fromOptions(opts);
}
 
Example 15
Source File: DataflowRunnerTest.java    From beam with Apache License 2.0 5 votes vote down vote up
/**
 * Checks that {@code DataflowRunner.fromOptions} completes without throwing when only
 * {@code gcpTempLocation} is set and no {@code tempLocation} is given.
 */
@Test
public void testGcpTempAndNoTempLocationSucceeds() throws Exception {
  DataflowPipelineOptions opts = PipelineOptionsFactory.as(DataflowPipelineOptions.class);
  opts.setRunner(DataflowRunner.class);
  opts.setGcpCredential(new TestCredential());
  opts.setProject("foo-project");
  opts.setRegion(REGION_ID);
  // gcpTempLocation is the only temp-related location configured.
  opts.setGcpTempLocation(VALID_TEMP_BUCKET);
  opts.setGcsUtil(mockGcsUtil);

  // Passing means no exception was raised here.
  DataflowRunner.fromOptions(opts);
}
 
Example 16
Source File: DataflowRunnerTest.java    From beam with Apache License 2.0 5 votes vote down vote up
/**
 * Checks that, for a batch configuration with no explicit buffer size, the GCS upload buffer
 * size is still {@code null} after {@code Pipeline.create}.
 */
@Test
public void testGcsUploadBufferSizeIsUnsetForBatchWhenDefault() throws IOException {
  DataflowPipelineOptions opts = buildPipelineOptions();
  opts.setRunner(DataflowRunner.class);

  Pipeline.create(opts);

  assertNull(opts.getGcsUploadBufferSizeBytes());
}
 
Example 17
Source File: DataflowViewTest.java    From beam with Apache License 2.0 5 votes vote down vote up
/**
 * Creates a pipeline configured for the {@link DataflowRunner} without enabling streaming, using
 * placeholder project settings and this test's {@code dataflow} client.
 *
 * @return a new pipeline built from those options
 */
private Pipeline createTestBatchRunner() {
  DataflowPipelineOptions batchOpts = PipelineOptionsFactory.as(DataflowPipelineOptions.class);
  batchOpts.setRunner(DataflowRunner.class);
  batchOpts.setProject("someproject");
  batchOpts.setRegion("some-region1");
  batchOpts.setGcpTempLocation("gs://staging");
  batchOpts.setPathValidatorClass(NoopPathValidator.class);
  batchOpts.setDataflowClient(dataflow);

  Pipeline pipeline = Pipeline.create(batchOpts);
  return pipeline;
}
 
Example 18
Source File: DataflowGroupByKeyTest.java    From beam with Apache License 2.0 5 votes vote down vote up
/**
 * Builds a test pipeline on the {@link DataflowRunner}, so that {@link GroupByKey} is not
 * expanded. Tests use it to confirm that the proper errors still appear without expansion.
 *
 * @return a new pipeline built from the test options
 */
private Pipeline createTestServiceRunner() {
  DataflowPipelineOptions serviceOpts =
      PipelineOptionsFactory.as(DataflowPipelineOptions.class);
  serviceOpts.setRunner(DataflowRunner.class);
  serviceOpts.setProject("someproject");
  serviceOpts.setRegion("some-region1");
  serviceOpts.setGcpTempLocation("gs://staging");
  serviceOpts.setPathValidatorClass(NoopPathValidator.class);
  serviceOpts.setDataflowClient(dataflow);

  Pipeline pipeline = Pipeline.create(serviceOpts);
  return pipeline;
}
 
Example 19
Source File: PubSubToBQPipeline.java    From pubsub-to-bigquery with Apache License 2.0 4 votes vote down vote up
/**
 * Entry point: configures a Dataflow pipeline that reads strings from Pub/Sub, converts them to
 * {@code TableRow}s, and appends them to a BigQuery table.
 *
 * <p>The value of the last {@code --params=} argument (or {@code null} if there is none) is
 * printed and passed to {@code init}. NOTE(review): the settings read below ({@code keyFile},
 * {@code projectId}, {@code pubSubTopic}, ...) are presumably populated by {@code init} — confirm
 * against its definition.
 *
 * <p>The source is the configured subscription if one is set, otherwise the configured topic.
 *
 * @param args command-line arguments; only {@code --params=<value>} is interpreted here
 * @throws IllegalArgumentException if neither a Pub/Sub subscription nor a topic is configured
 * @throws IOException if the service-account key file cannot be read
 */
public static void main(String[] args) throws GeneralSecurityException, IOException, ParseException, ParserConfigurationException, SAXException {
	final String paramsFlag = "--params=";
	String params = null;
	for (String arg : args) {
		if (arg.startsWith(paramsFlag)) {
			// Plain prefix removal; the last occurrence of the flag wins.
			params = arg.substring(paramsFlag.length());
		}
	}

	System.out.println(params);
	init(params);

	// Close the key file once the credentials have been read from it.
	GoogleCredentials credentials;
	try (FileInputStream keyStream = new FileInputStream(keyFile)) {
		credentials = ServiceAccountCredentials.fromStream(keyStream)
		        .createScoped(Arrays.asList(new String[] { "https://www.googleapis.com/auth/cloud-platform" }));
	}

	DataflowPipelineOptions options = PipelineOptionsFactory.create().as(DataflowPipelineOptions.class);

	options.setRunner(DataflowRunner.class);
	// Your project ID is required in order to run your pipeline on the Google Cloud.
	options.setProject(projectId);
	// Your Google Cloud Storage path is required for staging local files.
	options.setStagingLocation(workingBucket);
	options.setTempLocation(workingBucket + "/temp");
	options.setGcpCredential(credentials);
	options.setServiceAccount(accountEmail);
	options.setMaxNumWorkers(maxNumWorkers);
	options.setDiskSizeGb(diskSizeGb);
	options.setWorkerMachineType(machineType);
	options.setAutoscalingAlgorithm(AutoscalingAlgorithmType.THROUGHPUT_BASED);
	options.setZone(zone);
	options.setStreaming(isStreaming);
	options.setJobName(pipelineName);
	Pipeline pipeline = Pipeline.create(options);

	Gson gson = new Gson();
	TableSchema schema = gson.fromJson(schemaStr, TableSchema.class);

	// A subscription takes precedence over a topic. With neither configured there is nothing to
	// read, so fail with a clear message instead of a NullPointerException further down.
	final PCollection<String> streamData;
	if (pubSubTopicSub != null && !StringUtils.isEmpty(pubSubTopicSub)) {
		streamData = pipeline.apply("ReadPubSub", PubsubIO.readStrings().fromSubscription(String.format("projects/%1$s/subscriptions/%2$s", projectId, pubSubTopicSub)));
	} else if (pubSubTopic != null && !StringUtils.isEmpty(pubSubTopic)) {
		streamData = pipeline.apply("ReadPubSub", PubsubIO.readStrings().fromTopic(String.format("projects/%1$s/topics/%2$s", projectId, pubSubTopic)));
	} else {
		throw new IllegalArgumentException(
				"Either a Pub/Sub subscription or a Pub/Sub topic must be configured");
	}

	PCollection<TableRow> tableRow = streamData.apply("ToTableRow", ParDo.of(new PrepData.ToTableRow(owTimestamp, debugMode)));

	tableRow.apply("WriteToBQ",
			BigQueryIO.writeTableRows()
			.to(String.format("%1$s.%2$s", bqDataSet, bqTable))
			.withSchema(schema)
			.withWriteDisposition(BigQueryIO.Write.WriteDisposition.WRITE_APPEND));

	System.out.println("Starting pipeline " + pipelineName);
	pipeline.run();
}
 
Example 20
Source File: WorkerCustomSourcesSplitOnlySourceTest.java    From beam with Apache License 2.0 4 votes vote down vote up
/**
 * Splits a {@code CountingSource} of {@code numberOfSplits} elements with a desired bundle size
 * of one byte and checks that, after expanding every returned bundle, there is exactly one split
 * per element, each covering the offset range {@code [i, i + 1)}.
 */
@Test
public void testAllSplitsAreReturned() throws Exception {
  final long apiSizeLimitForTest = 500 * 1024;

  DataflowPipelineOptions opts = PipelineOptionsFactory.as(DataflowPipelineOptions.class);
  opts.setAppName("TestAppName");
  opts.setProject("test-project");
  opts.setRegion("some-region1");
  opts.setTempLocation("gs://test/temp/location");
  opts.setGcpCredential(new TestCredential());
  opts.setRunner(DataflowRunner.class);
  opts.setPathValidatorClass(NoopPathValidator.class);

  // Build a CountingSource and request splits of 1 byte each. That triggers a re-split with a
  // larger bundle size, so 'numberOfSplits' splits are expected overall once expanded below.
  com.google.api.services.dataflow.model.Source cloudSource =
      WorkerCustomSourcesTest.translateIOToCloudSource(
          CountingSource.upTo(numberOfSplits), opts);
  SourceSplitResponse splitResponse =
      WorkerCustomSourcesTest.performSplit(
          cloudSource, opts, 1L, null /* numBundles limit */, apiSizeLimitForTest);
  int bundleCount = splitResponse.getBundles().size();
  assertThat(bundleCount, lessThanOrEqualTo(WorkerCustomSources.DEFAULT_NUM_BUNDLES_LIMIT));

  // Gather every split, expanding bundles that are themselves splittable-only sources.
  List<OffsetBasedSource<?>> collectedSplits = new ArrayList<>(numberOfSplits);
  for (DerivedSource bundle : splitResponse.getBundles()) {
    Object decoded =
        WorkerCustomSources.deserializeFromCloudSource(bundle.getSource().getSpec());
    if (!(decoded instanceof SplittableOnlyBoundedSource)) {
      collectedSplits.add((OffsetBasedSource<?>) decoded);
      continue;
    }
    SplittableOnlyBoundedSource<?> splittableOnly = (SplittableOnlyBoundedSource<?>) decoded;
    collectedSplits.addAll((List) splittableOnly.split(1L, opts));
  }

  assertEquals(numberOfSplits, collectedSplits.size());
  int index = 0;
  while (index < collectedSplits.size()) {
    OffsetBasedSource<?> current = (OffsetBasedSource<?>) collectedSplits.get(index);
    assertEquals(index, current.getStartOffset());
    assertEquals(index + 1, current.getEndOffset());
    index++;
  }
}