Java Code Examples for org.apache.beam.sdk.options.PipelineOptionsFactory#create()

The following examples show how to use org.apache.beam.sdk.options.PipelineOptionsFactory#create(). All of the examples are taken from the Apache Beam project's test suite.
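As a primer before the examples: PipelineOptionsFactory.create() returns a PipelineOptions instance with every field set to its default value, which is why it appears so often in tests. Below is a minimal, self-contained sketch of the two common ways to obtain options; the class and job names are illustrative, not taken from the examples below.

import org.apache.beam.sdk.options.PipelineOptions;
import org.apache.beam.sdk.options.PipelineOptionsFactory;

public class OptionsExample {
  public static void main(String[] args) {
    // Create options with all fields at their defaults; setters can be used afterwards.
    PipelineOptions options = PipelineOptionsFactory.create();
    options.setJobName("example-job");

    // Alternatively, parse command-line arguments (e.g. --jobName=example-job)
    // and validate them before creating the options.
    PipelineOptions parsed = PipelineOptionsFactory.fromArgs(args).withValidation().create();
    System.out.println(parsed.getJobName());
  }
}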
Example 1
Source File: BoundedReadFromUnboundedSourceTest.java    From beam with Apache License 2.0
@Test
public void testCanResumeWithExpandedCount() throws IOException {
  BoundedReadFromUnboundedSourceTest.TestCountingSource source =
      new BoundedReadFromUnboundedSourceTest.TestCountingSource(1);
  PipelineOptions options = PipelineOptionsFactory.create();
  BoundedReadFromUnboundedSourceTest.TestCountingSource.CountingSourceReader reader =
      source.createReader(options, null /* no checkpoint */);
  assertTrue(reader.start());
  assertEquals(0L, (long) reader.getCurrent().getValue());
  assertFalse(reader.advance());
  BoundedReadFromUnboundedSourceTest.TestCountingSource.CounterMark checkpoint =
      reader.getCheckpointMark();
  checkpoint.finalizeCheckpoint();
  source = new BoundedReadFromUnboundedSourceTest.TestCountingSource(2);
  reader = source.createReader(options, checkpoint);
  assertTrue(reader.start());
  assertEquals(1L, (long) reader.getCurrent().getValue());
  assertFalse(reader.advance());
}
 
Example 2
Source File: BigQueryServicesImplTest.java    From beam with Apache License 2.0
@Test
public void testIsTableEmptyNoRetryForNotFound() throws IOException, InterruptedException {
  when(response.getContentType()).thenReturn(Json.MEDIA_TYPE);
  when(response.getStatusCode()).thenReturn(404);

  BigQueryServicesImpl.DatasetServiceImpl datasetService =
      new BigQueryServicesImpl.DatasetServiceImpl(bigquery, PipelineOptionsFactory.create());

  TableReference tableRef =
      new TableReference()
          .setProjectId("projectId")
          .setDatasetId("datasetId")
          .setTableId("tableId");

  thrown.expect(IOException.class);
  thrown.expectMessage(String.format("Unable to list table data: %s", tableRef.getTableId()));

  try {
    datasetService.isTableEmpty(tableRef, BackOff.ZERO_BACKOFF, Sleeper.DEFAULT);
  } finally {
    verify(response, times(1)).getStatusCode();
    verify(response, times(1)).getContent();
    verify(response, times(1)).getContentType();
  }
}
 
Example 3
Source File: BigQueryServicesImplTest.java    From beam with Apache License 2.0
@Test
public void testGetTableNotFound() throws IOException, InterruptedException {
  when(response.getContentType()).thenReturn(Json.MEDIA_TYPE);
  when(response.getStatusCode()).thenReturn(404);

  BigQueryServicesImpl.DatasetServiceImpl datasetService =
      new BigQueryServicesImpl.DatasetServiceImpl(bigquery, PipelineOptionsFactory.create());

  TableReference tableRef =
      new TableReference()
          .setProjectId("projectId")
          .setDatasetId("datasetId")
          .setTableId("tableId");
  Table table = datasetService.getTable(tableRef, null, BackOff.ZERO_BACKOFF, Sleeper.DEFAULT);

  assertNull(table);
  verify(response, times(1)).getStatusCode();
  verify(response, times(1)).getContent();
  verify(response, times(1)).getContentType();
}
 
Example 4
Source File: PipelineTranslationModeOptimizerTest.java    From beam with Apache License 2.0
@Test
public void testBoundedCollectionProducingTransform() {
  PipelineOptions options = PipelineOptionsFactory.create();
  options.setRunner(FlinkRunner.class);
  Pipeline pipeline = Pipeline.create(options);
  pipeline.apply(GenerateSequence.from(0).to(10));

  assertThat(PipelineTranslationModeOptimizer.hasUnboundedOutput(pipeline), is(false));
}
 
Example 5
Source File: UnboundedReadFromBoundedSourceTest.java    From beam with Apache License 2.0
private <T> void testBoundedToUnboundedSourceAdapterCheckpointRestart(
    BoundedSource<T> boundedSource, List<T> expectedElements) throws Exception {
  BoundedToUnboundedSourceAdapter<T> unboundedSource =
      new BoundedToUnboundedSourceAdapter<>(boundedSource);

  PipelineOptions options = PipelineOptionsFactory.create();
  BoundedToUnboundedSourceAdapter<T>.Reader reader = unboundedSource.createReader(options, null);

  List<T> actual = Lists.newArrayList();
  for (boolean hasNext = reader.start(); hasNext; ) {
    actual.add(reader.getCurrent());
    // checkpoint every 9 elements
    if (actual.size() % 9 == 0) {
      Checkpoint<T> checkpoint = reader.getCheckpointMark();
      Coder<Checkpoint<T>> checkpointCoder = unboundedSource.getCheckpointMarkCoder();
      Checkpoint<T> decodedCheckpoint =
          CoderUtils.decodeFromByteArray(
              checkpointCoder, CoderUtils.encodeToByteArray(checkpointCoder, checkpoint));
      reader.close();
      checkpoint.finalizeCheckpoint();

      BoundedToUnboundedSourceAdapter<T>.Reader restarted =
          unboundedSource.createReader(options, decodedCheckpoint);
      reader = restarted;
      hasNext = reader.start();
    } else {
      hasNext = reader.advance();
    }
  }
  Checkpoint<T> checkpointDone = reader.getCheckpointMark();
  assertTrue(
      checkpointDone.getResidualElements() == null
          || checkpointDone.getResidualElements().isEmpty());

  assertEquals(expectedElements.size(), actual.size());
  assertEquals(Sets.newHashSet(expectedElements), Sets.newHashSet(actual));
}
 
Example 6
Source File: JmsIOTest.java    From beam with Apache License 2.0
@Test
public void testSplitForTopic() throws Exception {
  JmsIO.Read read = JmsIO.read().withTopic(TOPIC);
  PipelineOptions pipelineOptions = PipelineOptionsFactory.create();
  int desiredNumSplits = 5;
  JmsIO.UnboundedJmsSource initialSource = new JmsIO.UnboundedJmsSource(read);
  List<JmsIO.UnboundedJmsSource> splits = initialSource.split(desiredNumSplits, pipelineOptions);
  // In the case of a topic, we can have only a single subscriber on the topic per pipeline;
  // otherwise we would get duplicate messages (all subscribers on the topic receive every
  // message).
  // So, whatever desiredNumSplits is, the actual number of splits should be 1.
  assertEquals(1, splits.size());
}
 
Example 7
Source File: XmlSourceTest.java    From beam with Apache License 2.0
@Test
public void testSplitAtFractionExhaustiveSingleByte() throws Exception {
  PipelineOptions options = PipelineOptionsFactory.create();
  File file = tempFolder.newFile("trainXMLSmall");
  Files.write(file.toPath(), trainXMLWithAllFeaturesSingleByte.getBytes(StandardCharsets.UTF_8));

  BoundedSource<Train> source =
      XmlIO.<Train>read()
          .from(file.toPath().toString())
          .withRootElement("trains")
          .withRecordElement("train")
          .withRecordClass(Train.class)
          .createSource();
  assertSplitAtFractionExhaustive(source, options);
}
 
Example 8
Source File: BigQueryServicesImplTest.java    From beam with Apache License 2.0
/** Tests that {@link DatasetServiceImpl#insertAll} retries quota-exceeded attempts. */
@Test
public void testInsertQuotaExceededRetry() throws Exception {
  TableReference ref =
      new TableReference().setProjectId("project").setDatasetId("dataset").setTableId("table");
  List<ValueInSingleWindow<TableRow>> rows = new ArrayList<>();
  rows.add(wrapValue(new TableRow()));

  // First response is 403 quota exceeded, second response has valid payload.
  when(response.getContentType()).thenReturn(Json.MEDIA_TYPE);
  when(response.getStatusCode()).thenReturn(403).thenReturn(200);
  when(response.getContent())
      .thenReturn(toStream(errorWithReasonAndStatus("quotaExceeded", 403)))
      .thenReturn(toStream(new TableDataInsertAllResponse()));

  DatasetServiceImpl dataService =
      new DatasetServiceImpl(bigquery, PipelineOptionsFactory.create());
  dataService.insertAll(
      ref,
      rows,
      null,
      BackOffAdapter.toGcpBackOff(TEST_BACKOFF.backoff()),
      new MockSleeper(),
      InsertRetryPolicy.alwaysRetry(),
      null,
      null,
      false,
      false,
      false);
  verify(response, times(2)).getStatusCode();
  verify(response, times(2)).getContent();
  verify(response, times(2)).getContentType();
  expectedLogs.verifyInfo("BigQuery insertAll error, retrying:");
}
 
Example 9
Source File: AvroByteSinkFactoryTest.java    From beam with Apache License 2.0
private Sink<?> runTestCreateAvroSink(String filename, Coder<?> coder) throws Exception {
  CloudObject spec = CloudObject.forClassName("AvroSink");
  addString(spec, "filename", filename);
  PipelineOptions options = PipelineOptionsFactory.create();

  AvroByteSinkFactory factory = new AvroByteSinkFactory();
  Sink<?> sink =
      factory.create(
          spec,
          coder,
          options,
          BatchModeExecutionContext.forTesting(options, "testStage"),
          TestOperationContext.create());
  return sink;
}
 
Example 10
Source File: BigQueryServicesImplTest.java    From beam with Apache License 2.0
/** Tests that {@link BigQueryServicesImpl} retries rate-limited attempts. */
@Test
public void testCreateTableRetry() throws IOException {
  TableReference ref =
      new TableReference().setProjectId("project").setDatasetId("dataset").setTableId("table");
  Table testTable = new Table().setTableReference(ref);

  // First response is 403 rate limited, second response has valid payload.
  when(response.getContentType()).thenReturn(Json.MEDIA_TYPE);
  when(response.getStatusCode()).thenReturn(403).thenReturn(200);
  when(response.getContent())
      .thenReturn(toStream(errorWithReasonAndStatus("rateLimitExceeded", 403)))
      .thenReturn(toStream(testTable));

  BigQueryServicesImpl.DatasetServiceImpl services =
      new BigQueryServicesImpl.DatasetServiceImpl(bigquery, PipelineOptionsFactory.create());
  Table ret =
      services.tryCreateTable(
          testTable, new RetryBoundedBackOff(3, BackOff.ZERO_BACKOFF), Sleeper.DEFAULT);
  assertEquals(testTable, ret);
  verify(response, times(2)).getStatusCode();
  verify(response, times(2)).getContent();
  verify(response, times(2)).getContentType();
  verifyNotNull(ret.getTableReference());
  expectedLogs.verifyInfo(
      "Quota limit reached when creating table project:dataset.table, "
          + "retrying up to 5.0 minutes");
}
 
Example 11
Source File: FileBasedSourceTest.java    From beam with Apache License 2.0
@Test
public void testReadRangeFromFileWithSplitsFromMiddleOfHeader() throws IOException {
  PipelineOptions options = PipelineOptionsFactory.create();
  String header = "<h>";
  List<String> data = new ArrayList<>();
  for (int i = 0; i < 10; i++) {
    data.add(header);
    data.addAll(createStringDataset(3, 9));
  }
  String fileName = "file";
  File file = createFileWithData(fileName, data);

  List<String> expectedResults = new ArrayList<>();
  expectedResults.addAll(data.subList(10, data.size()));
  // Remove all occurrences of header from expected results.
  expectedResults.removeAll(Collections.singletonList(header));

  Metadata metadata = FileSystems.matchSingleFileSpec(file.getPath());
  // Split starts after "<" of the header
  TestFileBasedSource source = new TestFileBasedSource(metadata, 64, 1, Long.MAX_VALUE, header);
  assertThat(expectedResults, containsInAnyOrder(readFromSource(source, options).toArray()));

  // Split starts after "<h" of the header
  source = new TestFileBasedSource(metadata, 64, 2, Long.MAX_VALUE, header);
  assertThat(expectedResults, containsInAnyOrder(readFromSource(source, options).toArray()));

  // Split starts after "<h>" of the header
  source = new TestFileBasedSource(metadata, 64, 3, Long.MAX_VALUE, header);
  assertThat(expectedResults, containsInAnyOrder(readFromSource(source, options).toArray()));
}
 
Example 12
Source File: BeamFnDataGrpcClientTest.java    From beam with Apache License 2.0
@Test
public void testForInboundConsumerThatThrows() throws Exception {
  CountDownLatch waitForClientToConnect = new CountDownLatch(1);
  AtomicInteger consumerInvoked = new AtomicInteger();
  Collection<BeamFnApi.Elements> inboundServerValues = new ConcurrentLinkedQueue<>();
  AtomicReference<StreamObserver<BeamFnApi.Elements>> outboundServerObserver =
      new AtomicReference<>();
  CallStreamObserver<BeamFnApi.Elements> inboundServerObserver =
      TestStreams.withOnNext(inboundServerValues::add).build();

  Endpoints.ApiServiceDescriptor apiServiceDescriptor =
      Endpoints.ApiServiceDescriptor.newBuilder()
          .setUrl(this.getClass().getName() + "-" + UUID.randomUUID().toString())
          .build();
  Server server =
      InProcessServerBuilder.forName(apiServiceDescriptor.getUrl())
          .addService(
              new BeamFnDataGrpc.BeamFnDataImplBase() {
                @Override
                public StreamObserver<BeamFnApi.Elements> data(
                    StreamObserver<BeamFnApi.Elements> outboundObserver) {
                  outboundServerObserver.set(outboundObserver);
                  waitForClientToConnect.countDown();
                  return inboundServerObserver;
                }
              })
          .build();
  server.start();
  RuntimeException exceptionToThrow = new RuntimeException("TestFailure");
  try {
    ManagedChannel channel =
        InProcessChannelBuilder.forName(apiServiceDescriptor.getUrl()).build();

    BeamFnDataGrpcClient clientFactory =
        new BeamFnDataGrpcClient(
            PipelineOptionsFactory.create(),
            (Endpoints.ApiServiceDescriptor descriptor) -> channel,
            OutboundObserverFactory.trivial());

    InboundDataClient readFuture =
        clientFactory.receive(
            apiServiceDescriptor,
            ENDPOINT_A,
            CODER,
            t -> {
              consumerInvoked.incrementAndGet();
              throw exceptionToThrow;
            });

    waitForClientToConnect.await();

    // The first message should cause a failure; afterwards, all other messages are dropped.
    outboundServerObserver.get().onNext(ELEMENTS_A_1);
    outboundServerObserver.get().onNext(ELEMENTS_A_2);

    try {
      readFuture.awaitCompletion();
      fail("Expected channel to fail");
    } catch (ExecutionException e) {
      assertEquals(exceptionToThrow, e.getCause());
    }
    // The server should not have received any values
    assertThat(inboundServerValues, empty());
    // The consumer should have only been invoked once
    assertEquals(1, consumerInvoked.get());
  } finally {
    server.shutdownNow();
  }
}
 
Example 13
Source File: BigQueryIOReadTest.java    From beam with Apache License 2.0
@Test
public void testBigQueryQuerySourceInitSplit() throws Exception {

  PipelineOptions options = PipelineOptionsFactory.create();
  BigQueryOptions bqOptions = options.as(BigQueryOptions.class);
  bqOptions.setProject("project");

  TableReference sourceTableRef = BigQueryHelpers.parseTableSpec("project:dataset.table");

  fakeDatasetService.createDataset(
      sourceTableRef.getProjectId(),
      sourceTableRef.getDatasetId(),
      "asia-northeast1",
      "Fake plastic tree^H^H^H^Htables",
      null);

  fakeDatasetService.createTable(
      new Table().setTableReference(sourceTableRef).setLocation("asia-northeast1"));

  Table queryResultTable =
      new Table()
          .setSchema(
              new TableSchema()
                  .setFields(
                      ImmutableList.of(
                          new TableFieldSchema().setName("name").setType("STRING"),
                          new TableFieldSchema().setName("number").setType("INTEGER"))));

  List<TableRow> expected =
      ImmutableList.of(
          new TableRow().set("name", "a").set("number", 1L),
          new TableRow().set("name", "b").set("number", 2L),
          new TableRow().set("name", "c").set("number", 3L),
          new TableRow().set("name", "d").set("number", 4L),
          new TableRow().set("name", "e").set("number", 5L),
          new TableRow().set("name", "f").set("number", 6L));

  String encodedQuery = FakeBigQueryServices.encodeQueryResult(queryResultTable, expected);

  String stepUuid = "testStepUuid";

  TableReference tempTableReference =
      createTempTableReference(
          bqOptions.getProject(),
          createJobIdToken(options.getJobName(), stepUuid),
          Optional.empty());

  fakeJobService.expectDryRunQuery(
      bqOptions.getProject(),
      encodedQuery,
      new JobStatistics()
          .setQuery(
              new JobStatistics2()
                  .setTotalBytesProcessed(100L)
                  .setReferencedTables(ImmutableList.of(sourceTableRef, tempTableReference))));

  BoundedSource<TableRow> bqSource =
      BigQueryQuerySourceDef.create(
              fakeBqServices,
              ValueProvider.StaticValueProvider.of(encodedQuery),
              true /* flattenResults */,
              true /* useLegacySql */,
              QueryPriority.BATCH,
              null,
              null,
              null)
          .toSource(stepUuid, TableRowJsonCoder.of(), BigQueryIO.TableRowParser.INSTANCE);

  options.setTempLocation(testFolder.getRoot().getAbsolutePath());

  List<TableRow> read =
      convertStringsToLong(
          SourceTestUtils.readFromSplitsOfSource(bqSource, 0L /* ignored */, options));
  assertThat(read, containsInAnyOrder(Iterables.toArray(expected, TableRow.class)));

  List<? extends BoundedSource<TableRow>> sources = bqSource.split(100, options);
  assertEquals(2, sources.size());
}
 
Example 14
Source File: ProcessBundleHandlerTest.java    From beam with Apache License 2.0
@Test
public void testBundleProcessorIsResetWhenAddedBackToCache() throws Exception {
  BeamFnApi.ProcessBundleDescriptor processBundleDescriptor =
      BeamFnApi.ProcessBundleDescriptor.newBuilder()
          .putTransforms(
              "2L",
              RunnerApi.PTransform.newBuilder()
                  .setSpec(RunnerApi.FunctionSpec.newBuilder().setUrn(DATA_INPUT_URN).build())
                  .build())
          .build();
  Map<String, Message> fnApiRegistry = ImmutableMap.of("1L", processBundleDescriptor);

  ProcessBundleHandler handler =
      new ProcessBundleHandler(
          PipelineOptionsFactory.create(),
          fnApiRegistry::get,
          beamFnDataClient,
          null /* beamFnStateGrpcClientCache */,
          null /* finalizeBundleHandler */,
          ImmutableMap.of(
              DATA_INPUT_URN,
              (pipelineOptions,
                  beamFnDataClient,
                  beamFnStateClient,
                  beamFnTimerClient,
                  pTransformId,
                  pTransform,
                  processBundleInstructionId,
                  pCollections,
                  coders,
                  windowingStrategies,
                  pCollectionConsumerRegistry,
                  startFunctionRegistry,
                  finishFunctionRegistry,
                  addProgressRequestCallback,
                  addTearDownFunction,
                  splitListener,
                  bundleFinalizer) -> null),
          new TestBundleProcessorCache());

  assertThat(TestBundleProcessor.resetCnt, equalTo(0));

  handler.processBundle(
      BeamFnApi.InstructionRequest.newBuilder()
          .setInstructionId("998L")
          .setProcessBundle(
              BeamFnApi.ProcessBundleRequest.newBuilder().setProcessBundleDescriptorId("1L"))
          .build());

  // Check that BundleProcessor is reset when added back to the cache
  assertThat(TestBundleProcessor.resetCnt, equalTo(1));

  // BundleProcessor is added back to the BundleProcessorCache
  assertThat(handler.bundleProcessorCache.getCachedBundleProcessors().size(), equalTo(1));
  assertThat(
      handler.bundleProcessorCache.getCachedBundleProcessors().get("1L").size(), equalTo(1));
}
 
Example 15
Source File: ExperimentContextTest.java    From beam with Apache License 2.0
@Test
public void testUnsetExperimentsInPipelineOptions() {
  PipelineOptions options = PipelineOptionsFactory.create();
  ExperimentContext ec = ExperimentContext.parseFrom(options);
  assertFalse(ec.isEnabled(Experiment.IntertransformIO));
}
 
Example 16
Source File: BeamFnDataGrpcClientTest.java    From beam with Apache License 2.0
@Test
public void testForInboundConsumer() throws Exception {
  CountDownLatch waitForClientToConnect = new CountDownLatch(1);
  Collection<WindowedValue<String>> inboundValuesA = new ConcurrentLinkedQueue<>();
  Collection<WindowedValue<String>> inboundValuesB = new ConcurrentLinkedQueue<>();
  Collection<BeamFnApi.Elements> inboundServerValues = new ConcurrentLinkedQueue<>();
  AtomicReference<StreamObserver<BeamFnApi.Elements>> outboundServerObserver =
      new AtomicReference<>();
  CallStreamObserver<BeamFnApi.Elements> inboundServerObserver =
      TestStreams.withOnNext(inboundServerValues::add).build();

  Endpoints.ApiServiceDescriptor apiServiceDescriptor =
      Endpoints.ApiServiceDescriptor.newBuilder()
          .setUrl(this.getClass().getName() + "-" + UUID.randomUUID().toString())
          .build();
  Server server =
      InProcessServerBuilder.forName(apiServiceDescriptor.getUrl())
          .addService(
              new BeamFnDataGrpc.BeamFnDataImplBase() {
                @Override
                public StreamObserver<BeamFnApi.Elements> data(
                    StreamObserver<BeamFnApi.Elements> outboundObserver) {
                  outboundServerObserver.set(outboundObserver);
                  waitForClientToConnect.countDown();
                  return inboundServerObserver;
                }
              })
          .build();
  server.start();
  try {
    ManagedChannel channel =
        InProcessChannelBuilder.forName(apiServiceDescriptor.getUrl()).build();

    BeamFnDataGrpcClient clientFactory =
        new BeamFnDataGrpcClient(
            PipelineOptionsFactory.create(),
            (Endpoints.ApiServiceDescriptor descriptor) -> channel,
            OutboundObserverFactory.trivial());

    InboundDataClient readFutureA =
        clientFactory.receive(apiServiceDescriptor, ENDPOINT_A, CODER, inboundValuesA::add);

    waitForClientToConnect.await();
    outboundServerObserver.get().onNext(ELEMENTS_A_1);
    // Purposefully transmit some data before the consumer for B is bound, showing that
    // data is not lost.
    outboundServerObserver.get().onNext(ELEMENTS_B_1);
    Thread.sleep(100);

    InboundDataClient readFutureB =
        clientFactory.receive(apiServiceDescriptor, ENDPOINT_B, CODER, inboundValuesB::add);

    // Show that out of order stream completion can occur.
    readFutureB.awaitCompletion();
    assertThat(inboundValuesB, contains(valueInGlobalWindow("JKL"), valueInGlobalWindow("MNO")));

    outboundServerObserver.get().onNext(ELEMENTS_A_2);
    readFutureA.awaitCompletion();
    assertThat(
        inboundValuesA,
        contains(
            valueInGlobalWindow("ABC"), valueInGlobalWindow("DEF"), valueInGlobalWindow("GHI")));
  } finally {
    server.shutdownNow();
  }
}
 
Example 17
Source File: BigQueryIOReadTest.java    From beam with Apache License 2.0
/**
 * This test simulates the scenario where the SQL text which is executed by the query job doesn't
 * by itself refer to any tables (e.g. "SELECT 17 AS value"), and thus there are no referenced
 * tables when the dry run of the query is performed.
 */
@Test
public void testBigQueryQuerySourceInitSplit_NoReferencedTables() throws Exception {

  PipelineOptions options = PipelineOptionsFactory.create();
  BigQueryOptions bqOptions = options.as(BigQueryOptions.class);
  bqOptions.setProject("project");

  Table queryResultTable =
      new Table()
          .setSchema(
              new TableSchema()
                  .setFields(
                      ImmutableList.of(
                          new TableFieldSchema().setName("name").setType("STRING"),
                          new TableFieldSchema().setName("number").setType("INTEGER"))));

  List<TableRow> expected =
      ImmutableList.of(
          new TableRow().set("name", "a").set("number", 1L),
          new TableRow().set("name", "b").set("number", 2L),
          new TableRow().set("name", "c").set("number", 3L),
          new TableRow().set("name", "d").set("number", 4L),
          new TableRow().set("name", "e").set("number", 5L),
          new TableRow().set("name", "f").set("number", 6L));

  String encodedQuery = FakeBigQueryServices.encodeQueryResult(queryResultTable, expected);

  String stepUuid = "testStepUuid";

  fakeJobService.expectDryRunQuery(
      bqOptions.getProject(),
      encodedQuery,
      new JobStatistics()
          .setQuery(
              new JobStatistics2()
                  .setTotalBytesProcessed(100L)
                  .setReferencedTables(ImmutableList.of())));

  BoundedSource<TableRow> bqSource =
      BigQueryQuerySourceDef.create(
              fakeBqServices,
              ValueProvider.StaticValueProvider.of(encodedQuery),
              true /* flattenResults */,
              true /* useLegacySql */,
              QueryPriority.BATCH,
              null,
              null,
              null)
          .toSource(stepUuid, TableRowJsonCoder.of(), BigQueryIO.TableRowParser.INSTANCE);

  options.setTempLocation(testFolder.getRoot().getAbsolutePath());

  List<TableRow> read =
      convertStringsToLong(
          SourceTestUtils.readFromSplitsOfSource(bqSource, 0L /* ignored */, options));
  assertThat(read, containsInAnyOrder(Iterables.toArray(expected, TableRow.class)));

  List<? extends BoundedSource<TableRow>> sources = bqSource.split(100, options);
  assertEquals(2, sources.size());
}
 
Example 18
Source File: IntrinsicMapTaskExecutorTest.java    From beam with Apache License 2.0
/** Verify counts for the per-element-output-time counter are correct. */
@Test
public void testPerElementProcessingTimeCounters() throws Exception {
  PipelineOptions options = PipelineOptionsFactory.create();
  options
      .as(DataflowPipelineDebugOptions.class)
      .setExperiments(
          Lists.newArrayList(DataflowElementExecutionTracker.TIME_PER_ELEMENT_EXPERIMENT));
  DataflowExecutionStateTracker stateTracker =
      new DataflowExecutionStateTracker(
          ExecutionStateSampler.newForTest(),
          new TestDataflowExecutionState(
              NameContext.forStage("test-stage"),
              "other",
              null /* requestingStepName */,
              null /* sideInputIndex */,
              null /* metricsContainer */,
              NoopProfileScope.NOOP),
          counterSet,
          options,
          "test-work-item-id");
  NameContext parDoName = nameForStep("s1");

  // Wire a read operation with 3 elements to a ParDoOperation and assert that we count
  // the correct number of elements.
  ReadOperation read =
      ReadOperation.forTest(
          new TestReader("a", "b", "c"),
          new OutputReceiver(),
          TestOperationContext.create(counterSet, nameForStep("s0"), null, stateTracker));
  ParDoOperation parDo =
      new ParDoOperation(
          new NoopParDoFn(),
          new OutputReceiver[0],
          TestOperationContext.create(counterSet, parDoName, null, stateTracker));
  parDo.attachInput(read, 0);
  List<Operation> operations = Lists.newArrayList(read, parDo);

  try (IntrinsicMapTaskExecutor executor =
      IntrinsicMapTaskExecutor.withSharedCounterSet(operations, counterSet, stateTracker)) {
    executor.execute();
  }

  CounterName counterName =
      CounterName.named("per-element-processing-time").withOriginalName(parDoName);
  Counter<Long, CounterDistribution> counter =
      (Counter<Long, CounterDistribution>) counterSet.getExistingCounter(counterName);

  assertThat(counter.getAggregate().getCount(), equalTo(3L));
}
 
Example 19
Source File: ShuffleSinkTest.java    From beam with Apache License 2.0
void runTestWriteGroupingSortingShuffleSink(List<KV<Integer, KV<String, Integer>>> expected)
    throws Exception {
  BatchModeExecutionContext executionContext =
      BatchModeExecutionContext.forTesting(PipelineOptionsFactory.create(), "STAGE");
  ShuffleSink<KV<Integer, KV<String, Integer>>> shuffleSink =
      new ShuffleSink<>(
          PipelineOptionsFactory.create(),
          null,
          ShuffleSink.ShuffleKind.GROUP_KEYS_AND_SORT_VALUES,
          WindowedValue.getFullCoder(
              KvCoder.of(
                  BigEndianIntegerCoder.of(),
                  KvCoder.of(StringUtf8Coder.of(), BigEndianIntegerCoder.of())),
              new GlobalWindows().windowCoder()),
          executionContext,
          TestOperationContext.create());

  TestShuffleWriter shuffleWriter = new TestShuffleWriter();
  List<Long> actualSizes = new ArrayList<>();
  try (Sink.SinkWriter<WindowedValue<KV<Integer, KV<String, Integer>>>> shuffleSinkWriter =
      shuffleSink.writer(shuffleWriter, "dataset")) {
    for (KV<Integer, KV<String, Integer>> kv : expected) {
      actualSizes.add(shuffleSinkWriter.add(WindowedValue.valueInGlobalWindow(kv)));
    }
  }

  List<ShuffleEntry> records = shuffleWriter.getRecords();

  List<KV<Integer, KV<String, Integer>>> actual = new ArrayList<>();
  for (ShuffleEntry record : records) {
    byte[] keyBytes = record.getKey();
    byte[] valueBytes = record.getValue();
    byte[] sortKeyBytes = record.getSecondaryKey();

    Integer key = CoderUtils.decodeFromByteArray(BigEndianIntegerCoder.of(), keyBytes);
    ByteArrayInputStream bais = new ByteArrayInputStream(sortKeyBytes);
    String sortKey = StringUtf8Coder.of().decode(bais);
    Integer sortValue = CoderUtils.decodeFromByteArray(BigEndianIntegerCoder.of(), valueBytes);

    actual.add(KV.of(key, KV.of(sortKey, sortValue)));
  }

  Assert.assertEquals(expected, actual);
  Assert.assertEquals(shuffleWriter.getSizes(), actualSizes);
}
 
Example 20
Source File: BigQueryServicesImplTest.java    From beam with Apache License 2.0
/** Tests that {@link DatasetServiceImpl#insertAll} fails gracefully when errors persist. */
@Test
public void testInsertFailsGracefully() throws Exception {
  TableReference ref =
      new TableReference().setProjectId("project").setDatasetId("dataset").setTableId("table");
  List<ValueInSingleWindow<TableRow>> rows =
      ImmutableList.of(wrapValue(new TableRow()), wrapValue(new TableRow()));

  final TableDataInsertAllResponse row1Failed =
      new TableDataInsertAllResponse()
          .setInsertErrors(ImmutableList.of(new InsertErrors().setIndex(1L)));

  final TableDataInsertAllResponse row0Failed =
      new TableDataInsertAllResponse()
          .setInsertErrors(ImmutableList.of(new InsertErrors().setIndex(0L)));

  when(response.getContentType()).thenReturn(Json.MEDIA_TYPE);
  // Always return 200.
  when(response.getStatusCode()).thenReturn(200);
  // Return row 1 failing, then we retry row 1 as row 0, and row 0 persistently fails.
  when(response.getContent())
      .thenReturn(toStream(row1Failed))
      .thenAnswer(invocation -> toStream(row0Failed));

  DatasetServiceImpl dataService =
      new DatasetServiceImpl(bigquery, PipelineOptionsFactory.create());

  // Expect it to fail.
  try {
    dataService.insertAll(
        ref,
        rows,
        null,
        BackOffAdapter.toGcpBackOff(TEST_BACKOFF.backoff()),
        new MockSleeper(),
        InsertRetryPolicy.alwaysRetry(),
        null,
        null,
        false,
        false,
        false);
    fail();
  } catch (IOException e) {
    assertThat(e, instanceOf(IOException.class));
    assertThat(e.getMessage(), containsString("Insert failed:"));
    assertThat(e.getMessage(), containsString("[{\"index\":0}]"));
  }

  // Verify the exact number of retries as well as log messages.
  verify(response, times(4)).getStatusCode();
  verify(response, times(4)).getContent();
  verify(response, times(4)).getContentType();
  expectedLogs.verifyInfo("Retrying 1 failed inserts to BigQuery");
}