Java Code Examples for org.apache.beam.sdk.PipelineResult

The following examples show how to use org.apache.beam.sdk.PipelineResult. They are extracted from open source projects; where available, the source project, file, and license are noted above each example.
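
Before the project examples, here is a minimal, self-contained sketch of typical PipelineResult usage: running a pipeline, blocking until it terminates, and then inspecting its final state and metrics. The class name and the trivial Create transform below are illustrative placeholders, not code from any of the listed projects.

import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.PipelineResult;
import org.apache.beam.sdk.metrics.MetricQueryResults;
import org.apache.beam.sdk.metrics.MetricResult;
import org.apache.beam.sdk.metrics.MetricsFilter;
import org.apache.beam.sdk.options.PipelineOptions;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.transforms.Create;

public class PipelineResultSketch {

  public static void main(String[] args) {
    PipelineOptions options = PipelineOptionsFactory.fromArgs(args).withValidation().create();
    Pipeline pipeline = Pipeline.create(options);

    // A trivial placeholder transform; a real pipeline would read, process, and write data.
    pipeline.apply("CreateValues", Create.of("a", "b", "c"));

    // run() submits the pipeline and returns a PipelineResult handle.
    PipelineResult result = pipeline.run();

    // waitUntilFinish() blocks until the pipeline terminates and returns the final state
    // (e.g. DONE, FAILED, CANCELLED).
    PipelineResult.State state = result.waitUntilFinish();
    System.out.println("Pipeline finished in state: " + state);

    // metrics() exposes counters, distributions, and gauges reported during execution.
    MetricQueryResults metrics = result.metrics().queryMetrics(MetricsFilter.builder().build());
    for (MetricResult<Long> counter : metrics.getCounters()) {
      System.out.println(counter.getName() + " attempted: " + counter.getAttempted());
    }
  }
}

Many of the examples that follow use the same handful of calls: run(), waitUntilFinish() (with or without a timeout), getState(), cancel(), and metrics().
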
Example 1
Source Project: hop   Source File: BeamPipelineEngine.java    License: Apache License 2.0
private PipelineResult executePipeline( org.apache.beam.sdk.Pipeline pipeline ) throws HopException {
    // Dispatch to the Beam runner selected in the run configuration.

    RunnerType runnerType = beamEngineRunConfiguration.getRunnerType();
    switch ( runnerType ) {
      case Direct:
        return DirectRunner.fromOptions( pipeline.getOptions() ).run( pipeline );
      case Flink:
        return FlinkRunner.fromOptions( pipeline.getOptions() ).run( pipeline );
      case DataFlow:
        return DataflowRunner.fromOptions( pipeline.getOptions() ).run( pipeline );
      case Spark:
        return SparkRunner.fromOptions( pipeline.getOptions() ).run( pipeline );
      default:
        throw new HopException( "Execution on runner '" + runnerType.name() + "' is not supported yet." );
    }
  }
 
Example 2
Source Project: beam   Source File: MongoDBIOIT.java    License: Apache License 2.0
private void collectAndPublishMetrics(PipelineResult writeResult, PipelineResult readResult) {
  String uuid = UUID.randomUUID().toString();
  String timestamp = Timestamp.now().toString();

  Set<Function<MetricsReader, NamedTestResult>> readSuppliers = getReadSuppliers(uuid, timestamp);
  Set<Function<MetricsReader, NamedTestResult>> writeSuppliers =
      getWriteSuppliers(uuid, timestamp);
  IOITMetrics readMetrics =
      new IOITMetrics(readSuppliers, readResult, NAMESPACE, uuid, timestamp);
  IOITMetrics writeMetrics =
      new IOITMetrics(writeSuppliers, writeResult, NAMESPACE, uuid, timestamp);
  readMetrics.publish(bigQueryDataset, bigQueryTable);
  readMetrics.publishToInflux(settings);
  writeMetrics.publish(bigQueryDataset, bigQueryTable);
  writeMetrics.publishToInflux(settings);
}
 
Example 3
Source Project: component-runtime   Source File: TalendIOTest.java    License: Apache License 2.0
@Test
public void processorMulti() {
    final PCollection<SampleLength> out = pipeline
            .apply(Create.of(new Sample("a"), new Sample("bb")).withCoder(JsonbCoder.of(Sample.class, PLUGIN)))
            .apply(UUID.randomUUID().toString(), toRecord())
            .setCoder(SchemaRegistryCoder.of())
            .apply(new ViewsMappingTransform(emptyMap(), PLUGIN))
            .apply(TalendFn.asFn(new BaseTestProcessor() {

                @Override
                public void onNext(final InputFactory input, final OutputFactory factory) {
                    final Object read = input.read(Branches.DEFAULT_BRANCH);
                    factory
                            .create(Branches.DEFAULT_BRANCH)
                            .emit(new Sample(Record.class.cast(read).getString("data")));
                }
            }))
            .apply(toSampleLength());
    PAssert.that(out.apply(UUID.randomUUID().toString(), toInt())).containsInAnyOrder(1, 2);
    assertEquals(PipelineResult.State.DONE, pipeline.run().getState());
}
 
Example 4
Source Project: components   Source File: BigQueryDatasetRuntime.java    License: Apache License 2.0
public void getSampleDeprecated(int limit, Consumer<IndexedRecord> consumer) {
    // Create a pipeline using the input component to get records.
    DirectOptions options = BeamLocalRunnerOption.getOptions();
    final Pipeline p = Pipeline.create(options);

    // Create an input runtime based on the properties.
    BigQueryInputRuntime inputRuntime = new BigQueryInputRuntime();
    BigQueryInputProperties inputProperties = new BigQueryInputProperties(null);
    inputProperties.init();
    inputProperties.setDatasetProperties(properties);
    inputRuntime.initialize(new BeamJobRuntimeContainer(options), inputProperties);

    try (DirectConsumerCollector<IndexedRecord> collector = DirectConsumerCollector.of(consumer)) {
        // Collect a sample of the input records.
        p
                .apply(inputRuntime) //
                .apply(Sample.<IndexedRecord> any(limit))
                .apply(collector);
        PipelineResult pr = p.run();
        pr.waitUntilFinish();
    }
}
 
Example 5
@Test(timeout = 30000)
public void canReadPubsubInput() throws Exception {
  List<String> inputLines = Lines.resources("testdata/basic-messages-nonempty.ndjson");
  publishLines(inputLines);

  pipeline.getOptions().as(DirectOptions.class).setBlockOnRun(false);

  SinkOptions.Parsed sinkOptions = pipeline.getOptions().as(SinkOptions.Parsed.class);
  sinkOptions.setInput(pipeline.newProvider(subscriptionName.toString()));

  PCollection<String> output = pipeline.apply(InputType.pubsub.read(sinkOptions))
      .apply("encodeJson", OutputFileFormat.json.encode());

  PAssert.that(output).containsInAnyOrder(inputLines);

  // This runs in the background and returns immediately due to setBlockOnRun above.
  PipelineResult result = pipeline.run();

  // The wait here is determined empirically; it's not entirely clear why it takes this long.
  System.err.println("Waiting 15 seconds to make sure we've processed all messages...");
  result.waitUntilFinish(Duration.millis(15000));
  System.err.println("Done waiting; now cancelling the pipeline so the test can finish.");
  result.cancel();
}
 
Example 6
@Test(timeout = 30000)
public void canSendPubsubOutput() throws Exception {
  final List<String> inputLines = Lines.resources("testdata/pubsub-integration/input.ndjson");

  pipeline.getOptions().as(DirectOptions.class).setBlockOnRun(false);

  SinkOptions.Parsed sinkOptions = pipeline.getOptions().as(SinkOptions.Parsed.class);
  sinkOptions.setOutput(pipeline.newProvider(topicName.toString()));
  // We would normally use pipeline.newProvider instead of StaticValueProvider in tests,
  // but something about this configuration causes the pipeline to stall when CompressPayload
  // accesses a method on the underlying enum value when defined via pipeline.newProvider.
  sinkOptions.setOutputPubsubCompression(StaticValueProvider.of(Compression.UNCOMPRESSED));

  pipeline.apply(Create.of(inputLines)).apply(InputFileFormat.json.decode())
      .apply(OutputType.pubsub.write(sinkOptions));

  final PipelineResult result = pipeline.run();

  System.err.println("Waiting for subscriber to receive messages published in the pipeline...");
  List<String> expectedLines = Lines.resources("testdata/pubsub-integration/truncated.ndjson");
  List<String> received = receiveLines(expectedLines.size());
  assertThat(received, matchesInAnyOrder(expectedLines));
  result.cancel();
}
 
Example 7
@Test(timeout = 30000)
public void canSendGzippedPayloads() throws Exception {
  final List<String> inputLines = Lines.resources("testdata/pubsub-integration/input.ndjson");

  pipeline.getOptions().as(DirectOptions.class).setBlockOnRun(false);

  SinkOptions sinkOptions = pipeline.getOptions().as(SinkOptions.class);
  sinkOptions.setOutputType(OutputType.pubsub);
  sinkOptions.setOutput(pipeline.newProvider(topicName.toString()));
  SinkOptions.Parsed options = SinkOptions.parseSinkOptions(sinkOptions);

  pipeline.apply(Create.of(inputLines)).apply(InputFileFormat.json.decode())
      .apply(options.getOutputType().write(options));

  final PipelineResult result = pipeline.run();

  System.err.println("Waiting for subscriber to receive messages published in the pipeline...");
  List<String> expectedLines = Lines.resources("testdata/pubsub-integration/gzipped.ndjson");
  List<String> received = receiveLines(expectedLines.size());
  assertThat(received, matchesInAnyOrder(expectedLines));
  result.cancel();
}
 
Example 8
Source Project: beam   Source File: SpannerWriteIT.java    License: Apache License 2.0
@Test
public void testReportFailures() throws Exception {
  int numRecords = 100;
  p.apply(GenerateSequence.from(0).to(2 * numRecords))
      .apply(ParDo.of(new GenerateMutations(options.getTable(), new DivBy2())))
      .apply(
          SpannerIO.write()
              .withProjectId(project)
              .withInstanceId(options.getInstanceId())
              .withDatabaseId(databaseName)
              .withFailureMode(SpannerIO.FailureMode.REPORT_FAILURES));

  PipelineResult result = p.run();
  result.waitUntilFinish();
  assertThat(result.getState(), is(PipelineResult.State.DONE));
  assertThat(countNumberOfRecords(), equalTo((long) numRecords));
}
 
Example 9
Source Project: DataflowTemplates   Source File: ParquetToBigtable.java    License: Apache License 2.0
public static PipelineResult run(Options options) {
  Pipeline pipeline = Pipeline.create(options);

  BigtableIO.Write write =
          BigtableIO.write()
                  .withProjectId(options.getBigtableProjectId())
                  .withInstanceId(options.getBigtableInstanceId())
                  .withTableId(options.getBigtableTableId());

  /**
   * Steps: 1) Read records from Parquet File. 2) Convert a GenericRecord to a
   * KV<ByteString,Iterable<Mutation>>. 3) Write KV to Bigtable's table.
   */
  pipeline
      .apply(
          "Read from Parquet",
          ParquetIO.read(BigtableRow.getClassSchema()).from(options.getInputFilePattern()))
      .apply(
          "Transform to Bigtable",
          ParDo.of(
              ParquetToBigtableFn.createWithSplitLargeRows(
                  options.getSplitLargeRows(), MAX_MUTATIONS_PER_ROW)))
      .apply("Write to Bigtable", write);

  return pipeline.run();
}
 
Example 10
Source Project: beam   Source File: DirectRunnerTest.java    License: Apache License 2.0
@Test
public void testWaitUntilFinishTimeout() throws Exception {
  DirectOptions options = PipelineOptionsFactory.as(DirectOptions.class);
  options.setBlockOnRun(false);
  options.setRunner(DirectRunner.class);
  Pipeline p = Pipeline.create(options);
  p.apply(Create.of(1L))
      .apply(
          ParDo.of(
              new DoFn<Long, Long>() {
                @ProcessElement
                public void hang(ProcessContext context) throws InterruptedException {
                  // Hangs "forever"
                  Thread.sleep(Long.MAX_VALUE);
                }
              }));
  PipelineResult result = p.run();
  // The pipeline should never complete;
  assertThat(result.getState(), is(State.RUNNING));
  // Must time out, otherwise this test will never complete
  result.waitUntilFinish(Duration.millis(1L));
  assertEquals(null, result.getState());
}
 
Example 11
Source Project: component-runtime   Source File: IndexedRecordToJsonTest.java    License: Apache License 2.0
@Test
public void test() {
    PAssert
            .that(pipeline
                    .apply(Create
                            .of(newIndexedRecord("first"), newIndexedRecord("second"))
                            .withCoder(AvroCoder.of(IndexedRecord.class, getSchema())))
                    .apply(new IndexedRecordToJson()))
            .satisfies(values -> {
                assertEquals(asList("first", "second"),
                        StreamSupport
                                .stream(values.spliterator(), false)
                                .map(k -> k.getString("name"))
                                .sorted()
                                .collect(toList()));
                return null;
            });
    assertEquals(PipelineResult.State.DONE, pipeline.run().waitUntilFinish());
}
 
Example 12
Source Project: DataflowTemplates   Source File: TextToPubsub.java    License: Apache License 2.0
/**
 * Executes the pipeline with the provided execution
 * parameters.
 *
 * @param options The execution parameters.
 */
public static PipelineResult run(Options options) {
  // Create the pipeline.
  Pipeline pipeline = Pipeline.create(options);

  /*
   * Steps:
   *  1) Read from the text source.
   *  2) Write each text record to Pub/Sub
   */
  pipeline
      .apply("Read Text Data", TextIO.read().from(options.getInputFilePattern()))
      .apply("Write to PubSub", PubsubIO.writeStrings().to(options.getOutputTopic()));

  return pipeline.run();
}
 
Example 13
Source Project: component-runtime   Source File: TalendIOTest.java    License: Apache License 2.0
@Test
public void output() {
    Output.DATA.clear();
    pipeline
            .apply(Create.of(new Sample("a"), new Sample("b")).withCoder(JsonbCoder.of(Sample.class, PLUGIN)))
            .apply(UUID.randomUUID().toString(), toRecord())
            .setCoder(SchemaRegistryCoder.of())
            .apply(new ViewsMappingTransform(emptyMap(), PLUGIN))
            .apply(TalendIO.write(new BaseTestProcessor() {

                @Override
                public void onNext(final InputFactory input, final OutputFactory factory) {
                    final Object read = input.read(Branches.DEFAULT_BRANCH);
                    Output.DATA.add(Record.class.cast(read).getString("data"));
                }
            }));
    assertEquals(PipelineResult.State.DONE, pipeline.run().getState());
    assertThat(Output.DATA, containsInAnyOrder("a", "b"));
}
 
Example 14
Source Project: DataflowTemplates   Source File: WordCountTest.java    License: Apache License 2.0
@Test
@Category(NeedsRunner.class)
public void testWordCountSimple() {
  PCollection<KV<String, Long>> pc =
      pipeline.apply(Create.of(INPUT_STRS)).apply(new CountWords());
  PAssert.that(pc).containsInAnyOrder(KV.of("hello", 2L), KV.of(("world"), 1L));
  PipelineResult result = pipeline.run();
  result.waitUntilFinish();

  Map<String, Long> expectedCounters = new HashMap<>();
  expectedCounters.put("emptyLines", 2L);
  for (MetricResult c :
      result.metrics().queryMetrics(MetricsFilter.builder().build()).getCounters()) {
    String name = c.getName().getName();
    if (expectedCounters.containsKey(name)) {
      assertEquals(expectedCounters.get(name), c.getCommitted());
      expectedCounters.remove(name);
    }
  }
  assertTrue(expectedCounters.isEmpty());
}
 
Example 15
Source Project: DataflowTemplates   Source File: ExportTimestampTest.java    License: Apache License 2.0
private void exportAndImportDbAtTime(String sourceDb, String destDb,
                                     String jobIdName, String ts,
                                     TestPipeline exportPipeline,
                                     TestPipeline importPipeline) {
  ValueProvider.StaticValueProvider<String> destination = ValueProvider.StaticValueProvider
      .of(tmpDir);
  ValueProvider.StaticValueProvider<String> jobId = ValueProvider.StaticValueProvider
      .of(jobIdName);
  ValueProvider.StaticValueProvider<String> source = ValueProvider.StaticValueProvider
      .of(tmpDir + "/" + jobIdName);
  ValueProvider.StaticValueProvider<String> timestamp = ValueProvider.StaticValueProvider.of(ts);
  SpannerConfig sourceConfig = spannerServer.getSpannerConfig(sourceDb);
  exportPipeline.apply("Export", new ExportTransform(sourceConfig, destination,
                                                     jobId, timestamp));
  PipelineResult exportResult = exportPipeline.run();
  exportResult.waitUntilFinish();

  SpannerConfig copyConfig = spannerServer.getSpannerConfig(destDb);
  importPipeline.apply("Import", new ImportTransform(
      copyConfig, source, ValueProvider.StaticValueProvider.of(true),
      ValueProvider.StaticValueProvider.of(true),
      ValueProvider.StaticValueProvider.of(true)));
  PipelineResult importResult = importPipeline.run();
  importResult.waitUntilFinish();
}
 
Example 16
Source Project: kettle-beam   Source File: KettleBeamPipelineExecutor.java    License: Apache License 2.0
private PipelineResult asyncExecutePipeline( Pipeline pipeline ) throws KettleException {

    RunnerType runnerType = RunnerType.getRunnerTypeByName( transMeta.environmentSubstitute( jobConfig.getRunnerTypeName() ) );
    if (runnerType==null) {
      throw new KettleException( "Runner type '"+jobConfig.getRunnerTypeName()+"' is not recognized");
    }
    switch ( runnerType ) {
      case Direct: return DirectRunner.fromOptions( pipeline.getOptions() ).run( pipeline );
      case Flink: return FlinkRunner.fromOptions(pipeline.getOptions()).run( pipeline );
      case DataFlow: return DataflowRunner.fromOptions( pipeline.getOptions() ).run( pipeline );
      case Spark: return SparkRunner.fromOptions( pipeline.getOptions() ).run( pipeline );
      default:
        throw new KettleException( "Execution on runner '" + runnerType.name() + "' is not supported yet, sorry." );
    }
  }
 
Example 17
Source Project: kettle-beam   Source File: KettleBeamPipelineExecutor.java    License: Apache License 2.0
private void logMetrics( PipelineResult pipelineResult ) {
  MetricResults metricResults = pipelineResult.metrics();

  logChannel.logBasic( "  ----------------- Metrics refresh @ " + new SimpleDateFormat( "yyyy/MM/dd HH:mm:ss" ).format( new Date() ) + " -----------------------" );

  MetricQueryResults allResults = metricResults.queryMetrics( MetricsFilter.builder().build() );
  for ( MetricResult<Long> result : allResults.getCounters() ) {
    logChannel.logBasic( "Name: " + result.getName() + " Attempted: " + result.getAttempted() );
  }
}
 
Example 18
Source Project: beam   Source File: KafkaIOIT.java    License: Apache License 2.0
private void cancelIfTimeouted(PipelineResult readResult, PipelineResult.State readState)
    throws IOException {

  // TODO(lgajowy) this solution works for dataflow only - it returns null when
  //  waitUntilFinish(Duration duration) exceeds provided duration.
  if (readState == null) {
    readResult.cancel();
  }
}
 
Example 19
Source Project: gcp-ingestion   Source File: Sink.java    License: Mozilla Public License 2.0
/**
 * Execute an Apache Beam pipeline and return the {@code PipelineResult}.
 *
 * @param args command line arguments
 */
public static PipelineResult run(String[] args) {
  registerOptions();
  final SinkOptions.Parsed options = SinkOptions.parseSinkOptions(
      PipelineOptionsFactory.fromArgs(args).withValidation().as(SinkOptions.class));

  return run(options);
}
 
Example 20
Source Project: beam   Source File: NonMergingGroupByKeyTest.java    License: Apache License 2.0
@Test
public void testEnabledReIterationDoesNotThrowAnException() {
  final Pipeline p = FlinkTestPipeline.createForBatch();
  p.getOptions().as(FlinkPipelineOptions.class).setReIterableGroupByKeyResult(true);
  p.apply(Create.of(Arrays.asList(KV.of("a", 1), KV.of("b", 2), KV.of("c", 3))))
      .apply(GroupByKey.create())
      .apply(ParDo.of(new ReiterateDoFn<>()));
  final PipelineResult.State state = p.run().waitUntilFinish();
  Assert.assertEquals(PipelineResult.State.DONE, state);
}
 
Example 21
Source Project: gcp-ingestion   Source File: Decoder.java    License: Mozilla Public License 2.0
/**
 * Execute an Apache Beam pipeline and return the {@code PipelineResult}.
 *
 * @param args command line arguments
 */
public static PipelineResult run(String[] args) {
  registerOptions(); // Defined in Sink.java
  final DecoderOptions.Parsed options = DecoderOptions.parseDecoderOptions(
      PipelineOptionsFactory.fromArgs(args).withValidation().as(DecoderOptions.class));
  return run(options);
}
 
Example 22
Source Project: gcp-ingestion   Source File: Republisher.java    License: Mozilla Public License 2.0
/**
 * Execute an Apache Beam pipeline and return the {@code PipelineResult}.
 *
 * @param args command line arguments
 */
public static PipelineResult run(String[] args) {
  registerOptions(); // Defined in Sink.java
  final RepublisherOptions.Parsed options = RepublisherOptions.parseRepublisherOptions(
      PipelineOptionsFactory.fromArgs(args).withValidation().as(RepublisherOptions.class));
  return run(options);
}
 
Example 23
Source Project: beam   Source File: MetricsTest.java    License: Apache License 2.0
@Test
@Category({NeedsRunner.class, UsesAttemptedMetrics.class, UsesCounterMetrics.class})
public void testUnboundedSourceMetrics() {
  long numElements = 1000;

  // Use withMaxReadTime to force unbounded mode.
  pipeline.apply(
      GenerateSequence.from(0).to(numElements).withMaxReadTime(Duration.standardDays(1)));

  PipelineResult pipelineResult = pipeline.run();

  MetricQueryResults metrics =
      pipelineResult
          .metrics()
          .queryMetrics(
              MetricsFilter.builder()
                  .addNameFilter(
                      MetricNameFilter.named(
                          ELEMENTS_READ.getNamespace(), ELEMENTS_READ.getName()))
                  .build());

  assertThat(
      metrics.getCounters(),
      hasItem(
          attemptedMetricsResult(
              ELEMENTS_READ.getNamespace(),
              ELEMENTS_READ.getName(),
              "Read(UnboundedCountingSource)",
              1000L)));
}
 
Example 24
Source Project: gcp-ingestion   Source File: IpPrivacyDecoder.java    License: Mozilla Public License 2.0
/**
 * Execute an Apache Beam pipeline and return the {@code PipelineResult}.
 */
public static PipelineResult run(IpPrivacyDecoderOptions.Parsed options) {
  final Pipeline pipeline = Pipeline.create(options);
  final List<PCollection<PubsubMessage>> errorCollections = new ArrayList<>();

  // We wrap pipeline in Optional for more convenience in chaining together transforms.
  Optional.of(pipeline) //
      .map(p -> p //
          .apply(options.getInputType().read(options)) //
          .apply(ParseUri.of()).failuresTo(errorCollections) //
          .apply("RestrictToMainPings",
              Filter
                  .by((message) -> "main".equals(message.getAttribute(Attribute.DOCUMENT_TYPE))))
          .apply(ParseProxy.of()) //
          .apply(ParseIp.of()) //
          .apply(GeoCityLookup.of(options.getGeoCityDatabase(), options.getGeoCityFilter())) //
          .apply(DecompressPayload.enabled(options.getDecompressInputPayloads())) //
          .apply(ExtractClientIdAndDropPayload.of()).failuresTo(errorCollections) //
          .apply(HashClientInfo.of(options.getClientIdHashKey(), options.getClientIpHashKey())) //
          .apply(NormalizeAttributes.of())) //
      .map(p -> p //
          .apply(RemoveAttributes.of()) //
          .apply(options.getOutputType().write(options)).failuresTo(errorCollections));

  // Write error output collections.
  PCollectionList.of(errorCollections) //
      .apply("FlattenErrorCollections", Flatten.pCollections()) //
      .apply("WriteErrorOutput", options.getErrorOutputType().write(options)) //
      .output();

  return pipeline.run();
}
 
Example 25
/**
 * Basic unit test using all default values (except for the path) on an in-memory DFS cluster.
 */
@Test
public void testBasicDefaultsUnbounded() throws IOException, URISyntaxException {
    String fileSpec = mini
            .getLocalFs()
            .getUri()
            .resolve(new Path(mini.newFolder().toString(), "output.csv").toUri())
            .toString();

    // Configure the component.
    SimpleFileIOOutputProperties props = createOutputComponentProperties();
    props.getDatasetProperties().path.setValue(fileSpec);

    // Create the runtime.
    SimpleFileIOOutputRuntime runtime = new SimpleFileIOOutputRuntime();
    runtime.initialize(null, props);

    // Use the runtime in a direct pipeline to test.
    final Pipeline p = beam.createPipeline();
    PCollection<IndexedRecord> input = p //
            .apply(GenerateSequence.from(0).withRate(10, Duration.millis(1000))) //
            .apply(ParDo.of(new GenerateDoFn()));
    input.apply(runtime);

    // And run the test.
    PipelineResult pr = p.run();

    // Check the expected values.
    mini.assertReadFile(mini.getLocalFs(), fileSpec, "1;one", "2;two");
}
 
Example 26
@Test
public void canWriteToDynamicTables() throws Exception {
  String table = "my_test_table";
  TableId tableId = TableId.of(dataset, table);

  bigquery.create(DatasetInfo.newBuilder(dataset).build());
  bigquery.create(TableInfo.newBuilder(tableId,
      StandardTableDefinition.of(Schema.of(Field.of("client_id", LegacySQLTypeName.STRING),
          Field.of("type", LegacySQLTypeName.STRING))))
      .build());

  String input = Resources
      .getResource("testdata/bigquery-integration/input-varied-doctypes.ndjson").getPath();
  String output = String.format("%s:%s.%s", projectId, dataset, "${document_type}_table");
  String errorOutput = outputPath + "/error/out";

  PipelineResult result = Sink.run(new String[] { "--inputFileFormat=json", "--inputType=file",
      "--input=" + input, "--outputType=bigquery", "--output=" + output,
      "--bqWriteMethod=streaming", "--errorOutputType=file", "--schemasLocation=schemas.tar.gz",
      "--errorOutputFileCompression=UNCOMPRESSED", "--errorOutput=" + errorOutput });

  result.waitUntilFinish();

  String tableSpec = String.format("%s.%s", dataset, table);
  assertThat(stringValuesQueryWithRetries("SELECT client_id FROM " + tableSpec),
      matchesInAnyOrder(ImmutableList.of("abc123")));

  List<String> errorOutputLines = Lines.files(outputPath + "/error/out*.ndjson");
  assertThat(errorOutputLines, Matchers.hasSize(2));
}
 
Example 27
@Test
public void canWriteViaFileLoads() throws Exception {
  String table = "my_test_table";
  TableId tableId = TableId.of(dataset, table);

  bigquery.create(DatasetInfo.newBuilder(dataset).build());
  bigquery
      .create(TableInfo
          .newBuilder(tableId,
              StandardTableDefinition
                  .of(Schema.of(Field.of("client_id", LegacySQLTypeName.STRING),
                      Field.of("type", LegacySQLTypeName.STRING),
                      Field.of("submission_timestamp", LegacySQLTypeName.TIMESTAMP)))
                  .toBuilder().setTimePartitioning(TIME_PARTITIONING).setClustering(CLUSTERING)
                  .build())
          .build());

  String input = Resources
      .getResource("testdata/bigquery-integration/input-varied-doctypes.ndjson").getPath();
  String output = String.format("%s:%s.%s", projectId, dataset, "${document_type}_table");
  String errorOutput = outputPath + "/error/out";

  PipelineResult result = Sink.run(new String[] { "--inputFileFormat=json", "--inputType=file",
      "--input=" + input, "--outputType=bigquery", "--output=" + output,
      "--bqWriteMethod=file_loads", "--errorOutputType=file",
      "--tempLocation=gs://gcp-ingestion-static-test-bucket/temp/bq-loads",
      "--schemasLocation=schemas.tar.gz", "--errorOutputFileCompression=UNCOMPRESSED",
      "--errorOutput=" + errorOutput });

  result.waitUntilFinish();

  String tableSpec = String.format("%s.%s", dataset, table);
  assertThat(stringValuesQueryWithRetries("SELECT client_id FROM " + tableSpec),
      matchesInAnyOrder(ImmutableList.of("abc123")));

  List<String> errorOutputLines = Lines.files(outputPath + "/error/out*.ndjson");
  assertThat(errorOutputLines, Matchers.hasSize(2));
}
 
Example 28
private void canWriteWithMixedMethod(String streamingDocTypes) throws Exception {
  String table = "my_test_table";
  TableId tableId = TableId.of(dataset, table);

  bigquery.create(DatasetInfo.newBuilder(dataset).build());
  bigquery
      .create(TableInfo
          .newBuilder(tableId,
              StandardTableDefinition
                  .of(Schema.of(Field.of("client_id", LegacySQLTypeName.STRING),
                      Field.of("type", LegacySQLTypeName.STRING),
                      Field.of("submission_timestamp", LegacySQLTypeName.TIMESTAMP)))
                  .toBuilder().setTimePartitioning(TIME_PARTITIONING).setClustering(CLUSTERING)
                  .build())
          .build());

  String input = Resources
      .getResource("testdata/bigquery-integration/input-varied-doctypes.ndjson").getPath();
  String output = String.format("%s:%s.%s", projectId, dataset, "${document_type}_table");
  String errorOutput = outputPath + "/error/out";

  PipelineResult result = Sink.run(new String[] { "--inputFileFormat=json", "--inputType=file",
      "--input=" + input, "--outputType=bigquery", "--output=" + output, "--bqWriteMethod=mixed",
      "--bqStreamingDocTypes=" + streamingDocTypes, "--errorOutputType=file",
      "--tempLocation=gs://gcp-ingestion-static-test-bucket/temp/bq-loads",
      "--schemasLocation=schemas.tar.gz", "--errorOutputFileCompression=UNCOMPRESSED",
      "--errorOutput=" + errorOutput });

  result.waitUntilFinish();

  String tableSpec = String.format("%s.%s", dataset, table);
  assertThat(stringValuesQueryWithRetries("SELECT client_id FROM " + tableSpec),
      matchesInAnyOrder(ImmutableList.of("abc123")));

  List<String> errorOutputLines = Lines.files(outputPath + "/error/out*.ndjson");
  assertThat(errorOutputLines, Matchers.hasSize(2));
}
 
Example 29
Source Project: beam   Source File: SparkPipelineResult.java    License: Apache License 2.0
@Override
protected State awaitTermination(final Duration duration)
    throws TimeoutException, ExecutionException, InterruptedException {
  if (duration.getMillis() > 0) {
    pipelineExecution.get(duration.getMillis(), TimeUnit.MILLISECONDS);
  } else {
    pipelineExecution.get();
  }
  return PipelineResult.State.DONE;
}
 
Example 30
Source Project: beam   Source File: TFRecordIOIT.java    License: Apache License 2.0
private static double getRunTime(
    final PipelineResult writeResults, final PipelineResult readResult) {
  final long startTime =
      MetricsReader.ofResults(writeResults, TFRECORD_NAMESPACE).getStartTimeMetric(WRITE_TIME);
  final long endTime =
      MetricsReader.ofResults(readResult, TFRECORD_NAMESPACE).getEndTimeMetric(READ_TIME);
  return (endTime - startTime) / 1e3;
}