Java Code Examples for org.apache.beam.sdk.PipelineResult#waitUntilFinish()

The following examples show how to use org.apache.beam.sdk.PipelineResult#waitUntilFinish(). You can go to the original project or source file by following the link above each example.
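Before the individual examples, the sketch below illustrates the two variants of the method under common usage, assuming a trivial pipeline: waitUntilFinish() blocks until the pipeline reaches a terminal state, while waitUntilFinish(Duration) returns after the given timeout, possibly with a non-terminal (or, on some runners, null) state, so the caller can decide whether to keep waiting or cancel. The class name and the elements passed to Create.of are placeholders, not taken from any of the projects below.

import java.io.IOException;

import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.PipelineResult;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.transforms.Create;
import org.joda.time.Duration;

public class WaitUntilFinishSketch {

  public static void main(String[] args) throws IOException {
    // Build a trivial pipeline; the applied transform is only a placeholder.
    Pipeline pipeline = Pipeline.create(PipelineOptionsFactory.fromArgs(args).create());
    pipeline.apply(Create.of("a", "b", "c"));

    PipelineResult result = pipeline.run();

    // Timed variant: wait at most 30 seconds. The returned state may be non-terminal
    // (or null on some runners) if the job has not finished yet.
    PipelineResult.State state = result.waitUntilFinish(Duration.standardSeconds(30));
    if (state == null || !state.isTerminal()) {
      // The no-argument waitUntilFinish() would block here until a terminal state;
      // this sketch cancels the job instead. cancel() may throw IOException.
      result.cancel();
    }
  }
}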
Example 1
Source File: WordCountTest.java    From DataflowTemplates with Apache License 2.0
@Test
@Category(NeedsRunner.class)
public void testWordCountSimple() {
  PCollection<KV<String, Long>> pc =
      pipeline.apply(Create.of(INPUT_STRS)).apply(new CountWords());
  PAssert.that(pc).containsInAnyOrder(KV.of("hello", 2L), KV.of(("world"), 1L));
  PipelineResult result = pipeline.run();
  result.waitUntilFinish();

  Map<String, Long> expectedCounters = new HashMap<>();
  expectedCounters.put("emptyLines", 2L);
  for (MetricResult c :
      result.metrics().queryMetrics(MetricsFilter.builder().build()).getCounters()) {
    String name = c.getName().getName();
    if (expectedCounters.containsKey(name)) {
      assertEquals(expectedCounters.get(name), c.getCommitted());
      expectedCounters.remove(name);
    }
  }
  assertTrue(expectedCounters.isEmpty());
}
 
Example 2
Source File: WordCountTimeOut1Sec.java    From incubator-nemo with Apache License 2.0
/**
 * Main function for the MR BEAM program.
 *
 * @param args arguments.
 */
public static void main(final String[] args) {
  final String inputFilePath = args[0];
  final String outputFilePath = args[1];
  final PipelineOptions options = NemoPipelineOptionsFactory.create();
  options.setJobName("WordCountTimeOut1Sec");

  final Pipeline p = generateWordCountPipeline(options, inputFilePath, outputFilePath);
  final PipelineResult pr = p.run();
  final PipelineResult.State running = pr.waitUntilFinish(org.joda.time.Duration.standardSeconds(1));
  try {
    final PipelineResult.State cancelled = pr.cancel();
  } catch (final IOException e) {
    LOG.info("IOException while cancelling job");
  }
}
 
Example 3
Source File: TestSamzaRunner.java    From beam with Apache License 2.0
@Override
public PipelineResult run(Pipeline pipeline) {
  try {
    final PipelineResult result = delegate.run(pipeline);
    result.waitUntilFinish();
    return result;
  } catch (Throwable t) {
    // Search for AssertionError. If present use it as the cause of the pipeline failure.
    Throwable current = t;

    while (current != null) {
      if (current instanceof AssertionError) {
        throw (AssertionError) current;
      }
      current = current.getCause();
    }

    throw t;
  }
}
 
Example 4
Source File: BigQueryDatasetRuntime.java    From components with Apache License 2.0
public void getSampleDeprecated(int limit, Consumer<IndexedRecord> consumer) {
    // Create a pipeline using the input component to get records.
    DirectOptions options = BeamLocalRunnerOption.getOptions();
    final Pipeline p = Pipeline.create(options);

    // Create an input runtime based on the properties.
    BigQueryInputRuntime inputRuntime = new BigQueryInputRuntime();
    BigQueryInputProperties inputProperties = new BigQueryInputProperties(null);
    inputProperties.init();
    inputProperties.setDatasetProperties(properties);
    inputRuntime.initialize(new BeamJobRuntimeContainer(options), inputProperties);

    try (DirectConsumerCollector<IndexedRecord> collector = DirectConsumerCollector.of(consumer)) {
        // Collect a sample of the input records.
        p
                .apply(inputRuntime) //
                .apply(Sample.<IndexedRecord> any(limit))
                .apply(collector);
        PipelineResult pr = p.run();
        pr.waitUntilFinish();
    }
}
 
Example 5
Source File: TextImportPipeline.java    From DataflowTemplates with Apache License 2.0
public static void main(String[] args) {

    Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);

    Pipeline p = Pipeline.create(options);

    SpannerConfig spannerConfig =
        SpannerConfig.create()
            .withHost(options.getSpannerHost())
            .withInstanceId(options.getInstanceId())
            .withDatabaseId(options.getDatabaseId());

    p.apply(new TextImportTransform(spannerConfig, options.getImportManifest()));

    PipelineResult result = p.run();
    if (options.getWaitUntilFinish()
        &&
        /* Only if template location is null, there is a dataflow job to wait for. Otherwise it's
         * template generation, which doesn't start a dataflow job.
         */
        options.as(DataflowPipelineOptions.class).getTemplateLocation() == null) {
      result.waitUntilFinish();
    }
}
 
Example 6
Source File: SpannerWriteIT.java    From beam with Apache License 2.0
@Test
public void testFailFast() throws Exception {
  thrown.expect(new StackTraceContainsString("SpannerException"));
  thrown.expect(new StackTraceContainsString("Value must not be NULL in table users"));
  int numRecords = 100;
  p.apply(GenerateSequence.from(0).to(2 * numRecords))
      .apply(ParDo.of(new GenerateMutations(options.getTable(), new DivBy2())))
      .apply(
          SpannerIO.write()
              .withProjectId(project)
              .withInstanceId(options.getInstanceId())
              .withDatabaseId(databaseName));

  PipelineResult result = p.run();
  result.waitUntilFinish();
}
 
Example 7
Source File: QueryTest.java    From beam with Apache License 2.0
/** Test {@code query} matches {@code model}. */
private <T extends KnownSize> void queryMatchesModel(
    String name,
    NexmarkQueryTransform<T> query,
    NexmarkQueryModel<T> model,
    boolean streamingMode) {
  NexmarkUtils.setupPipeline(NexmarkUtils.CoderStrategy.HAND, p);

  PCollection<Event> events =
      p.apply(
          name + ".Read",
          streamingMode
              ? NexmarkUtils.streamEventsSource(CONFIG)
              : NexmarkUtils.batchEventsSource(CONFIG));
  PCollection<TimestampedValue<T>> results =
      (PCollection<TimestampedValue<T>>) events.apply(new NexmarkQuery<>(CONFIG, query));
  PAssert.that(results).satisfies(model.assertionFor());
  PipelineResult result = p.run();
  result.waitUntilFinish();
}
 
Example 8
Source File: CopyDbTest.java    From DataflowTemplates with Apache License 2.0
private void runTest() {
  String tmpDirPath = tmpDir.getRoot().getAbsolutePath();
  ValueProvider.StaticValueProvider<String> destination = ValueProvider.StaticValueProvider
      .of(tmpDirPath);
  ValueProvider.StaticValueProvider<String> jobId = ValueProvider.StaticValueProvider
      .of("jobid");
  ValueProvider.StaticValueProvider<String> source = ValueProvider.StaticValueProvider
      .of(tmpDirPath + "/jobid");

  SpannerConfig sourceConfig = spannerServer.getSpannerConfig(sourceDb);
  exportPipeline.apply("Export", new ExportTransform(sourceConfig, destination, jobId));
  PipelineResult exportResult = exportPipeline.run();
  exportResult.waitUntilFinish();

  SpannerConfig destConfig = spannerServer.getSpannerConfig(destinationDb);
  importPipeline.apply(
      "Import",
      new ImportTransform(
          destConfig,
          source,
          ValueProvider.StaticValueProvider.of(true),
          ValueProvider.StaticValueProvider.of(true),
          ValueProvider.StaticValueProvider.of(true)));
  PipelineResult importResult = importPipeline.run();
  importResult.waitUntilFinish();

  PCollection<Long> mismatchCount =
      comparePipeline.apply("Compare", new CompareDatabases(sourceConfig, destConfig));
  PAssert.that(mismatchCount).satisfies((x) -> {
    assertEquals(Lists.newArrayList(x), Lists.newArrayList(0L));
    return null;
  });
  PipelineResult compareResult = comparePipeline.run();
  compareResult.waitUntilFinish();

  Ddl sourceDdl = readDdl(sourceDb);
  Ddl destinationDdl = readDdl(destinationDb);

  assertThat(sourceDdl.prettyPrint(), equalToIgnoringWhiteSpace(destinationDdl.prettyPrint()));
}
 
Example 9
Source File: ImportPipeline.java    From DataflowTemplates with Apache License 2.0
public static void main(String[] args) {

    Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);

    Pipeline p = Pipeline.create(options);

    SpannerConfig spannerConfig =
        SpannerConfig.create()
            .withProjectId(options.getSpannerProjectId())
            .withHost(options.getSpannerHost())
            .withInstanceId(options.getInstanceId())
            .withDatabaseId(options.getDatabaseId());

    p.apply(
        new ImportTransform(
            spannerConfig,
            options.getInputDir(),
            options.getWaitForIndexes(),
            options.getWaitForForeignKeys(),
            options.getEarlyIndexCreateFlag()));

    PipelineResult result = p.run();
    if (options.getWaitUntilFinish() &&
        /* Only if template location is null, there is a dataflow job to wait for. Else it's
         * template generation which doesn't start a dataflow job.
         */
        options.as(DataflowPipelineOptions.class).getTemplateLocation() == null) {
      result.waitUntilFinish();
    }
}
 
Example 10
Source File: ExportPipeline.java    From DataflowTemplates with Apache License 2.0
/**
 * Runs a pipeline to export a Cloud Spanner database to Avro files.
 *
 * @param args arguments to the pipeline
 */
public static void main(String[] args) {

  ExportPipelineOptions options =
      PipelineOptionsFactory.fromArgs(args).withValidation().as(ExportPipelineOptions.class);

  Pipeline p = Pipeline.create(options);

  SpannerConfig spannerConfig =
      SpannerConfig.create()
          .withProjectId(options.getSpannerProjectId())
          .withHost(options.getSpannerHost())
          .withInstanceId(options.getInstanceId())
          .withDatabaseId(options.getDatabaseId());
  p.begin()
      .apply(
          "Run Export",
          new ExportTransform(spannerConfig, options.getOutputDir(), options.getTestJobId(),
                              options.getSnapshotTime()));
  PipelineResult result = p.run();
  if (options.getWaitUntilFinish() &&
      /* Only if template location is null, there is a dataflow job to wait for. Else it's
       * template generation which doesn't start a dataflow job.
       */
      options.as(DataflowPipelineOptions.class).getTemplateLocation() == null) {
    result.waitUntilFinish();
  }
}
 
Example 11
Source File: BigtableToParquet.java    From DataflowTemplates with Apache License 2.0
/**
 * Main entry point for pipeline execution.
 *
 * @param args Command line arguments to the pipeline.
 */
public static void main(String[] args) {
  Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);

  PipelineResult result = run(options);

  // Wait for pipeline to finish only if it is not constructing a template.
  if (options.as(DataflowPipelineOptions.class).getTemplateLocation() == null) {
    result.waitUntilFinish();
  }
}
 
Example 12
Source File: HadoopFormatIOIT.java    From beam with Apache License 2.0
@Test
public void writeAndReadUsingHadoopFormat() {
  writePipeline
      .apply("Generate sequence", GenerateSequence.from(0).to(numberOfRows))
      .apply("Produce db rows", ParDo.of(new TestRow.DeterministicallyConstructTestRowFn()))
      .apply("Prevent fusion before writing", Reshuffle.viaRandomKey())
      .apply("Collect write time", ParDo.of(new TimeMonitor<>(NAMESPACE, "write_time")))
      .apply("Construct rows for DBOutputFormat", ParDo.of(new ConstructDBOutputFormatRowFn()))
      .apply(
          "Write using Hadoop OutputFormat",
          HadoopFormatIO.<TestRowDBWritable, NullWritable>write()
              .withConfiguration(hadoopConfiguration.get())
              .withPartitioning()
              .withExternalSynchronization(
                  new HDFSSynchronization(tmpFolder.getRoot().getAbsolutePath())));

  PipelineResult writeResult = writePipeline.run();
  writeResult.waitUntilFinish();

  PCollection<String> consolidatedHashcode =
      readPipeline
          .apply(
              "Read using Hadoop InputFormat",
              HadoopFormatIO.<LongWritable, TestRowDBWritable>read()
                  .withConfiguration(hadoopConfiguration.get()))
          .apply("Collect read time", ParDo.of(new TimeMonitor<>(NAMESPACE, "read_time")))
          .apply("Get values only", Values.create())
          .apply("Values as string", ParDo.of(new TestRow.SelectNameFn()))
          .apply("Calculate hashcode", Combine.globally(new HashingFn()));

  PAssert.thatSingleton(consolidatedHashcode).isEqualTo(getExpectedHashForRowCount(numberOfRows));

  PipelineResult readResult = readPipeline.run();
  readResult.waitUntilFinish();

  collectAndPublishMetrics(writeResult, readResult);
}
 
Example 13
Source File: GcsKmsKeyIT.java    From beam with Apache License 2.0
/**
 * Tests writing to tempLocation with --dataflowKmsKey set on the command line. Verifies that
 * resulting output uses specified key and is readable. Does not verify any temporary files.
 *
 * <p>This test verifies that GCS file copies work with CMEK-enabled files.
 */
@Test
public void testGcsWriteWithKmsKey() {
  TestPipelineOptions options =
      TestPipeline.testingPipelineOptions().as(TestPipelineOptions.class);
  assertNotNull(options.getTempRoot());
  options.setTempLocation(options.getTempRoot() + "/testGcsWriteWithKmsKey");
  GcsOptions gcsOptions = options.as(GcsOptions.class);

  ResourceId filenamePrefix =
      FileSystems.matchNewResource(gcsOptions.getGcpTempLocation(), true)
          .resolve(
              String.format("GcsKmsKeyIT-%tF-%<tH-%<tM-%<tS-%<tL.output", new Date()),
              StandardResolveOptions.RESOLVE_FILE);

  Pipeline p = Pipeline.create(options);
  p.apply("ReadLines", TextIO.read().from(INPUT_FILE))
      .apply("WriteLines", TextIO.write().to(filenamePrefix));

  PipelineResult result = p.run();
  State state = result.waitUntilFinish();
  assertThat(state, equalTo(State.DONE));

  String filePattern = filenamePrefix + "*-of-*";
  assertThat(new NumberedShardedFile(filePattern), fileContentsHaveChecksum(EXPECTED_CHECKSUM));

  // Verify objects have KMS key set.
  try {
    MatchResult matchResult =
        Iterables.getOnlyElement(FileSystems.match(Collections.singletonList(filePattern)));
    GcsUtil gcsUtil = gcsOptions.getGcsUtil();
    for (Metadata metadata : matchResult.metadata()) {
      String kmsKey =
          gcsUtil.getObject(GcsPath.fromUri(metadata.resourceId().toString())).getKmsKeyName();
      assertNotNull(kmsKey);
    }
  } catch (IOException e) {
    throw new AssertionError(e);
  }
}
 
Example 14
Source File: BigQueryIntegrationTest.java    From gcp-ingestion with Mozilla Public License 2.0
@Test
public void canRecoverFailedInsertsInStreamingMode() throws Exception {
  String table = "my_test_table";
  String tableSpec = String.format("%s.%s", dataset, table);
  TableId tableId = TableId.of(dataset, table);

  bigquery.create(DatasetInfo.newBuilder(dataset).build());

  bigquery.create(TableInfo.newBuilder(tableId,
      StandardTableDefinition.of(Schema.of(Field.of("client_id", LegacySQLTypeName.STRING),
          Field.newBuilder("extra_required_field", LegacySQLTypeName.STRING)
              .setMode(Mode.REQUIRED).build())))
      .build());

  String input = Resources.getResource("testdata/json-payload.ndjson").getPath();
  String output = String.format("%s:%s", projectId, tableSpec);
  String errorOutput = outputPath + "/error/out";

  PipelineResult result = Sink.run(new String[] { "--inputFileFormat=text", "--inputType=file",
      "--input=" + input, "--outputType=bigquery", "--output=" + output, "--errorOutputType=file",
      "--bqWriteMethod=streaming", "--errorOutputFileCompression=UNCOMPRESSED",
      "--errorOutput=" + errorOutput });

  result.waitUntilFinish();

  assertTrue(stringValuesQuery("SELECT client_id FROM " + tableSpec).isEmpty());

  List<String> expectedErrorLines = Lines.resources("testdata/json-payload-wrapped.ndjson");
  List<String> errorOutputLines = Lines.files(outputPath + "/error/out*.ndjson");
  assertThat(errorOutputLines, Matchers.hasSize(expectedErrorLines.size()));
}
 
Example 15
Source File: DirectRunnerTest.java    From beam with Apache License 2.0
@Test
public void tearsDownFnsBeforeFinishing() {
  TEARDOWN_CALL.set(-1);
  final Pipeline pipeline = getPipeline();
  pipeline
      .apply(Create.of("a"))
      .apply(
          ParDo.of(
              new DoFn<String, String>() {
                @ProcessElement
                public void onElement(final ProcessContext ctx) {
                  // no-op
                }

                @Teardown
                public void teardown() {
                  // just to not have a fast execution hiding an issue until we have a shutdown
                  // callback
                  try {
                    Thread.sleep(1000);
                  } catch (final InterruptedException e) {
                    throw new AssertionError(e);
                  }
                  TEARDOWN_CALL.set(System.nanoTime());
                }
              }));
  final PipelineResult pipelineResult = pipeline.run();
  pipelineResult.waitUntilFinish();

  final long doneTs = System.nanoTime();
  final long tearDownTs = TEARDOWN_CALL.get();
  assertThat(tearDownTs, greaterThan(0L));
  assertThat(doneTs, greaterThan(tearDownTs));
}
 
Example 16
Source File: PipelineTestBase.java    From kettle-beam with Apache License 2.0
@Ignore
public void createRunPipeline( TransMeta transMeta ) throws Exception {

  /*
  FileOutputStream fos = new FileOutputStream( "/tmp/"+transMeta.getName()+".ktr" );
  fos.write( transMeta.getXML().getBytes() );
  fos.close();
  */

  PipelineOptions pipelineOptions = PipelineOptionsFactory.create();

  pipelineOptions.setJobName( transMeta.getName() );
  pipelineOptions.setUserAgent( BeamConst.STRING_KETTLE_BEAM );

  BeamJobConfig jobConfig = new BeamJobConfig();
  jobConfig.setName("Direct runner test");
  jobConfig.setRunnerTypeName( RunnerType.Direct.name() );

  // No extra plugins to load : null option
  TransMetaPipelineConverter converter = new TransMetaPipelineConverter( transMeta, metaStore, (String) null, jobConfig );
  Pipeline pipeline = converter.createPipeline( pipelineOptions );

  PipelineResult pipelineResult = pipeline.run();
  pipelineResult.waitUntilFinish();

  MetricResults metricResults = pipelineResult.metrics();

  MetricQueryResults allResults = metricResults.queryMetrics( MetricsFilter.builder().build() );
  for ( MetricResult<Long> result : allResults.getCounters() ) {
    System.out.println( "Name: " + result.getName() + " Attempted: " + result.getAttempted() );
  }
}
 
Example 17
Source File: WindowedWordCount.java    From deployment-examples with MIT License
static void runWindowedWordCount(Options options) throws IOException {
  final String output = options.getOutput();
  final Instant minTimestamp = new Instant(options.getMinTimestampMillis());
  final Instant maxTimestamp = new Instant(options.getMaxTimestampMillis());

  Pipeline pipeline = Pipeline.create(options);

  /*
   * Concept #1: the Beam SDK lets us run the same pipeline with either a bounded or
   * unbounded input source.
   */
  PCollection<String> input =
      pipeline
          /* Read from the GCS file. */
          .apply(TextIO.read().from(options.getInputFile()))
          // Concept #2: Add an element timestamp, using an artificial time just to show
          // windowing.
          // See AddTimestampFn for more detail on this.
          .apply(ParDo.of(new AddTimestampFn(minTimestamp, maxTimestamp)));

  /*
   * Concept #3: Window into fixed windows. The fixed window size for this example defaults to 1
   * minute (you can change this with a command-line option). See the documentation for more
   * information on how fixed windows work, and for information on the other types of windowing
   * available (e.g., sliding windows).
   */
  PCollection<String> windowedWords =
      input.apply(
          Window.into(FixedWindows.of(Duration.standardMinutes(options.getWindowSize()))));

  /*
   * Concept #4: Re-use our existing CountWords transform that does not have knowledge of
   * windows over a PCollection containing windowed values.
   */
  PCollection<KV<String, Long>> wordCounts = windowedWords.apply(new WordCount.CountWords());

  /*
   * Concept #5: Format the results and write to a sharded file partitioned by window, using a
   * simple ParDo operation. Because there may be failures followed by retries, the
   * writes must be idempotent, but the details of writing to files are elided here.
   */
  wordCounts
      .apply(MapElements.via(new WordCount.FormatAsTextFn()))
      .apply(new WriteOneFilePerWindow(output, options.getNumShards()));

  PipelineResult result = pipeline.run();
  try {
    result.waitUntilFinish();
  } catch (Exception exc) {
    result.cancel();
  }
}
 
Example 18
Source File: BulkDecompressorTest.java    From DataflowTemplates with Apache License 2.0
/** Tests the {@link BulkDecompressor.Decompress} performs the decompression properly. */
@Test
public void testDecompressCompressedFile() throws Exception {
  // Arrange
  //
  final ValueProvider<String> outputDirectory =
      pipeline.newProvider(tempFolderOutputPath.toString());

  final Metadata compressedFile1Metadata =
      FileSystems.matchSingleFileSpec(compressedFile.toString());

  final Metadata compressedFile2Metadata =
      FileSystems.matchSingleFileSpec(wrongCompressionExtFile.toString());

  final String expectedOutputFilename = Files.getNameWithoutExtension(compressedFile.toString());

  final String expectedOutputFilePath =
      tempFolderOutputPath.resolve(expectedOutputFilename).normalize().toString();

  // Act
  //
  PCollectionTuple decompressOut =
      pipeline
          .apply("CreateWorkItems", Create.of(compressedFile1Metadata, compressedFile2Metadata))
          .apply(
              "Decompress",
              ParDo.of(new Decompress(outputDirectory))
                  .withOutputTags(DECOMPRESS_MAIN_OUT_TAG, TupleTagList.of(DEADLETTER_TAG)));

  // Assert
  //
  PAssert.that(decompressOut.get(DECOMPRESS_MAIN_OUT_TAG))
      .containsInAnyOrder(expectedOutputFilePath);

  PAssert.that(decompressOut.get(DEADLETTER_TAG))
      .satisfies(
          collection -> {
            KV<String, String> kv = collection.iterator().next();
            assertThat(kv.getKey(), is(equalTo(compressedFile2Metadata.resourceId().toString())));
            assertThat(kv.getValue(), is(notNullValue()));
            return null;
          });

  PipelineResult result = pipeline.run();
  result.waitUntilFinish();

  // Validate the uncompressed file written has the expected file content.
  PCollection<String> validatorOut =
      validatorPipeline.apply("ReadOutputFile", TextIO.read().from(expectedOutputFilePath));

  PAssert.that(validatorOut).containsInAnyOrder(FILE_CONTENT);

  validatorPipeline.run();
}
 
Example 19
Source File: CountReads.java    From dataflow-java with Apache License 2.0
public static void main(String[] args) throws GeneralSecurityException, IOException, URISyntaxException {
  // Register the options so that they show up via --help
  PipelineOptionsFactory.register(Options.class);
  pipelineOptions = PipelineOptionsFactory.fromArgs(args)
      .withValidation().as(Options.class);
  // Option validation is not yet automatic, we make an explicit call here.
  Options.Methods.validateOptions(pipelineOptions);

  auth = GenomicsOptions.Methods.getGenomicsAuth(pipelineOptions);
  p = Pipeline.create(pipelineOptions);

  // ensure data is accessible
  String BAMFilePath = pipelineOptions.getBAMFilePath();
  if (!Strings.isNullOrEmpty(BAMFilePath)) {
    if (GCSURLExists(BAMFilePath)) {
      System.out.println(BAMFilePath + " is present, good.");
    } else {
      System.out.println("Error: " + BAMFilePath + " not found.");
      return;
    }
    if (pipelineOptions.isShardBAMReading()) {
      // the BAM code expects an index at BAMFilePath+".bai"
      // and sharded reading will fail if the index isn't there.
      String BAMIndexPath = BAMFilePath + ".bai";
      if (GCSURLExists(BAMIndexPath)) {
        System.out.println(BAMIndexPath + " is present, good.");
      } else {
        System.out.println("Error: " + BAMIndexPath + " not found.");
        return;
      }
    }
  }
  System.out.println("Output will be written to "+pipelineOptions.getOutput());

  PCollection<Read> reads = getReads();
  PCollection<Long> readCount = reads.apply(Count.<Read>globally());
  PCollection<String> readCountText = readCount.apply("toString", ParDo.of(new DoFn<Long, String>() {
    @ProcessElement
    public void processElement(DoFn<Long, String>.ProcessContext c) throws Exception {
      c.output(String.valueOf(c.element()));
    }
  }));
  readCountText.apply("WriteOutput", TextIO.write().to(pipelineOptions.getOutput()).withoutSharding());

  PipelineResult result = p.run();
  if(pipelineOptions.getWait()) {
    result.waitUntilFinish();
  }
}
 
Example 20
Source File: KettleBeamPipelineExecutor.java    From kettle-beam with Apache License 2.0
private PipelineResult executePipeline() throws KettleException {
  ClassLoader oldContextClassLoader = Thread.currentThread().getContextClassLoader();
  try {
    // Explain to various classes in the Beam API (@see org.apache.beam.sdk.io.FileSystems)
    // what the context classloader is.
    // Set it back when we're done here.
    //
    Thread.currentThread().setContextClassLoader( classLoader );

    final Pipeline pipeline = getPipeline( transMeta, jobConfig );

    logChannel.logBasic( "Creation of Apache Beam pipeline is complete. Starting execution..." );

    // This next command can block on certain runners...
    //
    PipelineResult pipelineResult = asyncExecutePipeline(pipeline);

    Timer timer = new Timer();
    TimerTask timerTask = new TimerTask() {
      @Override public void run() {

        // Log the metrics...
        //
        if ( isLoggingMetrics() ) {
          logMetrics( pipelineResult );
        }

        // Update the listeners.
        //
        updateListeners( pipelineResult );
      }
    };
    // Every 5 seconds
    //
    timer.schedule( timerTask, 5000, 5000 );

    // Wait until we're done
    //
    pipelineResult.waitUntilFinish();

    timer.cancel();
    timer.purge();

    // Log the metrics at the end.
    logMetrics( pipelineResult );

    // Update a last time
    //
    updateListeners( pipelineResult );

    logChannel.logBasic( "  ----------------- End of Beam job " + pipeline.getOptions().getJobName() + " -----------------------" );

    return pipelineResult;
  } catch(Exception e) {
    throw new KettleException( "Error building/executing pipeline", e );
  } finally {
    Thread.currentThread().setContextClassLoader( oldContextClassLoader );
  }

}