org.apache.beam.runners.direct.DirectOptions Java Examples

The following examples show how to use org.apache.beam.runners.direct.DirectOptions. Each example is drawn from an open source project; the source file and license are noted above it.
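Most of the examples follow the same pattern: obtain a DirectOptions instance (from PipelineOptionsFactory, or from an existing pipeline via getOptions().as(DirectOptions.class)), configure it, then create and run the pipeline. The snippet below is a minimal sketch distilled from the examples that follow, not code from any single project.

// Minimal sketch: configure DirectOptions before creating the pipeline.
DirectOptions options = PipelineOptionsFactory.as(DirectOptions.class);
options.setRunner(DirectRunner.class); // execute locally with the direct runner
options.setTargetParallelism(1);       // limit the direct runner to a single worker thread
options.setBlockOnRun(false);          // pipeline.run() returns without blocking

Pipeline pipeline = Pipeline.create(options);
PipelineResult result = pipeline.run();
// Because blockOnRun is false, the caller waits (or cancels) explicitly.
result.waitUntilFinish(Duration.standardSeconds(30));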
Example #1
Source File: PubsubReadIT.java    From beam with Apache License 2.0
@Test
public void testReadPublicData() throws Exception {
  // The pipeline will never terminate on its own
  pipeline.getOptions().as(DirectOptions.class).setBlockOnRun(false);

  PCollection<String> messages =
      pipeline.apply(
          PubsubIO.readStrings()
              .fromTopic("projects/pubsub-public-data/topics/taxirides-realtime"));

  messages.apply(
      "waitForAnyMessage", signal.signalSuccessWhen(messages.getCoder(), anyMessages -> true));

  Supplier<Void> start = signal.waitForStart(Duration.standardMinutes(5));
  pipeline.apply(signal.signalStart());
  PipelineResult job = pipeline.run();
  start.get();

  signal.waitForSuccess(Duration.standardSeconds(30));
  // A runner may not support cancel
  try {
    job.cancel();
  } catch (UnsupportedOperationException exc) {
    // noop
  }
}
 
Example #2
Source File: ElasticsearchDatasetRuntime.java    From components with Apache License 2.0
@Override
public void getSample(int limit, Consumer<IndexedRecord> consumer) {
    // Create an input runtime based on the properties; make sure to read only the first batch of documents
    // from the index since we're computing a sample
    ElasticsearchInputRuntime inputRuntime = new ElasticsearchInputRuntime(true);
    ElasticsearchInputProperties inputProperties = new ElasticsearchInputProperties(null);
    inputProperties.init();
    inputProperties.setDatasetProperties(properties);
    inputRuntime.initialize(null, inputProperties);

    DirectOptions options = BeamLocalRunnerOption.getOptions();
    final Pipeline p = Pipeline.create(options);

    try (DirectConsumerCollector<IndexedRecord> collector = DirectConsumerCollector.of(consumer)) {
        // Collect a sample of the input records.
        p.apply(inputRuntime) //
                .apply(Sample.<IndexedRecord> any(limit)).apply(collector);
        p.run().waitUntilFinish();
    }
}
 
Example #3
Source File: BigQueryDatasetRuntime.java    From components with Apache License 2.0
public void getSampleDeprecated(int limit, Consumer<IndexedRecord> consumer) {
    // Create a pipeline using the input component to get records.
    DirectOptions options = BeamLocalRunnerOption.getOptions();
    final Pipeline p = Pipeline.create(options);

    // Create an input runtime based on the properties.
    BigQueryInputRuntime inputRuntime = new BigQueryInputRuntime();
    BigQueryInputProperties inputProperties = new BigQueryInputProperties(null);
    inputProperties.init();
    inputProperties.setDatasetProperties(properties);
    inputRuntime.initialize(new BeamJobRuntimeContainer(options), inputProperties);

    try (DirectConsumerCollector<IndexedRecord> collector = DirectConsumerCollector.of(consumer)) {
        // Collect a sample of the input records.
        p
                .apply(inputRuntime) //
                .apply(Sample.<IndexedRecord> any(limit))
                .apply(collector);
        PipelineResult pr = p.run();
        pr.waitUntilFinish();
    }
}
 
Example #4
Source File: S3DatasetRuntime.java    From components with Apache License 2.0
@Override
public void getSample(int limit, Consumer<IndexedRecord> consumer) {
    // Create an input runtime based on the properties.
    S3InputRuntime inputRuntime = new S3InputRuntime();
    S3InputProperties inputProperties = new S3InputProperties(null);
    inputProperties.limit.setValue(limit);
    inputProperties.init();
    inputProperties.setDatasetProperties(properties);
    inputRuntime.initialize(null, inputProperties);
    // Create a pipeline using the input component to get records.

    DirectOptions options = BeamLocalRunnerOption.getOptions();
    final Pipeline p = Pipeline.create(options);

    try (DirectConsumerCollector<IndexedRecord> collector = DirectConsumerCollector.of(consumer)) {
        // Collect a sample of the input records.
        p.apply(inputRuntime) //
                .apply(collector);
        p.run().waitUntilFinish();
    }
}
 
Example #5
Source File: PubsubIntegrationTest.java    From gcp-ingestion with Mozilla Public License 2.0
@Test(timeout = 30000)
public void canReadPubsubInput() throws Exception {
  List<String> inputLines = Lines.resources("testdata/basic-messages-nonempty.ndjson");
  publishLines(inputLines);

  pipeline.getOptions().as(DirectOptions.class).setBlockOnRun(false);

  SinkOptions.Parsed sinkOptions = pipeline.getOptions().as(SinkOptions.Parsed.class);
  sinkOptions.setInput(pipeline.newProvider(subscriptionName.toString()));

  PCollection<String> output = pipeline.apply(InputType.pubsub.read(sinkOptions))
      .apply("encodeJson", OutputFileFormat.json.encode());

  PAssert.that(output).containsInAnyOrder(inputLines);

  // This runs in the background and returns immediately due to setBlockOnRun above.
  PipelineResult result = pipeline.run();

  // The wait here is determined empirically; it's not entirely clear why it takes this long.
  System.err.println("Waiting 15 seconds to make sure we've processed all messages...");
  result.waitUntilFinish(Duration.millis(15000));
  System.err.println("Done waiting; now cancelling the pipeline so the test can finish.");
  result.cancel();
}
 
Example #6
Source File: PubsubIntegrationTest.java    From gcp-ingestion with Mozilla Public License 2.0
@Test(timeout = 30000)
public void canSendPubsubOutput() throws Exception {
  final List<String> inputLines = Lines.resources("testdata/pubsub-integration/input.ndjson");

  pipeline.getOptions().as(DirectOptions.class).setBlockOnRun(false);

  SinkOptions.Parsed sinkOptions = pipeline.getOptions().as(SinkOptions.Parsed.class);
  sinkOptions.setOutput(pipeline.newProvider(topicName.toString()));
  // We would normally use pipeline.newProvider instead of StaticValueProvider in tests,
  // but something about this configuration causes the pipeline to stall when CompressPayload
  // accesses a method on the underlying enum value when defined via pipeline.newProvider.
  sinkOptions.setOutputPubsubCompression(StaticValueProvider.of(Compression.UNCOMPRESSED));

  pipeline.apply(Create.of(inputLines)).apply(InputFileFormat.json.decode())
      .apply(OutputType.pubsub.write(sinkOptions));

  final PipelineResult result = pipeline.run();

  System.err.println("Waiting for subscriber to receive messages published in the pipeline...");
  List<String> expectedLines = Lines.resources("testdata/pubsub-integration/truncated.ndjson");
  List<String> received = receiveLines(expectedLines.size());
  assertThat(received, matchesInAnyOrder(expectedLines));
  result.cancel();
}
 
Example #7
Source File: PubsubIntegrationTest.java    From gcp-ingestion with Mozilla Public License 2.0
@Test(timeout = 30000)
public void canSendGzippedPayloads() throws Exception {
  final List<String> inputLines = Lines.resources("testdata/pubsub-integration/input.ndjson");

  pipeline.getOptions().as(DirectOptions.class).setBlockOnRun(false);

  SinkOptions sinkOptions = pipeline.getOptions().as(SinkOptions.class);
  sinkOptions.setOutputType(OutputType.pubsub);
  sinkOptions.setOutput(pipeline.newProvider(topicName.toString()));
  SinkOptions.Parsed options = SinkOptions.parseSinkOptions(sinkOptions);

  pipeline.apply(Create.of(inputLines)).apply(InputFileFormat.json.decode())
      .apply(options.getOutputType().write(options));

  final PipelineResult result = pipeline.run();

  System.err.println("Waiting for subscriber to receive messages published in the pipeline...");
  List<String> expectedLines = Lines.resources("testdata/pubsub-integration/gzipped.ndjson");
  List<String> received = receiveLines(expectedLines.size());
  assertThat(received, matchesInAnyOrder(expectedLines));
  result.cancel();
}
 
Example #8
Source File: FhirIOReadIT.java    From beam with Apache License 2.0
@Test
public void testFhirIORead() throws Exception {
  pipeline.getOptions().as(DirectOptions.class).setBlockOnRun(false);

  FhirIO.Read.Result result =
      pipeline
          .apply(PubsubIO.readStrings().fromSubscription(pubsubSubscription))
          .apply(FhirIO.readResources());

  PCollection<String> resources = result.getResources();
  resources.apply(
      "waitForAnyMessage", signal.signalSuccessWhen(resources.getCoder(), anyResources -> true));
  // wait for any resource

  Supplier<Void> start = signal.waitForStart(Duration.standardMinutes(5));
  pipeline.apply(signal.signalStart());
  PipelineResult job = pipeline.run();
  start.get();
  signal.waitForSuccess(Duration.standardSeconds(30));

  // A runner may not support cancel
  try {
    job.cancel();
  } catch (UnsupportedOperationException exc) {
    // noop
  }
}
 
Example #9
Source File: KinesisDatasetRuntime.java    From components with Apache License 2.0
@Override
public void getSample(int limit, Consumer<IndexedRecord> consumer) {
    // Create an input runtime based on the properties.
    KinesisInputRuntime inputRuntime = new KinesisInputRuntime();
    KinesisInputProperties inputProperties = new KinesisInputProperties(null);
    inputProperties.init();
    inputProperties.setDatasetProperties(properties);
    inputProperties.useMaxNumRecords.setValue(true);
    inputProperties.maxNumRecords.setValue(limit);
    inputProperties.useMaxReadTime.setValue(true);
    inputProperties.maxReadTime.setValue(10000L);
    inputProperties.position.setValue(KinesisInputProperties.OffsetType.EARLIEST);
    inputRuntime.initialize(null, inputProperties);

    // Create a pipeline using the input component to get records.
    DirectOptions options = BeamLocalRunnerOption.getOptions();
    final Pipeline p = Pipeline.create(options);

    try (DirectConsumerCollector<IndexedRecord> collector = DirectConsumerCollector.of(consumer)) {
        // Collect a sample of the input records.
        p
                .apply(inputRuntime) //
                .apply(Sample.<IndexedRecord> any(limit))
                .apply(collector);
        p.run().waitUntilFinish();
    }
}
 
Example #10
Source File: BeamDirectTestResource.java    From components with Apache License 2.0
/**
 * @return the options used to create this pipeline. These can be changed before the Pipeline is created.
 */
public DirectOptions getOptions() {
    if (options == null) {
        options = PipelineOptionsFactory.create().as(DirectOptions.class);
        options.setRunner(DirectRunner.class);
    }
    return options;
}
 
Example #11
Source File: PubsubIntegrationTest.java    From gcp-ingestion with Mozilla Public License 2.0
@Test(timeout = 30000)
public void canSendPubsubErrorOutput() throws Exception {
  final List<String> inputLines = Lines
      .resources("testdata/pubsub-integration/error-input.ndjson");

  pipeline.getOptions().as(DirectOptions.class).setBlockOnRun(false);

  SinkOptions.Parsed sinkOptions = pipeline.getOptions().as(SinkOptions.Parsed.class);
  sinkOptions.setInput(pipeline.newProvider("test input"));
  sinkOptions.setJobName("test job name");
  sinkOptions.setErrorOutput(pipeline.newProvider(topicName.toString()));
  // We would normally use pipeline.newProvider instead of StaticValueProvider in tests,
  // but something about this configuration causes the pipeline to stall when CompressPayload
  // accesses a method on the underlying enum value when defined via pipeline.newProvider.
  sinkOptions.setErrorOutputPubsubCompression(StaticValueProvider.of(Compression.UNCOMPRESSED));

  pipeline.apply(Create.of(inputLines)).apply(InputFileFormat.json.decode())
      .apply(ErrorOutputType.pubsub.write(sinkOptions));

  final PipelineResult result = pipeline.run();

  System.err.println("Waiting for subscriber to receive messages published in the pipeline...");
  List<String> expectedLines = Lines.resources("testdata/pubsub-integration/error-output.ndjson");
  List<String> received = receiveLines(expectedLines.size());
  assertThat(received, matchesInAnyOrder(expectedLines));
  result.cancel();
}
 
Example #12
Source File: SimpleFileIODatasetRuntime.java    From components with Apache License 2.0
@Override
public void getSample(int limit, Consumer<IndexedRecord> consumer) {
    // Create an input runtime based on the properties.
    SimpleFileIOInputRuntime inputRuntime = new SimpleFileIOInputRuntime();
    SimpleFileIOInputProperties inputProperties = new SimpleFileIOInputProperties(null);
    inputProperties.limit.setValue(limit);
    inputProperties.init();
    inputProperties.setDatasetProperties(properties);
    inputRuntime.initialize(null, inputProperties);
    // Create a pipeline using the input component to get records.

    DirectOptions options = BeamLocalRunnerOption.getOptions();
    final Pipeline p = Pipeline.create(options);

    try (DirectConsumerCollector<IndexedRecord> collector = DirectConsumerCollector.of(consumer)) {
        // Collect a sample of the input records.
        p.apply(inputRuntime) //
                .apply(collector);
        try {
            p.run().waitUntilFinish();
        } catch (Pipeline.PipelineExecutionException e) {
            if (e.getCause() instanceof TalendRuntimeException)
                throw (TalendRuntimeException) e.getCause();
            throw e;
        }
    }
}
 
Example #13
Source File: PubSubDatasetRuntime.java    From components with Apache License 2.0
@Override
public void getSample(int limit, Consumer<IndexedRecord> consumer) {
    // Because PubSub has no notion of offsets and messages are removed once read,
    // we have to create a dummy reader which does not ack messages after reading

    // Create an input runtime based on the properties.
    PubSubInputRuntime inputRuntime = new PubSubInputRuntime();
    PubSubInputProperties inputProperties = new PubSubInputProperties(null);
    inputProperties.init();
    inputProperties.setDatasetProperties(properties);
    inputProperties.useMaxNumRecords.setValue(true);
    inputProperties.maxNumRecords.setValue(limit);
    inputProperties.useMaxReadTime.setValue(true);
    // 10s; for small datasets this value should ideally depend on the ack deadline
    inputProperties.maxReadTime.setValue(10000L);
    inputProperties.noACK.setValue(true);
    inputRuntime.initialize(null, inputProperties);

    DirectOptions options = BeamLocalRunnerOption.getOptions();
    final Pipeline p = Pipeline.create(options);

    try (DirectConsumerCollector<IndexedRecord> collector = DirectConsumerCollector.of(consumer)) {
        // Collect a sample of the input records.
        p.apply(inputRuntime) //
                .apply(Sample.<IndexedRecord> any(limit)).apply(collector);
        p.run().waitUntilFinish();
    }
}
 
Example #14
Source File: BeamLocalRunnerOption.java    From components with Apache License 2.0
public static DirectOptions getOptions() {
    if (options == null) {
        LOGGER.info("Create DirectOption");
        options = PipelineOptionsFactory.as(DirectOptions.class);
        options.setTargetParallelism(1);
        options.setRunner(DirectRunner.class);
        options.setEnforceEncodability(false);
        options.setEnforceImmutability(false);
    }
    return options;
}
 
Example #15
Source File: PubsubReadIT.java    From beam with Apache License 2.0
@Test
public void testReadPubsubMessageId() throws Exception {
  // The pipeline will never terminate on its own
  pipeline.getOptions().as(DirectOptions.class).setBlockOnRun(false);

  PCollection<PubsubMessage> messages =
      pipeline.apply(
          PubsubIO.readMessagesWithAttributesAndMessageId()
              .fromTopic("projects/pubsub-public-data/topics/taxirides-realtime"));

  messages.apply(
      "isMessageIdNonNull",
      signal.signalSuccessWhen(messages.getCoder(), new NonEmptyMessageIdCheck()));

  Supplier<Void> start = signal.waitForStart(Duration.standardMinutes(5));
  pipeline.apply(signal.signalStart());
  PipelineResult job = pipeline.run();
  start.get();

  signal.waitForSuccess(Duration.standardMinutes(1));
  // A runner may not support cancel
  try {
    job.cancel();
  } catch (UnsupportedOperationException exc) {
    // noop
  }
}
 
Example #16
Source File: BeamDirectPipelineRunConfiguration.java    From hop with Apache License 2.0
@Override public PipelineOptions getPipelineOptions() {
  DirectOptions options = PipelineOptionsFactory.as( DirectOptions.class );
  options.setBlockOnRun( !isRunningAsynchronous() );
  if ( StringUtils.isNotEmpty( numberOfWorkers ) ) {
    int targetParallelism = Const.toInt( environmentSubstitute( numberOfWorkers ), 1 );
    options.setTargetParallelism( targetParallelism );
  }

  return options;
}
 
Example #17
Source File: BeamEnumerableConverter.java    From beam with Apache License 2.0
private static PipelineResult limitRun(
    PipelineOptions options,
    BeamRelNode node,
    DoFn<Row, Void> doFn,
    Queue<Row> values,
    int limitCount) {
  options.as(DirectOptions.class).setBlockOnRun(false);
  Pipeline pipeline = Pipeline.create(options);
  PCollection<Row> resultCollection = BeamSqlRelUtils.toPCollection(pipeline, node);
  resultCollection.apply(ParDo.of(doFn));

  PipelineResult result = pipeline.run();

  State state;
  while (true) {
    // Check pipeline state every second
    state = result.waitUntilFinish(Duration.standardSeconds(1));
    if (state != null && state.isTerminal()) {
      if (PipelineResult.State.FAILED.equals(state)) {
        throw new RuntimeException("Pipeline failed for unknown reason");
      }
      break;
    }

    try {
      if (values.size() >= limitCount) {
        result.cancel();
        break;
      }
    } catch (IOException e) {
      LOG.warn(e.toString());
      break;
    }
  }

  return result;
}
 
Example #18
Source File: DataCatalogGCSIT.java    From beam with Apache License 2.0
@Test
public void testReadFromGCS() throws Exception {
  String gcsEntryId =
      "`datacatalog`" // this is part of the resource name in DataCatalog, so it has to be
          + ".`entry`" // different from the table provider name ("dc" in this test)
          + ".`apache-beam-testing`"
          + ".`us-central1`"
          + ".`samples`"
          + ".`integ_test_small_csv_test_1`";

  try (DataCatalogTableProvider tableProvider =
      DataCatalogTableProvider.create(
          pipeline.getOptions().as(DataCatalogPipelineOptions.class))) {
    PCollection<Row> result =
        pipeline.apply(
            "query",
            SqlTransform.query("SELECT id, name, type FROM " + gcsEntryId)
                .withDefaultTableProvider("dc", tableProvider));

    pipeline.getOptions().as(DirectOptions.class).setBlockOnRun(true);
    PAssert.that(result)
        .containsInAnyOrder(
            row(1, "customer1", "test"),
            row(2, "customer2", "test"),
            row(3, "customer1", "test"),
            row(4, "customer2", "test"));
    pipeline.run().waitUntilFinish(Duration.standardMinutes(2));
  }
}
 
Example #19
Source File: JdbcAvroJob.java    From dbeam with Apache License 2.0
public static JdbcAvroJob create(final PipelineOptions pipelineOptions, final String output)
    throws IOException, ClassNotFoundException {
  // make sure pipeline.run() does not call waitUntilFinish
  // instead we call with an explicit duration/exportTimeout configuration
  pipelineOptions.as(DirectOptions.class).setBlockOnRun(false);
  return new JdbcAvroJob(
      pipelineOptions,
      Pipeline.create(pipelineOptions),
      JdbcExportArgsFactory.fromPipelineOptions(pipelineOptions),
      output);
}
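Because setBlockOnRun(false) makes pipeline.run() return immediately, the caller of create() is expected to wait with an explicit timeout. The helper below is a hypothetical sketch of that pattern; the method name and timeout parameter are assumptions, not part of JdbcAvroJob.

// Hypothetical helper: run the pipeline, wait up to an explicit timeout, and
// cancel if the job has not reached a terminal state.
static PipelineResult runWithTimeout(Pipeline pipeline, Duration exportTimeout) throws IOException {
  PipelineResult result = pipeline.run();
  PipelineResult.State state = result.waitUntilFinish(exportTimeout);
  if (state == null || !state.isTerminal()) {
    result.cancel(); // a runner may not support cancel (see the examples above)
  }
  return result;
}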
 
Example #20
Source File: KafkaCSVTableIT.java    From beam with Apache License 2.0
@Test
public void testFake() throws InterruptedException {
  KafkaOptions kafkaOptions = pipeline.getOptions().as(KafkaOptions.class);
  pipeline.getOptions().as(DirectOptions.class).setBlockOnRun(false);
  String createTableString =
      "CREATE EXTERNAL TABLE kafka_table(\n"
          + "order_id INTEGER, \n"
          + "member_id INTEGER, \n"
          + "item_name INTEGER \n"
          + ") \n"
          + "TYPE 'kafka' \n"
          + "LOCATION '"
          + "'\n"
          + "TBLPROPERTIES '"
          + getKafkaPropertiesString(kafkaOptions)
          + "'";
  TableProvider tb = new KafkaTableProvider();
  BeamSqlEnv env = BeamSqlEnv.inMemory(tb);

  env.executeDdl(createTableString);

  PCollection<Row> queryOutput =
      BeamSqlRelUtils.toPCollection(pipeline, env.parseQuery("SELECT * FROM kafka_table"));

  queryOutput
      .apply(ParDo.of(new FakeKvPair()))
      .apply(
          "waitForSuccess",
          ParDo.of(
              new StreamAssertEqual(
                  ImmutableSet.of(
                      row(TEST_TABLE_SCHEMA, 0, 1, 0),
                      row(TEST_TABLE_SCHEMA, 1, 2, 1),
                      row(TEST_TABLE_SCHEMA, 2, 3, 2)))));
  queryOutput.apply(logRecords(""));
  pipeline.run();
  TimeUnit.MILLISECONDS.sleep(3000);
  produceSomeRecords(3);

  for (int i = 0; i < 200; i++) {
    if (FLAG.getOrDefault(pipeline.getOptions().getOptionsId(), false)) {
      return;
    }
    TimeUnit.MILLISECONDS.sleep(60);
  }
  Assert.fail();
}
 
Example #21
Source File: FeatureSetSpecReadAndWriteTest.java    From feast with Apache License 2.0
public static PipelineOptions makePipelineOptions() {
  DirectOptions options = PipelineOptionsFactory.as(DirectOptions.class);
  options.setJobName("test_job");
  options.setBlockOnRun(false);
  return options;
}