Java Code Examples for org.apache.beam.sdk.options.PipelineOptions#as()

The following examples show how to use org.apache.beam.sdk.options.PipelineOptions#as(). Each example is taken from an open-source project; the source file, project, and license are noted above the code.
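PipelineOptions#as() re-interprets a PipelineOptions instance as any registered sub-interface; both views are backed by the same underlying option values. Below is a minimal sketch of the pattern, using a hypothetical MyOptions interface (the option name and default are illustrative and not taken from any project below):

import org.apache.beam.sdk.options.Default;
import org.apache.beam.sdk.options.Description;
import org.apache.beam.sdk.options.PipelineOptions;
import org.apache.beam.sdk.options.PipelineOptionsFactory;

public class AsExample {

  // A hypothetical options sub-interface; each getter/setter pair defines an option.
  public interface MyOptions extends PipelineOptions {

    @Description("A hypothetical example option.")
    @Default.String("hello")
    String getGreeting();

    void setGreeting(String greeting);
  }

  public static void main(String[] args) {
    PipelineOptions options = PipelineOptionsFactory.fromArgs(args).create();
    // as() returns a view over the same underlying options, so a value set
    // through one view is visible through every other view.
    MyOptions myOptions = options.as(MyOptions.class);
    myOptions.setGreeting("hi");
  }
}

Registering the interface with PipelineOptionsFactory.register(MyOptions.class) additionally lets its options be recognized and validated when parsing command-line arguments.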
Example 1
Source File: JdbcExportArgsFactory.java    From dbeam with Apache License 2.0
public static JdbcExportArgs fromPipelineOptions(final PipelineOptions options)
    throws ClassNotFoundException, IOException {
  final JdbcExportPipelineOptions exportOptions = options.as(JdbcExportPipelineOptions.class);
  final JdbcAvroArgs jdbcAvroArgs =
      JdbcAvroArgs.create(
          JdbcConnectionArgs.create(exportOptions.getConnectionUrl())
              .withUsername(exportOptions.getUsername())
              .withPassword(PasswordReader.INSTANCE.readPassword(exportOptions).orElse(null)),
          exportOptions.getFetchSize(),
          exportOptions.getAvroCodec(),
          Optional.ofNullable(exportOptions.getPreCommand()).orElse(Collections.emptyList()));

  return JdbcExportArgs.create(
      jdbcAvroArgs,
      createQueryArgs(exportOptions),
      exportOptions.getAvroSchemaNamespace(),
      Optional.ofNullable(exportOptions.getAvroDoc()),
      exportOptions.isUseAvroLogicalTypes(),
      Duration.parse(exportOptions.getExportTimeout()),
      BeamJdbcAvroSchema.parseOptionalInputAvroSchemaFile(exportOptions.getAvroSchemaFilePath()));
}
 
Example 2
Source File: BigQueryOutputRuntime.java    From components with Apache License 2.0
@Override
public ValidationResult initialize(RuntimeContainer container, BigQueryOutputProperties properties) {
    this.properties = properties;
    this.dataset = properties.getDatasetProperties();
    this.datastore = dataset.getDatastoreProperties();

    Object pipelineOptionsObj = container.getGlobalData(BeamJobRuntimeContainer.PIPELINE_OPTIONS);
    if (pipelineOptionsObj != null) {
        PipelineOptions pipelineOptions = (PipelineOptions) pipelineOptionsObj;
        GcpServiceAccountOptions gcpOptions = pipelineOptions.as(GcpServiceAccountOptions.class);
        if (!"DataflowRunner".equals(gcpOptions.getRunner().getSimpleName())) {
            // when using the Dataflow runner, these properties have already been set at the pipeline level
            gcpOptions.setProject(datastore.projectName.getValue());
            gcpOptions.setTempLocation(datastore.tempGsFolder.getValue());
            gcpOptions.setCredentialFactoryClass(ServiceAccountCredentialFactory.class);
            gcpOptions.setServiceAccountFile(datastore.serviceAccountFile.getValue());
            gcpOptions.setGcpCredential(BigQueryConnection.createCredentials(datastore));
        }
    }

    return ValidationResult.OK;
}
 
Example 3
Source File: PubSubInputRuntime.java    From components with Apache License 2.0
@Override
public ValidationResult initialize(RuntimeContainer container, PubSubInputProperties properties) {
    this.properties = properties;
    this.dataset = properties.getDatasetProperties();
    this.datastore = dataset.getDatastoreProperties();

    if (container != null) {
        Object pipelineOptionsObj = container.getGlobalData(BeamJobRuntimeContainer.PIPELINE_OPTIONS);
        if (pipelineOptionsObj != null) {
            PipelineOptions pipelineOptions = (PipelineOptions) pipelineOptionsObj;
            GcpServiceAccountOptions gcpOptions = pipelineOptions.as(GcpServiceAccountOptions.class);
            runOnDataflow = "DataflowRunner".equals(gcpOptions.getRunner().getSimpleName());
            if (!runOnDataflow) {
                gcpOptions.setProject(datastore.projectName.getValue());
                if (datastore.serviceAccountFile.getValue() != null) {
                    gcpOptions.setCredentialFactoryClass(ServiceAccountCredentialFactory.class);
                    gcpOptions.setServiceAccountFile(datastore.serviceAccountFile.getValue());
                    gcpOptions.setGcpCredential(PubSubConnection.createCredentials(datastore));
                }
            }
        }
    }
    return ValidationResult.OK;
}
 
Example 4
Source File: BigQueryIO.java    From beam with Apache License 2.0
@Override
public void validate(PipelineOptions pipelineOptions) {
  BigQueryOptions options = pipelineOptions.as(BigQueryOptions.class);

  // The user specified a table.
  if (getJsonTableRef() != null && getJsonTableRef().isAccessible() && getValidate()) {
    TableReference table = getTableWithDefaultProject(options).get();
    DatasetService datasetService = getBigQueryServices().getDatasetService(options);
    // Check for destination table presence and emptiness for early failure notification.
    // Note that a presence check can fail when the table or dataset is created by an earlier
    // stage of the pipeline. For these cases the #withoutValidation method can be used to
    // disable the check.
    BigQueryHelpers.verifyDatasetPresence(datasetService, table);
    if (getCreateDisposition() == BigQueryIO.Write.CreateDisposition.CREATE_NEVER) {
      BigQueryHelpers.verifyTablePresence(datasetService, table);
    }
    if (getWriteDisposition() == BigQueryIO.Write.WriteDisposition.WRITE_EMPTY) {
      BigQueryHelpers.verifyTableNotExistOrEmpty(datasetService, table);
    }
  }
}
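As the comment above notes, the presence checks can fail when the table or dataset is created by an earlier stage of the pipeline, in which case validation can be disabled with withoutValidation(). A minimal opt-out sketch, assuming an existing PCollection<TableRow> named rows and a placeholder table spec:

// rows is assumed to be a PCollection<TableRow>; the table spec is a placeholder.
rows.apply(
    "WriteToBigQuery",
    BigQueryIO.writeTableRows()
        .to("my-project:my_dataset.my_table")
        .withoutValidation());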
 
Example 5
Source File: AbstractOperatorTest.java    From beam with Apache License 2.0
/**
 * Run all tests with the given runner.
 *
 * @param tc the test case to execute
 */
@SuppressWarnings("unchecked")
public <T> void execute(TestCase<T> tc) {

  final SingleJvmAccumulatorProvider.Factory accumulatorProvider =
      SingleJvmAccumulatorProvider.Factory.get();
  final PipelineOptions pipelineOptions = PipelineOptionsFactory.create();
  final EuphoriaOptions euphoriaOptions = pipelineOptions.as(EuphoriaOptions.class);
  euphoriaOptions.setAccumulatorProviderFactory(accumulatorProvider);
  final Pipeline pipeline = TestPipeline.create(pipelineOptions);
  pipeline.getCoderRegistry().registerCoderForClass(Object.class, KryoCoder.of(pipelineOptions));
  final PCollection<T> output = tc.getOutput(pipeline);
  tc.validate(output);
  pipeline.run().waitUntilFinish();
  tc.validateAccumulators(accumulatorProvider);
}
 
Example 6
Source File: BigQueryIOReadTest.java    From beam with Apache License 2.0
@Test
public void testBigQueryQuerySourceEstimatedSize() throws Exception {

  String queryString = "fake query string";

  PipelineOptions options = PipelineOptionsFactory.create();
  BigQueryOptions bqOptions = options.as(BigQueryOptions.class);
  bqOptions.setProject("project");
  String stepUuid = "testStepUuid";

  BigQuerySourceBase<TableRow> bqSource =
      BigQueryQuerySourceDef.create(
              fakeBqServices,
              ValueProvider.StaticValueProvider.of(queryString),
              true, /* flattenResults */
              true, /* useLegacySql */
              QueryPriority.BATCH,
              null,
              null,
              null)
          .toSource(stepUuid, TableRowJsonCoder.of(), BigQueryIO.TableRowParser.INSTANCE);

  fakeJobService.expectDryRunQuery(
      bqOptions.getProject(),
      queryString,
      new JobStatistics().setQuery(new JobStatistics2().setTotalBytesProcessed(100L)));

  assertEquals(100, bqSource.getEstimatedSizeBytes(bqOptions));
}
 
Example 7
Source File: UnboundedSourceWrapper.java    From beam with Apache License 2.0
@SuppressWarnings("unchecked")
public UnboundedSourceWrapper(
    String stepName,
    PipelineOptions pipelineOptions,
    UnboundedSource<OutputT, CheckpointMarkT> source,
    int parallelism)
    throws Exception {
  this.stepName = stepName;
  this.serializedOptions = new SerializablePipelineOptions(pipelineOptions);
  this.isConvertedBoundedSource =
      source instanceof UnboundedReadFromBoundedSource.BoundedToUnboundedSourceAdapter;

  if (source.requiresDeduping()) {
    LOG.warn("Source {} requires deduping but Flink runner doesn't support this yet.", source);
  }

  Coder<CheckpointMarkT> checkpointMarkCoder = source.getCheckpointMarkCoder();
  if (checkpointMarkCoder == null) {
    LOG.info("No CheckpointMarkCoder specified for this source. Won't create snapshots.");
    checkpointCoder = null;
  } else {

    Coder<? extends UnboundedSource<OutputT, CheckpointMarkT>> sourceCoder =
        (Coder) SerializableCoder.of(new TypeDescriptor<UnboundedSource>() {});

    checkpointCoder = KvCoder.of(sourceCoder, checkpointMarkCoder);
  }

  // Get the splits early. We assume that the generated splits are stable;
  // this is necessary so that the mapping of state to source is correct
  // when restoring.
  splitSources = source.split(parallelism, pipelineOptions);

  FlinkPipelineOptions options = pipelineOptions.as(FlinkPipelineOptions.class);
  idleTimeoutMs = options.getShutdownSourcesAfterIdleMs();
}
 
Example 8
Source File: DataflowPipelineDebugOptions.java    From beam with Apache License 2.0
@Override
public Stager create(PipelineOptions options) {
  DataflowPipelineDebugOptions debugOptions = options.as(DataflowPipelineDebugOptions.class);
  return InstanceBuilder.ofType(Stager.class)
      .fromClass(debugOptions.getStagerClass())
      .fromFactoryMethod("fromOptions")
      .withArg(PipelineOptions.class, options)
      .build();
}
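The create(PipelineOptions) method above implements Beam's DefaultValueFactory contract, which lets an option compute its default lazily from the other options on the same instance. A minimal sketch of that wiring, with a hypothetical debugLabel option (the option and factory names are illustrative):

import org.apache.beam.sdk.options.Default;
import org.apache.beam.sdk.options.DefaultValueFactory;
import org.apache.beam.sdk.options.PipelineOptions;

public interface MyDebugOptions extends PipelineOptions {

  // The factory runs only when the user has not set the option explicitly.
  @Default.InstanceFactory(LabelFactory.class)
  String getDebugLabel();

  void setDebugLabel(String value);

  class LabelFactory implements DefaultValueFactory<String> {
    @Override
    public String create(PipelineOptions options) {
      // Derive the default from another option on the same instance.
      return "job-" + options.getJobName();
    }
  }
}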
 
Example 9
Source File: BigQueryInputRuntime.java    From components with Apache License 2.0
@Override
public ValidationResult initialize(RuntimeContainer container, BigQueryInputProperties properties) {
    this.properties = properties;
    this.dataset = properties.getDatasetProperties();
    this.datastore = dataset.getDatastoreProperties();

    // Data returned by BigQueryIO does not contain its own schema, so it has to be retrieved before the
    // read and write operations
    Schema schema = properties.getDatasetProperties().main.schema.getValue();
    if (schema == null || AvroUtils.isSchemaEmpty(schema) || AvroUtils.isIncludeAllFields(schema)) {
        BigQueryDatasetRuntime schemaFetcher = new BigQueryDatasetRuntime();
        schemaFetcher.initialize(container, properties.getDatasetProperties());
        schema = schemaFetcher.getSchema();
    }

    Object pipelineOptionsObj = container.getGlobalData(BeamJobRuntimeContainer.PIPELINE_OPTIONS);
    if (pipelineOptionsObj != null) {
        PipelineOptions pipelineOptions = (PipelineOptions) pipelineOptionsObj;
        GcpServiceAccountOptions gcpOptions = pipelineOptions.as(GcpServiceAccountOptions.class);
        if (!"DataflowRunner".equals(gcpOptions.getRunner().getSimpleName())) {
            // when using the Dataflow runner, these properties have already been set at the pipeline level
            gcpOptions.setProject(datastore.projectName.getValue());
            gcpOptions.setTempLocation(datastore.tempGsFolder.getValue());
            gcpOptions.setCredentialFactoryClass(ServiceAccountCredentialFactory.class);
            gcpOptions.setServiceAccountFile(datastore.serviceAccountFile.getValue());
            gcpOptions.setGcpCredential(BigQueryConnection.createCredentials(datastore));
        }
    }

    this.defaultOutputCoder = AvroCoder.of(schema);

    return ValidationResult.OK;
}
 
Example 10
Source File: ReaderInvocationUtil.java    From beam with Apache License 2.0
public ReaderInvocationUtil(
    String stepName, PipelineOptions options, FlinkMetricContainer container) {
  FlinkPipelineOptions flinkPipelineOptions = options.as(FlinkPipelineOptions.class);
  this.stepName = stepName;
  this.enableMetrics = !flinkPipelineOptions.getDisableMetrics();
  this.container = container;
}
 
Example 11
Source File: StreamingDataflowWorkerOptions.java    From beam with Apache License 2.0
@Override
public Integer create(PipelineOptions options) {
  StreamingDataflowWorkerOptions streamingOptions =
      options.as(StreamingDataflowWorkerOptions.class);
  return streamingOptions.isEnableStreamingEngine() ? Integer.MAX_VALUE : 1;
}
 
Example 12
Source File: FlinkRunnerTest.java    From beam with Apache License 2.0
@SuppressWarnings("unused")
public static NotExecutingFlinkRunner fromOptions(PipelineOptions options) {
  return new NotExecutingFlinkRunner(options.as(FlinkPipelineOptions.class));
}
 
Example 13
Source File: TestFlinkRunner.java    From beam with Apache License 2.0
public static TestFlinkRunner fromOptions(PipelineOptions options) {
  FlinkPipelineOptions flinkOptions = options.as(FlinkPipelineOptions.class);
  return new TestFlinkRunner(flinkOptions);
}
 
Example 14
Source File: AbstractParDoP.java    From beam with Apache License 2.0
private static Boolean isCooperativenessAllowed(
    SerializablePipelineOptions serializablePipelineOptions) {
  PipelineOptions pipelineOptions = serializablePipelineOptions.get();
  JetPipelineOptions jetPipelineOptions = pipelineOptions.as(JetPipelineOptions.class);
  return jetPipelineOptions.getJetProcessorsCooperative();
}
 
Example 15
Source File: DoFnOperator.java    From beam with Apache License 2.0
/** Constructor for DoFnOperator. */
public DoFnOperator(
    DoFn<InputT, OutputT> doFn,
    String stepName,
    Coder<WindowedValue<InputT>> inputWindowedCoder,
    Map<TupleTag<?>, Coder<?>> outputCoders,
    TupleTag<OutputT> mainOutputTag,
    List<TupleTag<?>> additionalOutputTags,
    OutputManagerFactory<OutputT> outputManagerFactory,
    WindowingStrategy<?, ?> windowingStrategy,
    Map<Integer, PCollectionView<?>> sideInputTagMapping,
    Collection<PCollectionView<?>> sideInputs,
    PipelineOptions options,
    Coder<?> keyCoder,
    KeySelector<WindowedValue<InputT>, ?> keySelector,
    DoFnSchemaInformation doFnSchemaInformation,
    Map<String, PCollectionView<?>> sideInputMapping) {
  this.doFn = doFn;
  this.stepName = stepName;
  this.windowedInputCoder = inputWindowedCoder;
  this.outputCoders = outputCoders;
  this.mainOutputTag = mainOutputTag;
  this.additionalOutputTags = additionalOutputTags;
  this.sideInputTagMapping = sideInputTagMapping;
  this.sideInputs = sideInputs;
  this.serializedOptions = new SerializablePipelineOptions(options);
  this.windowingStrategy = windowingStrategy;
  this.outputManagerFactory = outputManagerFactory;

  setChainingStrategy(ChainingStrategy.ALWAYS);

  this.keyCoder = keyCoder;
  this.keySelector = keySelector;

  this.timerCoder =
      TimerInternals.TimerDataCoderV2.of(windowingStrategy.getWindowFn().windowCoder());

  FlinkPipelineOptions flinkOptions = options.as(FlinkPipelineOptions.class);

  this.maxBundleSize = flinkOptions.getMaxBundleSize();
  Preconditions.checkArgument(maxBundleSize > 0, "Bundle size must be at least 1");
  this.maxBundleTimeMills = flinkOptions.getMaxBundleTimeMills();
  Preconditions.checkArgument(maxBundleTimeMills > 0, "Bundle time must be at least 1");
  this.doFnSchemaInformation = doFnSchemaInformation;
  this.sideInputMapping = sideInputMapping;

  this.requiresStableInput =
      // WindowDoFnOperator does not use a DoFn
      doFn != null
          && DoFnSignatures.getSignature(doFn.getClass()).processElement().requiresStableInput();

  if (requiresStableInput) {
    Preconditions.checkState(
        CheckpointingMode.valueOf(flinkOptions.getCheckpointingMode())
            == CheckpointingMode.EXACTLY_ONCE,
        "Checkpointing mode is not set to exactly once but @RequiresStableInput is used.");
    Preconditions.checkState(
        flinkOptions.getCheckpointingInterval() > 0,
        "No checkpointing configured but pipeline uses @RequiresStableInput");
    LOG.warn(
        "Enabling stable input for transform {}. Will only process elements at most every {} milliseconds.",
        stepName,
        flinkOptions.getCheckpointingInterval()
            + Math.max(0, flinkOptions.getMinPauseBetweenCheckpoints()));
  }

  this.finishBundleBeforeCheckpointing = flinkOptions.getFinishBundleBeforeCheckpointing();
}
 
Example 16
Source File: DataflowRunner.java    From beam with Apache License 2.0
StreamingShardedWriteFactory(PipelineOptions options) {
  this.options = options.as(DataflowPipelineWorkerPoolOptions.class);
}
 
Example 17
Source File: BigQueryStorageSourceBase.java    From beam with Apache License 2.0
@Override
public List<BigQueryStorageStreamSource<T>> split(
    long desiredBundleSizeBytes, PipelineOptions options) throws Exception {
  BigQueryOptions bqOptions = options.as(BigQueryOptions.class);
  Table targetTable = getTargetTable(bqOptions);
  int streamCount = 0;
  if (desiredBundleSizeBytes > 0) {
    long tableSizeBytes = (targetTable != null) ? targetTable.getNumBytes() : 0;
    streamCount = (int) Math.min(tableSizeBytes / desiredBundleSizeBytes, MAX_SPLIT_COUNT);
  }

  streamCount = Math.max(streamCount, MIN_SPLIT_COUNT);

  CreateReadSessionRequest.Builder requestBuilder =
      CreateReadSessionRequest.newBuilder()
          .setParent("projects/" + bqOptions.getProject())
          .setTableReference(BigQueryHelpers.toTableRefProto(targetTable.getTableReference()))
          .setRequestedStreams(streamCount)
          .setShardingStrategy(ShardingStrategy.BALANCED);

  if (selectedFieldsProvider != null || rowRestrictionProvider != null) {
    TableReadOptions.Builder builder = TableReadOptions.newBuilder();
    if (selectedFieldsProvider != null) {
      builder.addAllSelectedFields(selectedFieldsProvider.get());
    }
    if (rowRestrictionProvider != null) {
      builder.setRowRestriction(rowRestrictionProvider.get());
    }
    requestBuilder.setReadOptions(builder);
  } else if (tableReadOptions != null) {
    requestBuilder.setReadOptions(tableReadOptions);
  }

  ReadSession readSession;
  try (StorageClient client = bqServices.getStorageClient(bqOptions)) {
    CreateReadSessionRequest request = requestBuilder.build();
    readSession = client.createReadSession(request);
    LOG.info(
        "Sent BigQuery Storage API CreateReadSession request '{}'; received response '{}'.",
        request,
        readSession);
  }

  if (readSession.getStreamsList().isEmpty()) {
    // The underlying table is empty or all rows have been pruned.
    return ImmutableList.of();
  }

  Schema sessionSchema = new Schema.Parser().parse(readSession.getAvroSchema().getSchema());
  TableSchema trimmedSchema =
      BigQueryAvroUtils.trimBigQueryTableSchema(targetTable.getSchema(), sessionSchema);
  List<BigQueryStorageStreamSource<T>> sources = Lists.newArrayList();
  for (Stream stream : readSession.getStreamsList()) {
    sources.add(
        BigQueryStorageStreamSource.create(
            readSession, stream, trimmedSchema, parseFn, outputCoder, bqServices));
  }

  return ImmutableList.copyOf(sources);
}
 
Example 18
Source File: BigQueryIOReadTest.java    From beam with Apache License 2.0
@Test
public void testBigQueryTableSourceInitSplit() throws Exception {
  List<TableRow> expected =
      ImmutableList.of(
          new TableRow().set("name", "a").set("number", 1L),
          new TableRow().set("name", "b").set("number", 2L),
          new TableRow().set("name", "c").set("number", 3L),
          new TableRow().set("name", "d").set("number", 4L),
          new TableRow().set("name", "e").set("number", 5L),
          new TableRow().set("name", "f").set("number", 6L));

  TableReference table = BigQueryHelpers.parseTableSpec("project:data_set.table_name");
  fakeDatasetService.createDataset("project", "data_set", "", "", null);
  fakeDatasetService.createTable(
      new Table()
          .setTableReference(table)
          .setSchema(
              new TableSchema()
                  .setFields(
                      ImmutableList.of(
                          new TableFieldSchema().setName("name").setType("STRING"),
                          new TableFieldSchema().setName("number").setType("INTEGER")))));
  fakeDatasetService.insertAll(table, expected, null);

  String stepUuid = "testStepUuid";
  BoundedSource<TableRow> bqSource =
      BigQueryTableSourceDef.create(fakeBqServices, ValueProvider.StaticValueProvider.of(table))
          .toSource(stepUuid, TableRowJsonCoder.of(), BigQueryIO.TableRowParser.INSTANCE);

  PipelineOptions options = PipelineOptionsFactory.create();
  options.setTempLocation(testFolder.getRoot().getAbsolutePath());
  BigQueryOptions bqOptions = options.as(BigQueryOptions.class);
  bqOptions.setProject("project");

  List<TableRow> read =
      convertStringsToLong(
          SourceTestUtils.readFromSplitsOfSource(bqSource, 0L /* ignored */, options));
  assertThat(read, containsInAnyOrder(Iterables.toArray(expected, TableRow.class)));

  List<? extends BoundedSource<TableRow>> sources = bqSource.split(100, options);
  assertEquals(2, sources.size());
  // Simulate a repeated call to split(), like a Dataflow worker will sometimes do.
  sources = bqSource.split(200, options);
  assertEquals(2, sources.size());

  // A repeated call to split() should not have caused a duplicate extract job.
  assertEquals(1, fakeJobService.getNumExtractJobCalls());
}
 
Example 19
Source File: KryoCoderProvider.java    From beam with Apache License 2.0
/**
 * Create a new {@link KryoCoderProvider}.
 *
 * @param pipelineOptions Options used for coder setup. See {@link KryoOptions} for more details.
 * @param registrars {@link KryoRegistrar}s which are used to register classes with the underlying
 *     kryo instance
 * @return A newly created {@link KryoCoderProvider}
 */
public static KryoCoderProvider of(
    PipelineOptions pipelineOptions, List<KryoRegistrar> registrars) {
  final KryoOptions kryoOptions = pipelineOptions.as(KryoOptions.class);
  return new KryoCoderProvider(KryoCoder.of(kryoOptions, registrars));
}
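A usage sketch for the factory method above, assuming registrars is an already-built List<KryoRegistrar>; since KryoCoderProvider is a CoderProvider, it can be attached through the pipeline's CoderRegistry:

// registrars is assumed to be a pre-built List<KryoRegistrar>.
PipelineOptions options = PipelineOptionsFactory.create();
Pipeline pipeline = Pipeline.create(options);
KryoCoderProvider provider = KryoCoderProvider.of(options, registrars);
pipeline.getCoderRegistry().registerCoderProvider(provider);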