Java Code Examples for org.apache.beam.sdk.options.PipelineOptions#as()

The following examples show how to use org.apache.beam.sdk.options.PipelineOptions#as(). Each example is taken from an open-source project; the source file, project, and license are noted above the code.
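PipelineOptions#as() re-interprets a PipelineOptions instance as any registered sub-interface; both views are backed by the same underlying option values. Below is a minimal sketch of the pattern, using a hypothetical MyOptions interface (the option name and default are illustrative and not taken from any project below):

import org.apache.beam.sdk.options.Default;
import org.apache.beam.sdk.options.Description;
import org.apache.beam.sdk.options.PipelineOptions;
import org.apache.beam.sdk.options.PipelineOptionsFactory;

public class AsExample {

  // A hypothetical options sub-interface; each getter/setter pair defines an option.
  public interface MyOptions extends PipelineOptions {

    @Description("A hypothetical example option.")
    @Default.String("hello")
    String getGreeting();

    void setGreeting(String greeting);
  }

  public static void main(String[] args) {
    PipelineOptions options = PipelineOptionsFactory.fromArgs(args).create();
    // as() returns a view over the same underlying options, so a value set
    // through one view is visible through every other view.
    MyOptions myOptions = options.as(MyOptions.class);
    myOptions.setGreeting("hi");
  }
}

Registering the interface with PipelineOptionsFactory.register(MyOptions.class) additionally lets its options be recognized and validated when parsing command-line arguments.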
Example 1
Source File: JdbcExportArgsFactory.java    From dbeam with Apache License 2.0
public static JdbcExportArgs fromPipelineOptions(final PipelineOptions options)
    throws ClassNotFoundException, IOException {
  final JdbcExportPipelineOptions exportOptions = options.as(JdbcExportPipelineOptions.class);
  final JdbcAvroArgs jdbcAvroArgs =
      JdbcAvroArgs.create(
          JdbcConnectionArgs.create(exportOptions.getConnectionUrl())
              .withUsername(exportOptions.getUsername())
              .withPassword(PasswordReader.INSTANCE.readPassword(exportOptions).orElse(null)),
          exportOptions.getFetchSize(),
          exportOptions.getAvroCodec(),
          Optional.ofNullable(exportOptions.getPreCommand()).orElse(Collections.emptyList()));

  return JdbcExportArgs.create(
      jdbcAvroArgs,
      createQueryArgs(exportOptions),
      exportOptions.getAvroSchemaNamespace(),
      Optional.ofNullable(exportOptions.getAvroDoc()),
      exportOptions.isUseAvroLogicalTypes(),
      Duration.parse(exportOptions.getExportTimeout()),
      BeamJdbcAvroSchema.parseOptionalInputAvroSchemaFile(exportOptions.getAvroSchemaFilePath()));
}
 
Example 2
Source File: BigQueryOutputRuntime.java    From components with Apache License 2.0
@Override
public ValidationResult initialize(RuntimeContainer container, BigQueryOutputProperties properties) {
    this.properties = properties;
    this.dataset = properties.getDatasetProperties();
    this.datastore = dataset.getDatastoreProperties();

    Object pipelineOptionsObj = container.getGlobalData(BeamJobRuntimeContainer.PIPELINE_OPTIONS);
    if (pipelineOptionsObj != null) {
        PipelineOptions pipelineOptions = (PipelineOptions) pipelineOptionsObj;
        GcpServiceAccountOptions gcpOptions = pipelineOptions.as(GcpServiceAccountOptions.class);
        if (!"DataflowRunner".equals(gcpOptions.getRunner().getSimpleName())) {
            // when using the Dataflow runner, these properties have already been set at the pipeline level
            gcpOptions.setProject(datastore.projectName.getValue());
            gcpOptions.setTempLocation(datastore.tempGsFolder.getValue());
            gcpOptions.setCredentialFactoryClass(ServiceAccountCredentialFactory.class);
            gcpOptions.setServiceAccountFile(datastore.serviceAccountFile.getValue());
            gcpOptions.setGcpCredential(BigQueryConnection.createCredentials(datastore));
        }
    }

    return ValidationResult.OK;
}
 
Example 3
Source File: PubSubInputRuntime.java    From components with Apache License 2.0
@Override
public ValidationResult initialize(RuntimeContainer container, PubSubInputProperties properties) {
    this.properties = properties;
    this.dataset = properties.getDatasetProperties();
    this.datastore = dataset.getDatastoreProperties();

    if (container != null) {
        Object pipelineOptionsObj = container.getGlobalData(BeamJobRuntimeContainer.PIPELINE_OPTIONS);
        if (pipelineOptionsObj != null) {
            PipelineOptions pipelineOptions = (PipelineOptions) pipelineOptionsObj;
            GcpServiceAccountOptions gcpOptions = pipelineOptions.as(GcpServiceAccountOptions.class);
            runOnDataflow = "DataflowRunner".equals(gcpOptions.getRunner().getSimpleName());
            if (!runOnDataflow) {
                gcpOptions.setProject(datastore.projectName.getValue());
                if (datastore.serviceAccountFile.getValue() != null) {
                    gcpOptions.setCredentialFactoryClass(ServiceAccountCredentialFactory.class);
                    gcpOptions.setServiceAccountFile(datastore.serviceAccountFile.getValue());
                    gcpOptions.setGcpCredential(PubSubConnection.createCredentials(datastore));
                }
            }
        }
    }
    return ValidationResult.OK;
}
 
Example 4
Source File: BigQueryIO.java    From beam with Apache License 2.0
@Override
public void validate(PipelineOptions pipelineOptions) {
  BigQueryOptions options = pipelineOptions.as(BigQueryOptions.class);

  // The user specified a table.
  if (getJsonTableRef() != null && getJsonTableRef().isAccessible() && getValidate()) {
    TableReference table = getTableWithDefaultProject(options).get();
    DatasetService datasetService = getBigQueryServices().getDatasetService(options);
    // Check for destination table presence and emptiness for early failure notification.
    // Note that a presence check can fail when the table or dataset is created by an earlier
    // stage of the pipeline. For these cases the #withoutValidation method can be used to
    // disable the check.
    BigQueryHelpers.verifyDatasetPresence(datasetService, table);
    if (getCreateDisposition() == BigQueryIO.Write.CreateDisposition.CREATE_NEVER) {
      BigQueryHelpers.verifyTablePresence(datasetService, table);
    }
    if (getWriteDisposition() == BigQueryIO.Write.WriteDisposition.WRITE_EMPTY) {
      BigQueryHelpers.verifyTableNotExistOrEmpty(datasetService, table);
    }
  }
}
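As the comment above notes, the presence checks can fail when the table or dataset is created by an earlier stage of the pipeline, in which case validation can be disabled with withoutValidation(). A minimal opt-out sketch, assuming an existing PCollection<TableRow> named rows and a placeholder table spec:

// rows is assumed to be a PCollection<TableRow>; the table spec is a placeholder.
rows.apply(
    "WriteToBigQuery",
    BigQueryIO.writeTableRows()
        .to("my-project:my_dataset.my_table")
        .withoutValidation());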
 
Example 5
Source File: AbstractOperatorTest.java    From beam with Apache License 2.0
/**
 * Run all tests with the given runner.
 *
 * @param tc the test case to execute
 */
@SuppressWarnings("unchecked")
public <T> void execute(TestCase<T> tc) {

  final SingleJvmAccumulatorProvider.Factory accumulatorProvider =
      SingleJvmAccumulatorProvider.Factory.get();
  final PipelineOptions pipelineOptions = PipelineOptionsFactory.create();
  final EuphoriaOptions euphoriaOptions = pipelineOptions.as(EuphoriaOptions.class);
  euphoriaOptions.setAccumulatorProviderFactory(accumulatorProvider);
  final Pipeline pipeline = TestPipeline.create(pipelineOptions);
  pipeline.getCoderRegistry().registerCoderForClass(Object.class, KryoCoder.of(pipelineOptions));
  final PCollection<T> output = tc.getOutput(pipeline);
  tc.validate(output);
  pipeline.run().waitUntilFinish();
  tc.validateAccumulators(accumulatorProvider);
}
 
Example 6
Source File: BigQueryIOReadTest.java    From beam with Apache License 2.0
@Test
public void testBigQueryQuerySourceEstimatedSize() throws Exception {

  String queryString = "fake query string";

  PipelineOptions options = PipelineOptionsFactory.create();
  BigQueryOptions bqOptions = options.as(BigQueryOptions.class);
  bqOptions.setProject("project");
  String stepUuid = "testStepUuid";

  BigQuerySourceBase<TableRow> bqSource =
      BigQueryQuerySourceDef.create(
              fakeBqServices,
              ValueProvider.StaticValueProvider.of(queryString),
              true, /* flattenResults */
              true, /* useLegacySql */
              QueryPriority.BATCH,
              null,
              null,
              null)
          .toSource(stepUuid, TableRowJsonCoder.of(), BigQueryIO.TableRowParser.INSTANCE);

  fakeJobService.expectDryRunQuery(
      bqOptions.getProject(),
      queryString,
      new JobStatistics().setQuery(new JobStatistics2().setTotalBytesProcessed(100L)));

  assertEquals(100, bqSource.getEstimatedSizeBytes(bqOptions));
}
 
Example 7
Source File: UnboundedSourceWrapper.java    From beam with Apache License 2.0
@SuppressWarnings("unchecked")
public UnboundedSourceWrapper(
    String stepName,
    PipelineOptions pipelineOptions,
    UnboundedSource<OutputT, CheckpointMarkT> source,
    int parallelism)
    throws Exception {
  this.stepName = stepName;
  this.serializedOptions = new SerializablePipelineOptions(pipelineOptions);
  this.isConvertedBoundedSource =
      source instanceof UnboundedReadFromBoundedSource.BoundedToUnboundedSourceAdapter;

  if (source.requiresDeduping()) {
    LOG.warn("Source {} requires deduping but Flink runner doesn't support this yet.", source);
  }

  Coder<CheckpointMarkT> checkpointMarkCoder = source.getCheckpointMarkCoder();
  if (checkpointMarkCoder == null) {
    LOG.info("No CheckpointMarkCoder specified for this source. Won't create snapshots.");
    checkpointCoder = null;
  } else {

    Coder<? extends UnboundedSource<OutputT, CheckpointMarkT>> sourceCoder =
        (Coder) SerializableCoder.of(new TypeDescriptor<UnboundedSource>() {});

    checkpointCoder = KvCoder.of(sourceCoder, checkpointMarkCoder);
  }

  // Get the splits early. We assume that the generated splits are stable;
  // this is necessary so that the mapping of state to source is correct
  // when restoring.
  splitSources = source.split(parallelism, pipelineOptions);

  FlinkPipelineOptions options = pipelineOptions.as(FlinkPipelineOptions.class);
  idleTimeoutMs = options.getShutdownSourcesAfterIdleMs();
}
 
Example 8
Source File: DataflowPipelineDebugOptions.java    From beam with Apache License 2.0
@Override
public Stager create(PipelineOptions options) {
  DataflowPipelineDebugOptions debugOptions = options.as(DataflowPipelineDebugOptions.class);
  return InstanceBuilder.ofType(Stager.class)
      .fromClass(debugOptions.getStagerClass())
      .fromFactoryMethod("fromOptions")
      .withArg(PipelineOptions.class, options)
      .build();
}
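The create(PipelineOptions) method above implements Beam's DefaultValueFactory contract, which lets an option compute its default lazily from the other options on the same instance. A minimal sketch of that wiring, with a hypothetical debugLabel option (the option and factory names are illustrative):

import org.apache.beam.sdk.options.Default;
import org.apache.beam.sdk.options.DefaultValueFactory;
import org.apache.beam.sdk.options.PipelineOptions;

public interface MyDebugOptions extends PipelineOptions {

  // The factory runs only when the user has not set the option explicitly.
  @Default.InstanceFactory(LabelFactory.class)
  String getDebugLabel();

  void setDebugLabel(String value);

  class LabelFactory implements DefaultValueFactory<String> {
    @Override
    public String create(PipelineOptions options) {
      // Derive the default from another option on the same instance.
      return "job-" + options.getJobName();
    }
  }
}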
 
Example 9
Source File: BigQueryInputRuntime.java    From components with Apache License 2.0
@Override
public ValidationResult initialize(RuntimeContainer container, BigQueryInputProperties properties) {
    this.properties = properties;
    this.dataset = properties.getDatasetProperties();
    this.datastore = dataset.getDatastoreProperties();

    // Data returned by BigQueryIO does not contain its own schema, so it has to be retrieved before the
    // read and write operations
    Schema schema = properties.getDatasetProperties().main.schema.getValue();
    if (schema == null || AvroUtils.isSchemaEmpty(schema) || AvroUtils.isIncludeAllFields(schema)) {
        BigQueryDatasetRuntime schemaFetcher = new BigQueryDatasetRuntime();
        schemaFetcher.initialize(container, properties.getDatasetProperties());
        schema = schemaFetcher.getSchema();
    }

    Object pipelineOptionsObj = container.getGlobalData(BeamJobRuntimeContainer.PIPELINE_OPTIONS);
    if (pipelineOptionsObj != null) {
        PipelineOptions pipelineOptions = (PipelineOptions) pipelineOptionsObj;
        GcpServiceAccountOptions gcpOptions = pipelineOptions.as(GcpServiceAccountOptions.class);
        if (!"DataflowRunner".equals(gcpOptions.getRunner().getSimpleName())) {
            // when using the Dataflow runner, these properties have already been set at the pipeline level
            gcpOptions.setProject(datastore.projectName.getValue());
            gcpOptions.setTempLocation(datastore.tempGsFolder.getValue());
            gcpOptions.setCredentialFactoryClass(ServiceAccountCredentialFactory.class);
            gcpOptions.setServiceAccountFile(datastore.serviceAccountFile.getValue());
            gcpOptions.setGcpCredential(BigQueryConnection.createCredentials(datastore));
        }
    }

    this.defaultOutputCoder = AvroCoder.of(schema);

    return ValidationResult.OK;
}
 
Example 10
Source File: ReaderInvocationUtil.java    From beam with Apache License 2.0
public ReaderInvocationUtil(
    String stepName, PipelineOptions options, FlinkMetricContainer container) {
  FlinkPipelineOptions flinkPipelineOptions = options.as(FlinkPipelineOptions.class);
  this.stepName = stepName;
  this.enableMetrics = !flinkPipelineOptions.getDisableMetrics();
  this.container = container;
}
 
Example 11
Source File: StreamingDataflowWorkerOptions.java    From beam with Apache License 2.0
@Override
public Integer create(PipelineOptions options) {
  StreamingDataflowWorkerOptions streamingOptions =
      options.as(StreamingDataflowWorkerOptions.class);
  return streamingOptions.isEnableStreamingEngine() ? Integer.MAX_VALUE : 1;
}
 
Example 12
Source File: FlinkRunnerTest.java    From beam with Apache License 2.0
@SuppressWarnings("unused")
public static NotExecutingFlinkRunner fromOptions(PipelineOptions options) {
  return new NotExecutingFlinkRunner(options.as(FlinkPipelineOptions.class));
}
 
Example 13
Source File: TestFlinkRunner.java    From beam with Apache License 2.0
public static TestFlinkRunner fromOptions(PipelineOptions options) {
  FlinkPipelineOptions flinkOptions = options.as(FlinkPipelineOptions.class);
  return new TestFlinkRunner(flinkOptions);
}
 
Example 14
Source File: AbstractParDoP.java    From beam with Apache License 2.0
private static Boolean isCooperativenessAllowed(
    SerializablePipelineOptions serializablePipelineOptions) {
  PipelineOptions pipelineOptions = serializablePipelineOptions.get();
  JetPipelineOptions jetPipelineOptions = pipelineOptions.as(JetPipelineOptions.class);
  return jetPipelineOptions.getJetProcessorsCooperative();
}
 
Example 15
Source File: DoFnOperator.java    From beam with Apache License 2.0
/** Constructor for DoFnOperator. */
public DoFnOperator(
    DoFn<InputT, OutputT> doFn,
    String stepName,
    Coder<WindowedValue<InputT>> inputWindowedCoder,
    Map<TupleTag<?>, Coder<?>> outputCoders,
    TupleTag<OutputT> mainOutputTag,
    List<TupleTag<?>> additionalOutputTags,
    OutputManagerFactory<OutputT> outputManagerFactory,
    WindowingStrategy<?, ?> windowingStrategy,
    Map<Integer, PCollectionView<?>> sideInputTagMapping,
    Collection<PCollectionView<?>> sideInputs,
    PipelineOptions options,
    Coder<?> keyCoder,
    KeySelector<WindowedValue<InputT>, ?> keySelector,
    DoFnSchemaInformation doFnSchemaInformation,
    Map<String, PCollectionView<?>> sideInputMapping) {
  this.doFn = doFn;
  this.stepName = stepName;
  this.windowedInputCoder = inputWindowedCoder;
  this.outputCoders = outputCoders;
  this.mainOutputTag = mainOutputTag;
  this.additionalOutputTags = additionalOutputTags;
  this.sideInputTagMapping = sideInputTagMapping;
  this.sideInputs = sideInputs;
  this.serializedOptions = new SerializablePipelineOptions(options);
  this.windowingStrategy = windowingStrategy;
  this.outputManagerFactory = outputManagerFactory;

  setChainingStrategy(ChainingStrategy.ALWAYS);

  this.keyCoder = keyCoder;
  this.keySelector = keySelector;

  this.timerCoder =
      TimerInternals.TimerDataCoderV2.of(windowingStrategy.getWindowFn().windowCoder());

  FlinkPipelineOptions flinkOptions = options.as(FlinkPipelineOptions.class);

  this.maxBundleSize = flinkOptions.getMaxBundleSize();
  Preconditions.checkArgument(maxBundleSize > 0, "Bundle size must be at least 1");
  this.maxBundleTimeMills = flinkOptions.getMaxBundleTimeMills();
  Preconditions.checkArgument(maxBundleTimeMills > 0, "Bundle time must be at least 1");
  this.doFnSchemaInformation = doFnSchemaInformation;
  this.sideInputMapping = sideInputMapping;

  this.requiresStableInput =
      // WindowDoFnOperator does not use a DoFn
      doFn != null
          && DoFnSignatures.getSignature(doFn.getClass()).processElement().requiresStableInput();

  if (requiresStableInput) {
    Preconditions.checkState(
        CheckpointingMode.valueOf(flinkOptions.getCheckpointingMode())
            == CheckpointingMode.EXACTLY_ONCE,
        "Checkpointing mode is not set to exactly once but @RequiresStableInput is used.");
    Preconditions.checkState(
        flinkOptions.getCheckpointingInterval() > 0,
        "No checkpointing configured but pipeline uses @RequiresStableInput");
    LOG.warn(
        "Enabling stable input for transform {}. Will only process elements at most every {} milliseconds.",
        stepName,
        flinkOptions.getCheckpointingInterval()
            + Math.max(0, flinkOptions.getMinPauseBetweenCheckpoints()));
  }

  this.finishBundleBeforeCheckpointing = flinkOptions.getFinishBundleBeforeCheckpointing();
}
 
Example 16
Source File: DataflowRunner.java    From beam with Apache License 2.0
StreamingShardedWriteFactory(PipelineOptions options) {
  this.options = options.as(DataflowPipelineWorkerPoolOptions.class);
}
 
Example 17
Source File: BigQueryStorageSourceBase.java    From beam with Apache License 2.0
@Override
public List<BigQueryStorageStreamSource<T>> split(
    long desiredBundleSizeBytes, PipelineOptions options) throws Exception {
  BigQueryOptions bqOptions = options.as(BigQueryOptions.class);
  Table targetTable = getTargetTable(bqOptions);
  int streamCount = 0;
  if (desiredBundleSizeBytes > 0) {
    long tableSizeBytes = (targetTable != null) ? targetTable.getNumBytes() : 0;
    streamCount = (int) Math.min(tableSizeBytes / desiredBundleSizeBytes, MAX_SPLIT_COUNT);
  }

  streamCount = Math.max(streamCount, MIN_SPLIT_COUNT);

  CreateReadSessionRequest.Builder requestBuilder =
      CreateReadSessionRequest.newBuilder()
          .setParent("projects/" + bqOptions.getProject())
          .setTableReference(BigQueryHelpers.toTableRefProto(targetTable.getTableReference()))
          .setRequestedStreams(streamCount)
          .setShardingStrategy(ShardingStrategy.BALANCED);

  if (selectedFieldsProvider != null || rowRestrictionProvider != null) {
    TableReadOptions.Builder builder = TableReadOptions.newBuilder();
    if (selectedFieldsProvider != null) {
      builder.addAllSelectedFields(selectedFieldsProvider.get());
    }
    if (rowRestrictionProvider != null) {
      builder.setRowRestriction(rowRestrictionProvider.get());
    }
    requestBuilder.setReadOptions(builder);
  } else if (tableReadOptions != null) {
    requestBuilder.setReadOptions(tableReadOptions);
  }

  ReadSession readSession;
  try (StorageClient client = bqServices.getStorageClient(bqOptions)) {
    CreateReadSessionRequest request = requestBuilder.build();
    readSession = client.createReadSession(request);
    LOG.info(
        "Sent BigQuery Storage API CreateReadSession request '{}'; received response '{}'.",
        request,
        readSession);
  }

  if (readSession.getStreamsList().isEmpty()) {
    // The underlying table is empty or all rows have been pruned.
    return ImmutableList.of();
  }

  Schema sessionSchema = new Schema.Parser().parse(readSession.getAvroSchema().getSchema());
  TableSchema trimmedSchema =
      BigQueryAvroUtils.trimBigQueryTableSchema(targetTable.getSchema(), sessionSchema);
  List<BigQueryStorageStreamSource<T>> sources = Lists.newArrayList();
  for (Stream stream : readSession.getStreamsList()) {
    sources.add(
        BigQueryStorageStreamSource.create(
            readSession, stream, trimmedSchema, parseFn, outputCoder, bqServices));
  }

  return ImmutableList.copyOf(sources);
}
 
Example 18
Source File: BigQueryIOReadTest.java    From beam with Apache License 2.0
@Test
public void testBigQueryTableSourceInitSplit() throws Exception {
  List<TableRow> expected =
      ImmutableList.of(
          new TableRow().set("name", "a").set("number", 1L),
          new TableRow().set("name", "b").set("number", 2L),
          new TableRow().set("name", "c").set("number", 3L),
          new TableRow().set("name", "d").set("number", 4L),
          new TableRow().set("name", "e").set("number", 5L),
          new TableRow().set("name", "f").set("number", 6L));

  TableReference table = BigQueryHelpers.parseTableSpec("project:data_set.table_name");
  fakeDatasetService.createDataset("project", "data_set", "", "", null);
  fakeDatasetService.createTable(
      new Table()
          .setTableReference(table)
          .setSchema(
              new TableSchema()
                  .setFields(
                      ImmutableList.of(
                          new TableFieldSchema().setName("name").setType("STRING"),
                          new TableFieldSchema().setName("number").setType("INTEGER")))));
  fakeDatasetService.insertAll(table, expected, null);

  String stepUuid = "testStepUuid";
  BoundedSource<TableRow> bqSource =
      BigQueryTableSourceDef.create(fakeBqServices, ValueProvider.StaticValueProvider.of(table))
          .toSource(stepUuid, TableRowJsonCoder.of(), BigQueryIO.TableRowParser.INSTANCE);

  PipelineOptions options = PipelineOptionsFactory.create();
  options.setTempLocation(testFolder.getRoot().getAbsolutePath());
  BigQueryOptions bqOptions = options.as(BigQueryOptions.class);
  bqOptions.setProject("project");

  List<TableRow> read =
      convertStringsToLong(
          SourceTestUtils.readFromSplitsOfSource(bqSource, 0L /* ignored */, options));
  assertThat(read, containsInAnyOrder(Iterables.toArray(expected, TableRow.class)));

  List<? extends BoundedSource<TableRow>> sources = bqSource.split(100, options);
  assertEquals(2, sources.size());
  // Simulate a repeated call to split(), like a Dataflow worker will sometimes do.
  sources = bqSource.split(200, options);
  assertEquals(2, sources.size());

  // A repeated call to split() should not have caused a duplicate extract job.
  assertEquals(1, fakeJobService.getNumExtractJobCalls());
}
 
Example 19
Source File: KryoCoderProvider.java    From beam with Apache License 2.0
/**
 * Create a new {@link KryoCoderProvider}.
 *
 * @param pipelineOptions Options used for coder setup. See {@link KryoOptions} for more details.
 * @param registrars {@link KryoRegistrar}s which are used to register classes with the underlying
 *     kryo instance
 * @return A newly created {@link KryoCoderProvider}
 */
public static KryoCoderProvider of(
    PipelineOptions pipelineOptions, List<KryoRegistrar> registrars) {
  final KryoOptions kryoOptions = pipelineOptions.as(KryoOptions.class);
  return new KryoCoderProvider(KryoCoder.of(kryoOptions, registrars));
}
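A usage sketch for the factory method above, assuming registrars is an already-built List<KryoRegistrar>; since KryoCoderProvider is a CoderProvider, it can be attached through the pipeline's CoderRegistry:

// registrars is assumed to be a pre-built List<KryoRegistrar>.
PipelineOptions options = PipelineOptionsFactory.create();
Pipeline pipeline = Pipeline.create(options);
KryoCoderProvider provider = KryoCoderProvider.of(options, registrars);
pipeline.getCoderRegistry().registerCoderProvider(provider);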