org.apache.beam.sdk.options.ValueProvider Java Examples

The following examples show how to use org.apache.beam.sdk.options.ValueProvider. Each example is taken from an open source project; its source file, project, and license are listed above the code.
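
Before diving in, a quick orientation: a ValueProvider<T> wraps a value that may only become available at pipeline execution time, which is what makes templated pipelines (for example, Dataflow templates) possible. The sketch below shows the core API; the interface and option names (MyOptions, getInputFile) are illustrative assumptions, not taken from the examples on this page.

import org.apache.beam.sdk.options.Description;
import org.apache.beam.sdk.options.PipelineOptions;
import org.apache.beam.sdk.options.ValueProvider;
import org.apache.beam.sdk.options.ValueProvider.NestedValueProvider;
import org.apache.beam.sdk.options.ValueProvider.StaticValueProvider;

public interface MyOptions extends PipelineOptions {
  // Declared as a ValueProvider so the value can be supplied when the
  // pipeline is launched, not when the pipeline graph is constructed.
  @Description("Path of the file to read from")
  ValueProvider<String> getInputFile();

  void setInputFile(ValueProvider<String> value);
}

// A value known up front can be wrapped with StaticValueProvider:
ValueProvider<String> path = StaticValueProvider.of("gs://bucket/input.txt");

// A value derived from another provider is computed lazily on get():
ValueProvider<Integer> pathLength = NestedValueProvider.of(path, String::length);

The general contract, visible throughout the examples below: constructors and builders store the provider itself, and get() is called only from code that runs at execution time (a DoFn's @ProcessElement, a source's split(), and so on); isAccessible() reports whether the value can be read yet.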
Example #1
Source File: TextStreamingPipeline.java    From dlp-dataflow-deidentification with Apache License 2.0
public TextFileReader(
    String kmsKeyProjectName,
    ValueProvider<String> fileDecryptKeyRing,
    ValueProvider<String> fileDecryptKey,
    ValueProvider<Integer> batchSize,
    ValueProvider<String> cSek,
    ValueProvider<String> cSekhash)
    throws IOException, GeneralSecurityException {
  this.batchSize = batchSize;
  this.kmsKeyProjectName = kmsKeyProjectName;
  this.fileDecryptKey = fileDecryptKey;
  this.fileDecryptKeyName = fileDecryptKeyRing;
  this.cSek = cSek;
  this.cSekhash = cSekhash;
  this.customerSuppliedKey = false;
  this.key = null;
}
 
Example #2
Source File: TextRowToMutation.java    From DataflowTemplates with Apache License 2.0
public TextRowToMutation(
    PCollectionView<Ddl> ddlView,
    PCollectionView<Map<String, List<TableManifest.Column>>> tableColumnsView,
    ValueProvider<Character> columnDelimiter,
    ValueProvider<Character> fieldQualifier,
    ValueProvider<Boolean> trailingDelimiter,
    ValueProvider<Character> escape,
    ValueProvider<String> nullString,
    ValueProvider<String> dateFormat,
    ValueProvider<String> timestampFormat) {
  this.ddlView = ddlView;
  this.tableColumnsView = tableColumnsView;
  this.columnDelimiter = columnDelimiter;
  this.fieldQualifier = fieldQualifier;
  this.trailingDelimiter = trailingDelimiter;
  this.escape = escape;
  this.nullString = nullString;
  this.dateFormat = dateFormat;
  this.timestampFormat = timestampFormat;
}
 
Example #3
Source File: CassandraIO.java    From beam with Apache License 2.0
/** Specify the local DC used by the load balancing policy. */
public Write<T> withLocalDc(String localDc) {
  checkArgument(
      localDc != null,
      "CassandraIO."
          + getMutationTypeName()
          + "().withLocalDc(localDc) called with null"
          + " localDc");
  return withLocalDc(ValueProvider.StaticValueProvider.of(localDc));
}
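
This overload is a convenience wrapper: it validates the concrete value, wraps it in ValueProvider.StaticValueProvider, and delegates to the ValueProvider variant, so the connector only needs one templated code path internally.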
 
Example #4
Source File: DLPTextToBigQueryStreaming.java    From dlp-dataflow-deidentification with Apache License 2.0
@Description(
    "DLP API has a limit for payload size of 524KB /api call. "
        + "That's why dataflow process will need to chunk it. User will have to decide "
        + "on how they would like to batch the request depending on number of rows "
        + "and how big each row is.")
@Required
ValueProvider<Integer> getBatchSize();
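
An option like this is typically consumed by storing the provider in a DoFn and calling get() only at execution time, so the batch size can be supplied as a template parameter at launch. A minimal sketch under that assumption (ChunkFn and its logic are illustrative, not from the template's source):

import org.apache.beam.sdk.options.ValueProvider;
import org.apache.beam.sdk.transforms.DoFn;

static class ChunkFn extends DoFn<String, String> {
  private final ValueProvider<Integer> batchSize;

  ChunkFn(ValueProvider<Integer> batchSize) {
    // Store the provider, not its value; the value may not exist yet.
    this.batchSize = batchSize;
  }

  @ProcessElement
  public void processElement(ProcessContext c) {
    // get() is safe here because @ProcessElement runs at execution time.
    int size = batchSize.get();
    String element = c.element();
    for (int start = 0; start < element.length(); start += size) {
      c.output(element.substring(start, Math.min(start + size, element.length())));
    }
  }
}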
 
Example #5
Source File: BigQueryToTFRecord.java    From DataflowTemplates with Apache License 2.0
/**
 * The {@link BigQueryToTFRecord#applyTrainTestValSplit} method transforms the PCollection by
 * randomly partitioning it into PCollections for each dataset.
 */
static PCollectionList<byte[]> applyTrainTestValSplit(PCollection<byte[]> input,
    ValueProvider<Float> trainingPercentage,
    ValueProvider<Float> testingPercentage,
    ValueProvider<Float> validationPercentage,
    Random rand) {
  return input
      .apply(Partition.of(
          3,
          (Partition.PartitionFn<byte[]>) (number, numPartitions) -> {
            Float train = trainingPercentage.get();
            Float test = testingPercentage.get();
            Float validation = validationPercentage.get();
            Double d = rand.nextDouble();
            if (train + test + validation != 1) {
              throw new RuntimeException(String.format("Train %.2f, Test %.2f, Validation"
                  + " %.2f percentages must add up to 100 percent", train, test, validation));
            }
            if (d < train) {
              return 0;
            } else if (d >= train && d < train + test) {
              return 1;
            } else {
              return 2;
            }
          }));
}
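
Note that the three percentage providers are dereferenced with get() inside the PartitionFn, i.e. at execution time, which is what lets the split ratios be supplied as runtime parameters; the sum-to-1 check therefore also runs lazily and fails the job on the first element if the fractions are inconsistent.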
 
Example #6
Source File: DisplayDataTest.java    From beam with Apache License 2.0
@Test
public void testInaccessibleValueProvider() {
  DisplayData data =
      DisplayData.from(
          new HasDisplayData() {
            @Override
            public void populateDisplayData(DisplayData.Builder builder) {
              builder.add(
                  DisplayData.item(
                      "foo",
                      new ValueProvider<String>() {
                        @Override
                        public boolean isAccessible() {
                          return false;
                        }

                        @Override
                        public String get() {
                          return "bar";
                        }

                        @Override
                        public String toString() {
                          return "toString";
                        }
                      }));
            }
          });

  assertThat(data.items(), hasSize(1));
  assertThat(data, hasDisplayItem("foo", "toString"));
}
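
What this test pins down: when isAccessible() returns false, DisplayData cannot call get() to obtain "bar", so it falls back to the provider's toString() representation, which is why the display item's value is asserted to be "toString".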
 
Example #7
Source File: SpannerConverters.java    From DataflowTemplates with Apache License 2.0
public static ExportTransform create(
    ValueProvider<String> table,
    SpannerConfig spannerConfig,
    ValueProvider<String> textWritePrefix) {
  return ExportTransform.builder()
      .table(table)
      .spannerConfig(spannerConfig)
      .textWritePrefix(textWritePrefix)
      .build();
}
 
Example #8
Source File: ExportTransform.java    From DataflowTemplates with Apache License 2.0
private SchemaBasedDynamicDestinations(
    PCollectionView<Map<String, SerializableSchemaSupplier>> avroSchemas,
    PCollectionView<String> uniqueIdView,
    ValueProvider<ResourceId> baseDir) {
  this.avroSchemas = avroSchemas;
  this.uniqueIdView = uniqueIdView;
  this.baseDir = baseDir;
}
 
Example #9
Source File: BatchLoads.java    From beam with Apache License 2.0
BatchLoads(
    WriteDisposition writeDisposition,
    CreateDisposition createDisposition,
    boolean singletonTable,
    DynamicDestinations<?, DestinationT> dynamicDestinations,
    Coder<DestinationT> destinationCoder,
    ValueProvider<String> customGcsTempLocation,
    @Nullable ValueProvider<String> loadJobProjectId,
    boolean ignoreUnknownValues,
    Coder<ElementT> elementCoder,
    RowWriterFactory<ElementT, DestinationT> rowWriterFactory,
    @Nullable String kmsKey,
    boolean clusteringEnabled,
    boolean useAvroLogicalTypes) {
  bigQueryServices = new BigQueryServicesImpl();
  this.writeDisposition = writeDisposition;
  this.createDisposition = createDisposition;
  this.singletonTable = singletonTable;
  this.dynamicDestinations = dynamicDestinations;
  this.destinationCoder = destinationCoder;
  this.maxNumWritersPerBundle = DEFAULT_MAX_NUM_WRITERS_PER_BUNDLE;
  this.maxFileSize = DEFAULT_MAX_FILE_SIZE;
  this.numFileShards = DEFAULT_NUM_FILE_SHARDS;
  this.maxFilesPerPartition = DEFAULT_MAX_FILES_PER_PARTITION;
  this.maxBytesPerPartition = DEFAULT_MAX_BYTES_PER_PARTITION;
  this.triggeringFrequency = null;
  this.customGcsTempLocation = customGcsTempLocation;
  this.loadJobProjectId = loadJobProjectId;
  this.ignoreUnknownValues = ignoreUnknownValues;
  this.useAvroLogicalTypes = useAvroLogicalTypes;
  this.elementCoder = elementCoder;
  this.kmsKey = kmsKey;
  this.rowWriterFactory = rowWriterFactory;
  this.clusteringEnabled = clusteringEnabled;
  schemaUpdateOptions = Collections.emptySet();
}
 
Example #10
Source File: DualInputNestedValueProvider.java    From DataflowTemplates with Apache License 2.0
public DualInputNestedValueProvider(
    ValueProvider<FirstT> valueX,
    ValueProvider<SecondT> valueY,
    SerializableFunction<TranslatorInput<FirstT, SecondT>, T> translator) {
  this.valueX = valueX;
  this.valueY = valueY;
  this.translator = translator;
}
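
A usage sketch for this helper, combining two runtime-provided values into one (host and port here are illustrative, and the TranslatorInput accessors getX()/getY() are assumed from the companion type in the same DataflowTemplates package):

ValueProvider<String> host = ValueProvider.StaticValueProvider.of("example.com");
ValueProvider<Integer> port = ValueProvider.StaticValueProvider.of(5432);

ValueProvider<String> hostAndPort =
    new DualInputNestedValueProvider<>(
        host,
        port,
        (SerializableFunction<TranslatorInput<String, Integer>, String>)
            input -> input.getX() + ":" + input.getY());

// Nothing is computed until hostAndPort.get() is called at execution time.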
 
Example #11
Source File: CSVStreamingPipelineTest.java    From dlp-dataflow-deidentification with Apache License 2.0
@Test
public void testNewTracker() {
  CSVContentProcessorDoFn csv =
      new CSVContentProcessorDoFn(ValueProvider.StaticValueProvider.of(2));
  OffsetRange off = new OffsetRange(2, 5);
  org.apache.beam.sdk.transforms.splittabledofn.OffsetRangeTracker offTrack = csv.newTracker(off);
  assertEquals(offTrack.currentRestriction(), off);
}
 
Example #12
Source File: BigQueryIOStorageReadTest.java    From beam with Apache License 2.0
@Test
public void testTableSourceInitialSplit_EmptyTable() throws Exception {
  fakeDatasetService.createDataset("foo.com:project", "dataset", "", "", null);
  TableReference tableRef = BigQueryHelpers.parseTableSpec("foo.com:project:dataset.table");

  Table table =
      new Table()
          .setTableReference(tableRef)
          .setNumBytes(1024L * 1024L)
          .setSchema(new TableSchema());

  fakeDatasetService.createTable(table);

  CreateReadSessionRequest expectedRequest =
      CreateReadSessionRequest.newBuilder()
          .setParent("projects/project-id")
          .setTableReference(BigQueryHelpers.toTableRefProto(tableRef))
          .setRequestedStreams(1024)
          .setShardingStrategy(ShardingStrategy.BALANCED)
          .build();

  ReadSession emptyReadSession = ReadSession.newBuilder().build();
  StorageClient fakeStorageClient = mock(StorageClient.class);
  when(fakeStorageClient.createReadSession(expectedRequest)).thenReturn(emptyReadSession);

  BigQueryStorageTableSource<TableRow> tableSource =
      BigQueryStorageTableSource.create(
          ValueProvider.StaticValueProvider.of(tableRef),
          null,
          null,
          null,
          new TableRowParser(),
          TableRowJsonCoder.of(),
          new FakeBigQueryServices()
              .withDatasetService(fakeDatasetService)
              .withStorageClient(fakeStorageClient));

  List<? extends BoundedSource<TableRow>> sources = tableSource.split(1024L, options);
  assertTrue(sources.isEmpty());
}
 
Example #13
Source File: FileBasedSink.java    From beam with Apache License 2.0
/**
 * Construct a {@link FileBasedSink} with the given temp directory, producing uncompressed files.
 */
@Experimental(Kind.FILESYSTEM)
public FileBasedSink(
    ValueProvider<ResourceId> tempDirectoryProvider,
    DynamicDestinations<?, DestinationT, OutputT> dynamicDestinations) {
  this(tempDirectoryProvider, dynamicDestinations, Compression.UNCOMPRESSED);
}
 
Example #14
Source File: CassandraIO.java    From beam with Apache License 2.0
/** Specify the Cassandra instance port number where to write data. */
public Write<T> withPort(int port) {
  checkArgument(
      port > 0,
      "CassandraIO."
          + getMutationTypeName()
          + "().withPort(port) called with invalid port "
          + "number (%s)",
      port);
  return withPort(ValueProvider.StaticValueProvider.of(port));
}
 
Example #15
Source File: SplunkIO.java    From beam with Apache License 2.0
CreateKeysFn(ValueProvider<Integer> specifiedParallelism) {
  this.specifiedParallelism = specifiedParallelism;
}
 
Example #16
Source File: CassandraIO.java    From beam with Apache License 2.0
/** Specify the local DC used for the load balancing. */
public Read<T> withLocalDc(ValueProvider<String> localDc) {
  return builder().setLocalDc(localDc).build();
}
 
Example #17
Source File: SinkOptions.java    From gcp-ingestion with Mozilla Public License 2.0
@Description("Output to write to (path to file or directory, Pubsub topic, etc.)")
@Validation.Required
ValueProvider<String> getOutput();
 
Example #18
Source File: DecoderOptions.java    From gcp-ingestion with Mozilla Public License 2.0
@Description("If set to true, assume that all private keys are encrypted with the associated"
    + " KMS resourceId. Otherwise ignore KMS and assume all private keys are stored in plaintext."
    + " This may be used for debugging.")
@Default.Boolean(true)
ValueProvider<Boolean> getAetKmsEnabled();
 
Example #19
Source File: DatastoreConverters.java    From DataflowTemplates with Apache License 2.0
@Description("Namespace of the Datastore entity")
ValueProvider<String> getDatastoreWriteNamespace();
 
Example #20
Source File: BigQueryIOStorageQueryTest.java    From beam with Apache License 2.0
private void doQuerySourceInitialSplit(
    long bundleSize, int requestedStreamCount, int expectedStreamCount) throws Exception {

  TableReference sourceTableRef = BigQueryHelpers.parseTableSpec("project:dataset.table");

  fakeDatasetService.createDataset(
      sourceTableRef.getProjectId(),
      sourceTableRef.getDatasetId(),
      "asia-northeast1",
      "Fake plastic tree^H^H^H^Htables",
      null);

  fakeDatasetService.createTable(
      new Table().setTableReference(sourceTableRef).setLocation("asia-northeast1"));

  Table queryResultTable =
      new Table()
          .setSchema(
              new TableSchema()
                  .setFields(
                      ImmutableList.of(
                          new TableFieldSchema().setName("name").setType("STRING"),
                          new TableFieldSchema().setName("number").setType("INTEGER"))))
          .setNumBytes(1024L * 1024L);

  String encodedQuery = FakeBigQueryServices.encodeQueryResult(queryResultTable);

  fakeJobService.expectDryRunQuery(
      options.getProject(),
      encodedQuery,
      new JobStatistics()
          .setQuery(
              new JobStatistics2()
                  .setTotalBytesProcessed(1024L * 1024L)
                  .setReferencedTables(ImmutableList.of(sourceTableRef))));

  String stepUuid = "testStepUuid";

  TableReference tempTableReference =
      createTempTableReference(
          options.getProject(),
          createJobIdToken(options.getJobName(), stepUuid),
          Optional.empty());

  CreateReadSessionRequest expectedRequest =
      CreateReadSessionRequest.newBuilder()
          .setParent("projects/" + options.getProject())
          .setTableReference(BigQueryHelpers.toTableRefProto(tempTableReference))
          .setRequestedStreams(requestedStreamCount)
          .setShardingStrategy(ShardingStrategy.BALANCED)
          .build();

  Schema sessionSchema =
      SchemaBuilder.record("__root__")
          .fields()
          .name("name")
          .type()
          .nullable()
          .stringType()
          .noDefault()
          .name("number")
          .type()
          .nullable()
          .longType()
          .noDefault()
          .endRecord();

  ReadSession.Builder builder =
      ReadSession.newBuilder()
          .setAvroSchema(AvroSchema.newBuilder().setSchema(sessionSchema.toString()));
  for (int i = 0; i < expectedStreamCount; i++) {
    builder.addStreams(Stream.newBuilder().setName("stream-" + i));
  }

  StorageClient fakeStorageClient = mock(StorageClient.class);
  when(fakeStorageClient.createReadSession(expectedRequest)).thenReturn(builder.build());

  BigQueryStorageQuerySource<TableRow> querySource =
      BigQueryStorageQuerySource.create(
          stepUuid,
          ValueProvider.StaticValueProvider.of(encodedQuery),
          /* flattenResults = */ true,
          /* useLegacySql = */ true,
          /* priority = */ QueryPriority.BATCH,
          /* location = */ null,
          /* queryTempDataset = */ null,
          /* kmsKey = */ null,
          new TableRowParser(),
          TableRowJsonCoder.of(),
          new FakeBigQueryServices()
              .withDatasetService(fakeDatasetService)
              .withJobService(fakeJobService)
              .withStorageClient(fakeStorageClient));

  List<? extends BoundedSource<TableRow>> sources = querySource.split(bundleSize, options);
  assertEquals(expectedStreamCount, sources.size());
}
 
Example #21
Source File: BigtableToParquet.java    From DataflowTemplates with Apache License 2.0
@Description("The Bigtable instance id that contains the table to export.")
ValueProvider<String> getBigtableInstanceId();
 
Example #22
Source File: BigtableToAvro.java    From DataflowTemplates with Apache License 2.0
@Description("The Bigtable instance id that contains the table to export.")
ValueProvider<String> getBigtableInstanceId();
 
Example #23
Source File: RedisConnectionConfiguration.java    From beam with Apache License 2.0
public static RedisConnectionConfiguration create(
    ValueProvider<String> host, ValueProvider<Integer> port) {
  return create().withHost(host).withPort(port);
}
 
Example #24
Source File: RedisConnectionConfiguration.java    From beam with Apache License 2.0
/** See {@link RedisConnectionConfiguration#withHost(String)}. */
public RedisConnectionConfiguration withHost(ValueProvider<String> host) {
  return builder().setHost(host).build();
}
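
Combined with the create(...) factory in the previous example, a configuration backed by template options might look like this (the option getters getRedisHost and getRedisPort are illustrative assumptions):

RedisConnectionConfiguration connection =
    RedisConnectionConfiguration.create(
        options.getRedisHost(),   // ValueProvider<String>
        options.getRedisPort());  // ValueProvider<Integer>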
 
Example #25
Source File: RepublisherOptions.java    From gcp-ingestion with Mozilla Public License 2.0
@Description("Duration for which document IDs should be stored for deduplication."
    + " Allowed formats are: Ns (for seconds, example: 5s),"
    + " Nm (for minutes, example: 12m), Nh (for hours, example: 2h)."
    + " Can be omitted if --redisUri is unset.")
@Default.String("24h")
ValueProvider<String> getDeduplicateExpireDuration();
 
Example #26
Source File: FileIO.java    From beam with Apache License 2.0
/** Like {@link #withPrefix(String)} but with a {@link ValueProvider}. */
public Write<DestinationT, UserT> withPrefix(ValueProvider<String> prefix) {
  checkArgument(prefix != null, "prefix can not be null");
  return toBuilder().setFilenamePrefix(prefix).build();
}
 
Example #27
Source File: TokenizePipelineOptions.java    From dlp-dataflow-deidentification with Apache License 2.0
@Description("DataSet Spec")
ValueProvider<String> getDataset();
 
Example #28
Source File: PubsubToBigQueryTest.java    From DataflowTemplates with Apache License 2.0
/** Tests the {@link PubSubToBigQuery} pipeline end-to-end. */
@Test
public void testPubsubToBigQueryE2E() throws Exception {
  // Test input
  final String payload = "{\"ticker\": \"GOOGL\", \"price\": 1006.94}";
  final PubsubMessage message =
      new PubsubMessage(payload.getBytes(), ImmutableMap.of("id", "123", "type", "custom_event"));

  final Instant timestamp =
      new DateTime(2022, 2, 22, 22, 22, 22, 222, DateTimeZone.UTC).toInstant();

  final FailsafeElementCoder<PubsubMessage, String> coder =
      FailsafeElementCoder.of(PubsubMessageWithAttributesCoder.of(), StringUtf8Coder.of());

  CoderRegistry coderRegistry = pipeline.getCoderRegistry();
  coderRegistry.registerCoderForType(coder.getEncodedTypeDescriptor(), coder);

  // Parameters
  ValueProvider<String> transformPath = pipeline.newProvider(TRANSFORM_FILE_PATH);
  ValueProvider<String> transformFunction = pipeline.newProvider("transform");

  PubSubToBigQuery.Options options =
      PipelineOptionsFactory.create().as(PubSubToBigQuery.Options.class);

  options.setJavascriptTextTransformGcsPath(transformPath);
  options.setJavascriptTextTransformFunctionName(transformFunction);

  // Build pipeline
  PCollectionTuple transformOut =
      pipeline
          .apply(
              "CreateInput",
              Create.timestamped(TimestampedValue.of(message, timestamp))
                  .withCoder(PubsubMessageWithAttributesCoder.of()))
          .apply("ConvertMessageToTableRow", new PubsubMessageToTableRow(options));

  // Assert
  PAssert.that(transformOut.get(PubSubToBigQuery.UDF_DEADLETTER_OUT)).empty();
  PAssert.that(transformOut.get(PubSubToBigQuery.TRANSFORM_DEADLETTER_OUT)).empty();
  PAssert.that(transformOut.get(PubSubToBigQuery.TRANSFORM_OUT))
      .satisfies(
          collection -> {
            TableRow result = collection.iterator().next();
            assertThat(result.get("ticker"), is(equalTo("GOOGL")));
            assertThat(result.get("price"), is(equalTo(1006.94)));
            return null;
          });

  // Execute pipeline
  pipeline.run();
}
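
A detail worth noting: pipeline.newProvider(...) is a TestPipeline helper that wraps a value in a ValueProvider that only becomes accessible once the pipeline runs, making it a convenient stand-in for template parameters when testing ValueProvider-driven code paths end to end.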
 
Example #29
Source File: BigtableToAvro.java    From DataflowTemplates with Apache License 2.0
@SuppressWarnings("unused")
void setBigtableProjectId(ValueProvider<String> projectId);
 
Example #30
Source File: CassandraToBigtable.java    From DataflowTemplates with Apache License 2.0
@Description(
    "If true, a large row is split into multiple MutateRows requests. When a row is"
        + " split across requests, updates are not atomic. ")
ValueProvider<Boolean> getSplitLargeRows();