org.apache.beam.sdk.coders.AtomicCoder Java Examples

The following examples show how to use org.apache.beam.sdk.coders.AtomicCoder. You can go to the original project or source file by following the links above each example.
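For orientation before the examples: AtomicCoder is the convenience base class for coders that have no component coders, so getCoderArguments()/getComponents() are already taken care of and a subclass only supplies the byte-level encode and decode. Below is a minimal sketch of a standalone subclass; the EventIdCoder name and the int payload are illustrative, not taken from the examples that follow.

import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import org.apache.beam.sdk.coders.AtomicCoder;
import org.apache.beam.sdk.util.VarInt;

/** Illustrative coder for a plain int id, in the style of Beam's built-in coders. */
public class EventIdCoder extends AtomicCoder<Integer> {
  private static final EventIdCoder INSTANCE = new EventIdCoder();

  public static EventIdCoder of() {
    return INSTANCE;
  }

  private EventIdCoder() {}

  @Override
  public void encode(Integer value, OutputStream outStream) throws IOException {
    // Variable-length encoding keeps small ids compact on the wire.
    VarInt.encode(value.intValue(), outStream);
  }

  @Override
  public Integer decode(InputStream inStream) throws IOException {
    return VarInt.decodeInt(inStream);
  }

  @Override
  public void verifyDeterministic() {
    // No-op: equal ids always encode to equal bytes, so grouping by key is safe.
  }
}

The private constructor plus static of() factory mirrors how Beam exposes its own coders, e.g. StringUtf8Coder.of().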
Example #1
Source File: PCollectionTranslationTest.java    From beam with Apache License 2.0
@Override
public Coder<BoundedWindow> windowCoder() {
  return new AtomicCoder<BoundedWindow>() {
    @Override
    public void verifyDeterministic() {
      // No-op: the encoding is a pure function of the window's maxTimestamp.
    }

    @Override
    public void encode(BoundedWindow value, OutputStream outStream) throws IOException {
      // Only the window's maximum timestamp is written, as a variable-length long.
      VarInt.encode(value.maxTimestamp().getMillis(), outStream);
    }

    @Override
    public BoundedWindow decode(InputStream inStream) throws IOException {
      final Instant ts = new Instant(VarInt.decodeLong(inStream));
      // Reconstruct a window that carries only its maximum timestamp.
      return new BoundedWindow() {
        @Override
        public Instant maxTimestamp() {
          return ts;
        }
      };
    }
  };
}
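Because this coder discards everything except the maximum timestamp, a quick round trip makes explicit what survives encoding. The following test is not part of the original file; it assumes the windowCoder() factory above is in scope and uses Beam's CoderUtils byte-array helpers.

import static org.junit.Assert.assertEquals;

import org.apache.beam.sdk.coders.Coder;
import org.apache.beam.sdk.transforms.windowing.BoundedWindow;
import org.apache.beam.sdk.util.CoderUtils;
import org.joda.time.Instant;
import org.junit.Test;

@Test
public void windowCoderRoundTrips() throws Exception {
  Coder<BoundedWindow> coder = windowCoder();
  BoundedWindow original =
      new BoundedWindow() {
        @Override
        public Instant maxTimestamp() {
          return new Instant(1234L);
        }
      };
  byte[] bytes = CoderUtils.encodeToByteArray(coder, original);
  BoundedWindow decoded = CoderUtils.decodeFromByteArray(coder, bytes);
  // Only the timestamp survives; the decoded window is a fresh anonymous instance.
  assertEquals(original.maxTimestamp(), decoded.maxTimestamp());
}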
 
Example #2
Source File: CoderTypeSerializerTest.java    From beam with Apache License 2.0
@Test
public void shouldWriteAndReadSnapshotForAnonymousClassCoder() throws Exception {
  AtomicCoder<String> anonymousClassCoder =
      new AtomicCoder<String>() {
        // Deliberately trivial: the test exercises serializer snapshot
        // round-tripping for an anonymous coder class, not the encoding itself.

        @Override
        public void encode(String value, OutputStream outStream) {}

        @Override
        public String decode(InputStream inStream) {
          return "";
        }
      };

  testWriteAndReadConfigSnapshot(anonymousClassCoder);
}
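CoderTypeSerializer is the Flink runner's adapter from a Beam coder to Flink's TypeSerializer, and the snapshot above is Flink's mechanism for checking state compatibility across restores. A related property, sketched here in a standalone check of my own (class and method names are illustrative), is that an anonymous coder must also survive plain Java serialization, since coders ship with the pipeline to workers; SerializableUtils is Beam's helper for that round trip.

import java.io.InputStream;
import java.io.OutputStream;
import org.apache.beam.sdk.coders.AtomicCoder;
import org.apache.beam.sdk.util.SerializableUtils;

public class AnonymousCoderCheck {
  public static void main(String[] args) {
    // Defined in a static context so no unserializable enclosing instance is captured.
    AtomicCoder<String> coder =
        new AtomicCoder<String>() {
          @Override
          public void encode(String value, OutputStream outStream) {}

          @Override
          public String decode(InputStream inStream) {
            return "";
          }
        };
    // Throws if the coder cannot round-trip through Java serialization.
    SerializableUtils.ensureSerializable(coder);
  }
}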
 
Example #3
Source File: StreamingWriteTables.java    From beam with Apache License 2.0
private <T> PCollection<T> writeAndGetErrors(
    PCollection<KV<TableDestination, ElementT>> input,
    TupleTag<T> failedInsertsTag,
    AtomicCoder<T> coder,
    ErrorContainer<T> errorContainer) {
  BigQueryOptions options = input.getPipeline().getOptions().as(BigQueryOptions.class);
  int numShards = options.getNumStreamingKeys();

  // A naive implementation would be to simply stream data directly to BigQuery.
  // However, this could occasionally lead to duplicated data, e.g., when
  // a VM that runs this code is restarted and the code is re-run.

  // The above risk is mitigated in this implementation by relying on
  // BigQuery built-in best effort de-dup mechanism.

  // To use this mechanism, each input TableRow is tagged with a generated
  // unique id, which is then passed to BigQuery and used to ignore duplicates.
  // We create 50 keys per BigQuery table to generate output on. This is few enough that we
  // get good batching into BigQuery's insert calls, and enough that we can max out the
  // streaming insert quota.
  PCollection<KV<ShardedKey<String>, TableRowInfo<ElementT>>> tagged =
      input
          .apply("ShardTableWrites", ParDo.of(new GenerateShardedTable<>(numShards)))
          .setCoder(KvCoder.of(ShardedKeyCoder.of(StringUtf8Coder.of()), elementCoder))
          .apply("TagWithUniqueIds", ParDo.of(new TagWithUniqueIds<>()))
          .setCoder(
              KvCoder.of(
                  ShardedKeyCoder.of(StringUtf8Coder.of()), TableRowInfoCoder.of(elementCoder)));

  TupleTag<Void> mainOutputTag = new TupleTag<>("mainOutput");

  // To prevent having the same TableRow processed more than once with regenerated
  // different unique ids, this implementation relies on "checkpointing", which is
  // achieved as a side effect of having StreamingWriteFn immediately follow a GBK,
  // performed by Reshuffle.
  PCollectionTuple tuple =
      tagged
          .apply(Reshuffle.of())
          // Put in the global window to ensure that DynamicDestinations side inputs are accessed
          // correctly.
          .apply(
              "GlobalWindow",
              Window.<KV<ShardedKey<String>, TableRowInfo<ElementT>>>into(new GlobalWindows())
                  .triggering(DefaultTrigger.of())
                  .discardingFiredPanes())
          .apply(
              "StreamingWrite",
              ParDo.of(
                      new StreamingWriteFn<>(
                          bigQueryServices,
                          retryPolicy,
                          failedInsertsTag,
                          errorContainer,
                          skipInvalidRows,
                          ignoreUnknownValues,
                          ignoreInsertIds,
                          toTableRow))
                  .withOutputTags(mainOutputTag, TupleTagList.of(failedInsertsTag)));
  PCollection<T> failedInserts = tuple.get(failedInsertsTag);
  failedInserts.setCoder(coder);
  return failedInserts;
}
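The AtomicCoder<T> parameter lets each caller decide how failed inserts are encoded on the returned PCollection. Below is a hypothetical call site (writeAndGetErrors is private and ErrorContainer lives in the same package, so the names illustrate the pattern rather than public API): Beam's TableRowJsonCoder is itself an AtomicCoder<TableRow>, so it slots straight into the coder parameter.

import com.google.api.services.bigquery.model.TableRow;
import org.apache.beam.sdk.io.gcp.bigquery.TableRowJsonCoder;
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.values.TupleTag;

// Hypothetical call from within the enclosing transform's expand();
// `input` is the PCollection<KV<TableDestination, ElementT>> being written.
TupleTag<TableRow> failedInsertsTag = new TupleTag<>("failedInserts");
PCollection<TableRow> failedRows =
    writeAndGetErrors(
        input,
        failedInsertsTag,
        TableRowJsonCoder.of(), // an AtomicCoder<TableRow> shipped with Beam
        ErrorContainer.TABLE_ROW_ERROR_CONTAINER);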