Java Code Examples for org.apache.beam.sdk.values.PCollection#setCoder()

The following examples show how to use org.apache.beam.sdk.values.PCollection#setCoder(). Each example is taken from an open source project; the source file and license are noted above the code.
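 
As a minimal starting point, here is a short, hypothetical sketch (it is not taken from any of the projects below, and it assumes a runner such as the DirectRunner is on the classpath). It shows the basic pattern: setCoder() explicitly assigns the Coder used to encode a PCollection's elements, either because coder inference cannot determine one or to override the coder that was inferred.

import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.coders.StringUtf8Coder;
import org.apache.beam.sdk.transforms.Create;
import org.apache.beam.sdk.values.PCollection;

public class SetCoderSketch {
  public static void main(String[] args) {
    Pipeline pipeline = Pipeline.create();

    // Build a small PCollection and assign its coder explicitly. setCoder() must be
    // called before the collection is finalized, i.e. before the pipeline is run.
    PCollection<String> words =
        pipeline.apply("CreateWords", Create.of("alpha", "beta", "gamma"));
    words.setCoder(StringUtf8Coder.of());

    pipeline.run().waitUntilFinish();
  }
}
 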
Example 1
Source File: PCollectionCustomCoderTest.java    From beam with Apache License 2.0
public Pipeline pipelineWith(CustomTestCoder coder) throws Exception {
  List<String> pipelineContents =
      Arrays.asList("String", "Testing", "Custom", "Coder", "In", "Beam");

  // Create input.
  PCollection<String> customCoderPC =
      pipeline
          .begin()
          .apply("ReadStrings", Create.of(pipelineContents))
          .setCoder(coder)
          .apply(Reshuffle.viaRandomKey());
  // PAssert.that relies on the last coder added to the PCollection, so we
  // need to create an identity ParDo with a valid coder.
  PCollection<String> fixedCoderPC =
      customCoderPC.apply("Identity", ParDo.of(new IdentityDoFn()));
  fixedCoderPC.setCoder(StringUtf8Coder.of());
  ContentReader r = ContentReader.elementsEqual(pipelineContents);
  PAssert.that(fixedCoderPC).satisfies(r);

  return pipeline;
}
 
Example 2
Source File: DynamoDBIO.java    From beam with Apache License 2.0
@Override
public PCollection<T> expand(PBegin input) {
  checkArgument((getScanRequestFn() != null), "withScanRequestFn() is required");
  checkArgument((getAwsClientsProvider() != null), "withAwsClientsProvider() is required");
  ScanRequest scanRequest = getScanRequestFn().apply(null);
  checkArgument(
      (scanRequest.getTotalSegments() != null && scanRequest.getTotalSegments() > 0),
      "TotalSegments is required with withScanRequestFn() and greater zero");

  PCollection<Read<T>> splits =
      (PCollection<Read<T>>)
          input.apply("Create", Create.of(this)).apply("Split", ParDo.of(new SplitFn()));
  splits.setCoder(SerializableCoder.of(new TypeDescriptor<Read<T>>() {}));

  PCollection<T> output =
      (PCollection<T>)
          splits
              .apply("Reshuffle", Reshuffle.viaRandomKey())
              .apply("Read", ParDo.of(new ReadFn()));
  output.setCoder(getCoder());
  return output;
}
 
Example 3
Source File: BatchViewOverrides.java    From beam with Apache License 2.0
@Override
public PCollection<KV<Integer, Iterable<KV<W, WindowedValue<T>>>>> expand(
    PCollection<T> input) {
  @SuppressWarnings("unchecked")
  Coder<W> windowCoder = (Coder<W>) input.getWindowingStrategy().getWindowFn().windowCoder();
  PCollection<KV<Integer, KV<W, WindowedValue<T>>>> rval =
      input.apply(
          ParDo.of(new UseWindowHashAsKeyAndWindowAsSortKeyDoFn<T, W>(ismCoderForHash)));
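  // Explicitly build the coder for the DoFn output: a window-hash key paired with
  // (window, full windowed value), using the window coder and the input's element coder.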
  rval.setCoder(
      KvCoder.of(
          VarIntCoder.of(),
          KvCoder.of(windowCoder, FullWindowedValueCoder.of(input.getCoder(), windowCoder))));
  return rval.apply(new GroupByKeyAndSortValuesOnly<>());
}
 
Example 4
Source File: LazyAvroCoderTest.java    From components with Apache License 2.0
/**
 * Basic use of the LazyAvroCoder with the default schema supplier.
 */
@Test
public void testBasic() {
    // Create a PCollection of simple records, and assign it to be encoded with a LazyAvroCoder.
    PCollection<IndexedRecord> a = p.apply("a", RowGeneratorIO.read().withSchema(SampleSchemas.recordSimple()));
    a.setCoder(LazyAvroCoder.of());

    // Construct a job that looks like this (a and c are collections of IndexedRecords):
    //
    // a ----> b ----> c ----> d
    // |
    // \-> b2

    // Trigger a transformation that requires the data to be shuffled, and run the pipeline.
    PCollection<KV<IndexedRecord, Long>> b = a.apply("b", Count.<IndexedRecord> perElement());
    PCollection<IndexedRecord> c = b.apply("c", Keys.<IndexedRecord> create());
    c.setCoder(LazyAvroCoder.of());
    PCollection<KV<IndexedRecord, Long>> d = c.apply("d", Count.<IndexedRecord> perElement());

    PCollection<KV<IndexedRecord, Long>> b2 = a.apply("b2", Count.<IndexedRecord> perElement());

    p.run().waitUntilFinish();

    // No exception should have occurred.

    assertThat(LazyAvroCoder.StaticSchemaHolderSupplier.getSchemas(), hasSize(2));
    assertThat(LazyAvroCoder.StaticSchemaHolderSupplier.getSchemas(),
            contains(SampleSchemas.recordSimple(), SampleSchemas.recordSimple()));

    // Check that the reset cleans the supplier.
    LazyAvroCoder.resetSchemaSupplier();
    assertThat(LazyAvroCoder.StaticSchemaHolderSupplier.getSchemas(), emptyIterable());
}
 
Example 5
Source File: SimpleRecordFormatParquetIO.java    From components with Apache License 2.0
@Override
public PDone write(PCollection<IndexedRecord> in) {
    ParquetHdfsFileSink sink = new ParquetHdfsFileSink(doAs, path, overwrite, mergeOutput);
    sink.getExtraHadoopConfiguration().addFrom(getExtraHadoopConfiguration());

    PCollection<KV<Void, IndexedRecord>> pc1 = in.apply(ParDo.of(new FormatParquet()));
    pc1 = pc1.setCoder(KvCoder.of(VoidCoder.of(), LazyAvroCoder.of()));
    if (in.isBounded() == PCollection.IsBounded.BOUNDED) {
        return pc1.apply(Write.to(sink));
    } else {
        return pc1.apply(UnboundedWrite.of(sink));
    }
}
 
Example 6
Source File: LazyAvroCoderTest.java    From components with Apache License 2.0
/**
 * Exactly the same test as {@link #testBasic()} but reusing the LazyAvroCoder.
 */
@Test
public void testBasicReuse() {
    LazyAvroCoder lac = LazyAvroCoder.of();

    // Create a PCollection of simple records, and assign it to be encoded with a LazyAvroCoder.
    PCollection<IndexedRecord> a = p.apply("a", RowGeneratorIO.read().withSchema(SampleSchemas.recordSimple()));
    a.setCoder(lac);

    // Construct a job that looks like this (a and c are collections of IndexedRecords):
    //
    // a ----> b ----> c ----> d
    // |
    // \-> b2

    // Trigger a transformation that requires the data to be shuffled, and run the pipeline.
    PCollection<KV<IndexedRecord, Long>> b = a.apply("b", Count.<IndexedRecord> perElement());
    PCollection<IndexedRecord> c = b.apply("c", Keys.<IndexedRecord> create());
    c.setCoder(lac);
    PCollection<KV<IndexedRecord, Long>> d = c.apply("d", Count.<IndexedRecord> perElement());

    PCollection<KV<IndexedRecord, Long>> b2 = a.apply("b2", Count.<IndexedRecord> perElement());

    p.run().waitUntilFinish();

    // No exception should have occurred.

    // Only one schema was registered.
    assertThat(LazyAvroCoder.StaticSchemaHolderSupplier.getSchemas(), hasSize(1));
    assertThat(LazyAvroCoder.StaticSchemaHolderSupplier.getSchemas(), contains(SampleSchemas.recordSimple()));
}
 
Example 7
Source File: SimpleRecordFormatAvroIO.java    From components with Apache License 2.0
@Override
public PDone write(PCollection<IndexedRecord> in) {
    LazyAvroKeyWrapper lakw = LazyAvroKeyWrapper.of();
    AvroHdfsFileSink sink = new AvroHdfsFileSink(doAs, path, overwrite, mergeOutput);
    sink.getExtraHadoopConfiguration().addFrom(getExtraHadoopConfiguration());

    PCollection<KV<AvroKey<IndexedRecord>, NullWritable>> pc1 = in.apply(ParDo.of(new FormatAvro()));
    pc1 = pc1.setCoder(KvCoder.of(lakw, WritableCoder.of(NullWritable.class)));

    if (in.isBounded() == PCollection.IsBounded.BOUNDED) {
        return pc1.apply(Write.to(sink));
    } else {
        return pc1.apply(UnboundedWrite.of(sink));
    }
}
 
Example 8
Source File: ForwardingPTransform.java    From beam with Apache License 2.0
@Override
public OutputT expand(InputT input) {
  OutputT res = delegate().expand(input);
  if (res instanceof PCollection) {
    PCollection pc = (PCollection) res;
    try {
      pc.setCoder(delegate().getDefaultOutputCoder(input, pc));
    } catch (CannotProvideCoderException e) {
      // Let coder inference happen later.
    }
  }
  return res;
}
 
Example 9
Source File: SnowflakeIO.java    From beam with Apache License 2.0
@Override
public PDone expand(PCollection<T> input) {
  checkArguments();

  String stagingBucketDir = String.format("%s/%s/", getStagingBucketName(), WRITE_TMP_PATH);

  PCollection<String> out = write(input, stagingBucketDir);
  out.setCoder(StringUtf8Coder.of());

  return PDone.in(out.getPipeline());
}
 
Example 10
Source File: SnowflakeIO.java    From beam with Apache License 2.0
@Override
public PCollection<T> expand(PBegin input) {
  checkArguments();

  String tmpDirName = makeTmpDirName();
  String stagingBucketDir = String.format("%s/%s/", getStagingBucketName(), tmpDirName);

  PCollection<Void> emptyCollection = input.apply(Create.of((Void) null));
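  // This single placeholder element seeds the pipeline: it drives the copy-into-stage
  // step below and, via Wait.on(output), the temporary-file cleanup at the end.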

  PCollection<T> output =
      emptyCollection
          .apply(
              ParDo.of(
                  new CopyIntoStageFn(
                      getDataSourceProviderFn(),
                      getQuery(),
                      getTable(),
                      getStorageIntegrationName(),
                      stagingBucketDir,
                      getSnowflakeService())))
          .apply(Reshuffle.viaRandomKey())
          .apply(FileIO.matchAll())
          .apply(FileIO.readMatches())
          .apply(readFiles())
          .apply(ParDo.of(new MapCsvToStringArrayFn()))
          .apply(ParDo.of(new MapStringArrayToUserDataFn<>(getCsvMapper())));

  output.setCoder(getCoder());

  emptyCollection
      .apply(Wait.on(output))
      .apply(ParDo.of(new CleanTmpFilesFromGcsFn(stagingBucketDir)));
  return output;
}
 
Example 11
Source File: FlattenEvaluatorFactoryTest.java    From beam with Apache License 2.0
@Test
public void testFlattenInMemoryEvaluatorWithEmptyPCollectionList() throws Exception {
  PCollectionList<Integer> list = PCollectionList.empty(p);

  PCollection<Integer> flattened = list.apply(Flatten.pCollections());
  flattened.setCoder(VarIntCoder.of());

  EvaluationContext evaluationContext = mock(EvaluationContext.class);
  when(evaluationContext.createBundle(flattened))
      .thenReturn(bundleFactory.createBundle(flattened));

  FlattenEvaluatorFactory factory = new FlattenEvaluatorFactory(evaluationContext);
  AppliedPTransform<?, ?, ?> flattendProducer = DirectGraphs.getProducer(flattened);
  TransformEvaluator<Integer> emptyEvaluator =
      factory.forApplication(
          flattendProducer,
          bundleFactory.createRootBundle().commit(BoundedWindow.TIMESTAMP_MAX_VALUE));

  TransformResult<Integer> leftSideResult = emptyEvaluator.finishBundle();

  CommittedBundle<?> outputBundle =
      Iterables.getOnlyElement(leftSideResult.getOutputBundles()).commit(Instant.now());
  assertThat(outputBundle.getElements(), emptyIterable());
  assertThat(
      leftSideResult.getTransform(),
      Matchers.<AppliedPTransform<?, ?, ?>>equalTo(flattendProducer));
}
 
Example 12
Source File: PubsubIO.java    From beam with Apache License 2.0
@Override
public PCollection<T> expand(PBegin input) {
  if (getTopicProvider() == null && getSubscriptionProvider() == null) {
    throw new IllegalStateException(
        "Need to set either the topic or the subscription for " + "a PubsubIO.Read transform");
  }
  if (getTopicProvider() != null && getSubscriptionProvider() != null) {
    throw new IllegalStateException(
        "Can't set both the topic and the subscription for " + "a PubsubIO.Read transform");
  }

  @Nullable
  ValueProvider<TopicPath> topicPath =
      getTopicProvider() == null
          ? null
          : NestedValueProvider.of(getTopicProvider(), new TopicPathTranslator());
  @Nullable
  ValueProvider<SubscriptionPath> subscriptionPath =
      getSubscriptionProvider() == null
          ? null
          : NestedValueProvider.of(getSubscriptionProvider(), new SubscriptionPathTranslator());
  PubsubUnboundedSource source =
      new PubsubUnboundedSource(
          getClock(),
          getPubsubClientFactory(),
          null /* always get project from runtime PipelineOptions */,
          topicPath,
          subscriptionPath,
          getTimestampAttribute(),
          getIdAttribute(),
          getNeedsAttributes(),
          getNeedsMessageId());
  PCollection<T> read =
      input.apply(source).apply(MapElements.into(new TypeDescriptor<T>() {}).via(getParseFn()));
  return read.setCoder(getCoder());
}
 
Example 13
Source File: AvroUtilsTest.java    From beam with Apache License 2.0
@Test
public void testAvroSchemaCoders() {
  Pipeline pipeline = Pipeline.create();
  org.apache.avro.Schema schema =
      org.apache.avro.Schema.createRecord(
          "TestSubRecord",
          "TestSubRecord doc",
          "org.apache.beam.sdk.schemas.utils",
          false,
          getAvroSubSchemaFields());
  GenericRecord record =
      new GenericRecordBuilder(getAvroSubSchema("simple"))
          .set("bool", true)
          .set("int", 42)
          .build();

  PCollection<GenericRecord> records =
      pipeline.apply(Create.of(record).withCoder(AvroCoder.of(schema)));
  assertFalse(records.hasSchema());
  records.setCoder(AvroUtils.schemaCoder(schema));
  assertTrue(records.hasSchema());
  CoderProperties.coderSerializable(records.getCoder());

  AvroGeneratedUser user = new AvroGeneratedUser("foo", 42, "green");
  PCollection<AvroGeneratedUser> users =
      pipeline.apply(Create.of(user).withCoder(AvroCoder.of(AvroGeneratedUser.class)));
  assertFalse(users.hasSchema());
  users.setCoder(AvroUtils.schemaCoder((AvroCoder<AvroGeneratedUser>) users.getCoder()));
  assertTrue(users.hasSchema());
  CoderProperties.coderSerializable(users.getCoder());
}
 
Example 14
Source File: ParDo.java    From beam with Apache License 2.0
@Override
public PCollection<OutputT> expand(PCollection<? extends InputT> input) {
  SchemaRegistry schemaRegistry = input.getPipeline().getSchemaRegistry();
  CoderRegistry coderRegistry = input.getPipeline().getCoderRegistry();
  finishSpecifyingStateSpecs(fn, coderRegistry, schemaRegistry, input.getCoder());
  TupleTag<OutputT> mainOutput = new TupleTag<>(MAIN_OUTPUT_TAG);
  PCollection<OutputT> res =
      input.apply(withOutputTags(mainOutput, TupleTagList.empty())).get(mainOutput);

  TypeDescriptor<OutputT> outputTypeDescriptor = getFn().getOutputTypeDescriptor();
  try {
    res.setSchema(
        schemaRegistry.getSchema(outputTypeDescriptor),
        outputTypeDescriptor,
        schemaRegistry.getToRowFunction(outputTypeDescriptor),
        schemaRegistry.getFromRowFunction(outputTypeDescriptor));
  } catch (NoSuchSchemaException e) {
    try {
      res.setCoder(
          coderRegistry.getCoder(
              outputTypeDescriptor,
              getFn().getInputTypeDescriptor(),
              ((PCollection<InputT>) input).getCoder()));
    } catch (CannotProvideCoderException e2) {
      // Ignore and leave coder unset.
    }
  }

  return res;
}
 
Example 15
Source File: WithKeys.java    From beam with Apache License 2.0
@Override
public PCollection<KV<K, V>> expand(PCollection<V> in) {
  PCollection<KV<K, V>> result =
      in.apply(
          "AddKeys",
          MapElements.via(
              new SimpleFunction<V, KV<K, V>>() {
                @Override
                public KV<K, V> apply(V element) {
                  return KV.of(fn.apply(element), element);
                }
              }));

  try {
    Coder<K> keyCoder;
    CoderRegistry coderRegistry = in.getPipeline().getCoderRegistry();
    if (keyType == null) {
      keyCoder = coderRegistry.getOutputCoder(fn, in.getCoder());
    } else {
      keyCoder = coderRegistry.getCoder(keyType);
    }
    // TODO: Remove when we can set the coder inference context.
    result.setCoder(KvCoder.of(keyCoder, in.getCoder()));
  } catch (CannotProvideCoderException exc) {
    // let lazy coder inference have a try
  }

  return result;
}
 
Example 16
Source File: Deduplicate.java    From beam with Apache License 2.0
@Override
public PCollection<T> expand(PCollection<T> input) {
  WithKeys<IdT, T> withKeys = WithKeys.of(fn);
  if (type != null) {
    withKeys = withKeys.withKeyType(type);
  }
  PCollection<KV<IdT, T>> inputWithKey = input.apply(withKeys);
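  // If an explicit id coder was supplied, set the KV coder directly; otherwise
  // leave the coder to be inferred.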
  if (coder != null) {
    inputWithKey.setCoder(KvCoder.of(coder, input.getCoder()));
  }
  return inputWithKey
      .apply(new KeyedValues<>(timeDomain, duration))
      .apply(org.apache.beam.sdk.transforms.Values.create());
}
 
Example 17
Source File: StreamingWriteTables.java    From beam with Apache License 2.0
private <T> PCollection<T> writeAndGetErrors(
    PCollection<KV<TableDestination, ElementT>> input,
    TupleTag<T> failedInsertsTag,
    AtomicCoder<T> coder,
    ErrorContainer<T> errorContainer) {
  BigQueryOptions options = input.getPipeline().getOptions().as(BigQueryOptions.class);
  int numShards = options.getNumStreamingKeys();

  // A naive implementation would be to simply stream data directly to BigQuery.
  // However, this could occasionally lead to duplicated data, e.g., when
  // a VM that runs this code is restarted and the code is re-run.

  // The above risk is mitigated in this implementation by relying on
  // BigQuery's built-in best-effort de-duplication mechanism.

  // To use this mechanism, each input TableRow is tagged with a generated
  // unique id, which is then passed to BigQuery and used to ignore duplicates.
  // We create 50 keys per BigQuery table to generate output on. This is few enough that we
  // get good batching into BigQuery's insert calls, and enough that we can max out the
  // streaming insert quota.
  PCollection<KV<ShardedKey<String>, TableRowInfo<ElementT>>> tagged =
      input
          .apply("ShardTableWrites", ParDo.of(new GenerateShardedTable<>(numShards)))
          .setCoder(KvCoder.of(ShardedKeyCoder.of(StringUtf8Coder.of()), elementCoder))
          .apply("TagWithUniqueIds", ParDo.of(new TagWithUniqueIds<>()))
          .setCoder(
              KvCoder.of(
                  ShardedKeyCoder.of(StringUtf8Coder.of()), TableRowInfoCoder.of(elementCoder)));

  TupleTag<Void> mainOutputTag = new TupleTag<>("mainOutput");

  // To prevent having the same TableRow processed more than once with regenerated
  // different unique ids, this implementation relies on "checkpointing", which is
  // achieved as a side effect of having StreamingWriteFn immediately follow a GBK,
  // performed by Reshuffle.
  PCollectionTuple tuple =
      tagged
          .apply(Reshuffle.of())
          // Put in the global window to ensure that DynamicDestinations side inputs are accessed
          // correctly.
          .apply(
              "GlobalWindow",
              Window.<KV<ShardedKey<String>, TableRowInfo<ElementT>>>into(new GlobalWindows())
                  .triggering(DefaultTrigger.of())
                  .discardingFiredPanes())
          .apply(
              "StreamingWrite",
              ParDo.of(
                      new StreamingWriteFn<>(
                          bigQueryServices,
                          retryPolicy,
                          failedInsertsTag,
                          errorContainer,
                          skipInvalidRows,
                          ignoreUnknownValues,
                          ignoreInsertIds,
                          toTableRow))
                  .withOutputTags(mainOutputTag, TupleTagList.of(failedInsertsTag)));
  PCollection<T> failedInserts = tuple.get(failedInsertsTag);
  failedInserts.setCoder(coder);
  return failedInserts;
}
 
Example 18
Source File: ParDo.java    From beam with Apache License 2.0
@Override
public PCollectionTuple expand(PCollection<? extends InputT> input) {
  // SplittableDoFn should be forbidden on the runner-side.
  validateWindowType(input, fn);

  // Use coder registry to determine coders for all StateSpec defined in the fn signature.
  CoderRegistry coderRegistry = input.getPipeline().getCoderRegistry();
  SchemaRegistry schemaRegistry = input.getPipeline().getSchemaRegistry();
  finishSpecifyingStateSpecs(fn, coderRegistry, schemaRegistry, input.getCoder());

  DoFnSignature signature = DoFnSignatures.getSignature(fn.getClass());
  if (signature.usesState() || signature.usesTimers()) {
    validateStateApplicableForInput(fn, input);
  }

  validateSideInputTypes(sideInputs, fn);

  // TODO: We should validate that OutputReceiver<Row> is only used if the output PCollection
  // has a schema. However, coder/schema inference may not have happened yet at this point.
  // Need to figure out where to validate this.

  PCollectionTuple outputs =
      PCollectionTuple.ofPrimitiveOutputsInternal(
          input.getPipeline(),
          TupleTagList.of(mainOutputTag).and(additionalOutputTags.getAll()),
          // TODO
          Collections.emptyMap(),
          input.getWindowingStrategy(),
          input.isBounded().and(signature.isBoundedPerElement()));
  @SuppressWarnings("unchecked")
  Coder<InputT> inputCoder = ((PCollection<InputT>) input).getCoder();
  for (PCollection<?> out : outputs.getAll().values()) {
    try {
      out.setCoder(
          (Coder)
              coderRegistry.getCoder(
                  out.getTypeDescriptor(), getFn().getInputTypeDescriptor(), inputCoder));
    } catch (CannotProvideCoderException e) {
      // Ignore and let coder inference happen later.
    }
  }

  // The fn will likely be an instance of an anonymous subclass
  // such as DoFn<Integer, String> { }, thus will have a high-fidelity
  // TypeDescriptor for the output type.
  outputs.get(mainOutputTag).setTypeDescriptor(getFn().getOutputTypeDescriptor());

  return outputs;
}
 
Example 19
Source File: BatchViewOverrides.java    From beam with Apache License 2.0
private static <K, V, W extends BoundedWindow, ViewT> PCollection<?> applyForMapLike(
    DataflowRunner runner,
    PCollection<KV<K, V>> input,
    PCollectionView<ViewT> view,
    boolean uniqueKeysExpected)
    throws NonDeterministicException {

  @SuppressWarnings("unchecked")
  Coder<W> windowCoder = (Coder<W>) input.getWindowingStrategy().getWindowFn().windowCoder();

  @SuppressWarnings({"rawtypes", "unchecked"})
  KvCoder<K, V> inputCoder = (KvCoder) input.getCoder();

  // If our key coder is deterministic, we can use the key portion of each KV
  // as part of a composite key containing the window, key, and index.
  inputCoder.getKeyCoder().verifyDeterministic();

  IsmRecordCoder<WindowedValue<V>> ismCoder =
      coderForMapLike(windowCoder, inputCoder.getKeyCoder(), inputCoder.getValueCoder());

  // Create the various output tags representing the main output containing the data stream
  // and the additional outputs containing the metadata about the size and entry set.
  TupleTag<IsmRecord<WindowedValue<V>>> mainOutputTag = new TupleTag<>();
  TupleTag<KV<Integer, KV<W, Long>>> outputForSizeTag = new TupleTag<>();
  TupleTag<KV<Integer, KV<W, K>>> outputForEntrySetTag = new TupleTag<>();

  // Process all the elements grouped by key hash, and sorted by key and then window
  // outputting to all the outputs defined above.
  PCollectionTuple outputTuple =
      input
          .apply("GBKaSVForData", new GroupByKeyHashAndSortByKeyAndWindow<K, V, W>(ismCoder))
          .apply(
              ParDo.of(
                      new ToIsmRecordForMapLikeDoFn<>(
                          outputForSizeTag,
                          outputForEntrySetTag,
                          windowCoder,
                          inputCoder.getKeyCoder(),
                          ismCoder,
                          uniqueKeysExpected))
                  .withOutputTags(
                      mainOutputTag,
                      TupleTagList.of(
                          ImmutableList.of(outputForSizeTag, outputForEntrySetTag))));

  // Set the coder on the main data output.
  PCollection<IsmRecord<WindowedValue<V>>> perHashWithReifiedWindows =
      outputTuple.get(mainOutputTag);
  perHashWithReifiedWindows.setCoder(ismCoder);

  // Set the coder on the metadata output for size and process the entries
  // producing a [META, Window, 0L] record per window storing the number of unique keys
  // for each window.
  PCollection<KV<Integer, KV<W, Long>>> outputForSize = outputTuple.get(outputForSizeTag);
  outputForSize.setCoder(
      KvCoder.of(VarIntCoder.of(), KvCoder.of(windowCoder, VarLongCoder.of())));
  PCollection<IsmRecord<WindowedValue<V>>> windowMapSizeMetadata =
      outputForSize
          .apply("GBKaSVForSize", new GroupByKeyAndSortValuesOnly<>())
          .apply(ParDo.of(new ToIsmMetadataRecordForSizeDoFn<K, V, W>(windowCoder)));
  windowMapSizeMetadata.setCoder(ismCoder);

  // Set the coder on the metadata output destined to build the entry set and process the
  // entries producing a [META, Window, Index] record per window key pair storing the key.
  PCollection<KV<Integer, KV<W, K>>> outputForEntrySet = outputTuple.get(outputForEntrySetTag);
  outputForEntrySet.setCoder(
      KvCoder.of(VarIntCoder.of(), KvCoder.of(windowCoder, inputCoder.getKeyCoder())));
  PCollection<IsmRecord<WindowedValue<V>>> windowMapKeysMetadata =
      outputForEntrySet
          .apply("GBKaSVForKeys", new GroupByKeyAndSortValuesOnly<>())
          .apply(
              ParDo.of(
                  new ToIsmMetadataRecordForKeyDoFn<K, V, W>(
                      inputCoder.getKeyCoder(), windowCoder)));
  windowMapKeysMetadata.setCoder(ismCoder);

  // Set that all these outputs should be materialized using an indexed format.
  runner.addPCollectionRequiringIndexedFormat(perHashWithReifiedWindows);
  runner.addPCollectionRequiringIndexedFormat(windowMapSizeMetadata);
  runner.addPCollectionRequiringIndexedFormat(windowMapKeysMetadata);

  PCollectionList<IsmRecord<WindowedValue<V>>> outputs =
      PCollectionList.of(
          ImmutableList.of(
              perHashWithReifiedWindows, windowMapSizeMetadata, windowMapKeysMetadata));

  PCollection<IsmRecord<WindowedValue<V>>> flattenedOutputs =
      Pipeline.applyTransform(outputs, Flatten.pCollections());
  flattenedOutputs.apply(CreateDataflowView.forBatch(view));
  return flattenedOutputs;
}
 
Example 20
Source File: View.java    From beam with Apache License 2.0
@Override
public PCollection<KV<Void, T>> expand(PCollection<T> input) {
  PCollection output = input.apply(ParDo.of(new VoidKeyToMultimapMaterializationDoFn<>()));
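  // The output is a raw PCollection, so explicitly set a coder pairing a Void key
  // with the input's element coder.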
  output.setCoder(KvCoder.of(VoidCoder.of(), input.getCoder()));
  return output;
}