Java Code Examples for org.apache.beam.sdk.values.PCollection#setCoder()

The following examples show how to use org.apache.beam.sdk.values.PCollection#setCoder(). Each example is taken from an open source project; the source file and license are noted above the code.
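 
As a minimal starting point, here is a short, hypothetical sketch (it is not taken from any of the projects below, and it assumes a runner such as the DirectRunner is on the classpath). It shows the basic pattern: setCoder() explicitly assigns the Coder used to encode a PCollection's elements, either because coder inference cannot determine one or to override the coder that was inferred.

import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.coders.StringUtf8Coder;
import org.apache.beam.sdk.transforms.Create;
import org.apache.beam.sdk.values.PCollection;

public class SetCoderSketch {
  public static void main(String[] args) {
    Pipeline pipeline = Pipeline.create();

    // Build a small PCollection and assign its coder explicitly. setCoder() must be
    // called before the collection is finalized, i.e. before the pipeline is run.
    PCollection<String> words =
        pipeline.apply("CreateWords", Create.of("alpha", "beta", "gamma"));
    words.setCoder(StringUtf8Coder.of());

    pipeline.run().waitUntilFinish();
  }
}
 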
Example 1
Source File: PCollectionCustomCoderTest.java    From beam with Apache License 2.0
public Pipeline pipelineWith(CustomTestCoder coder) throws Exception {
  List<String> pipelineContents =
      Arrays.asList("String", "Testing", "Custom", "Coder", "In", "Beam");

  // Create input.
  PCollection<String> customCoderPC =
      pipeline
          .begin()
          .apply("ReadStrings", Create.of(pipelineContents))
          .setCoder(coder)
          .apply(Reshuffle.viaRandomKey());
  // PAssert.that relies on the last coder added to the PCollection, so we
  // need to create an identity ParDo with a valid coder.
  PCollection<String> fixedCoderPC =
      customCoderPC.apply("Identity", ParDo.of(new IdentityDoFn()));
  fixedCoderPC.setCoder(StringUtf8Coder.of());
  ContentReader r = ContentReader.elementsEqual(pipelineContents);
  PAssert.that(fixedCoderPC).satisfies(r);

  return pipeline;
}
 
Example 2
Source File: DynamoDBIO.java    From beam with Apache License 2.0
@Override
public PCollection<T> expand(PBegin input) {
  checkArgument((getScanRequestFn() != null), "withScanRequestFn() is required");
  checkArgument((getAwsClientsProvider() != null), "withAwsClientsProvider() is required");
  ScanRequest scanRequest = getScanRequestFn().apply(null);
  checkArgument(
      (scanRequest.getTotalSegments() != null && scanRequest.getTotalSegments() > 0),
      "TotalSegments is required with withScanRequestFn() and greater zero");

  PCollection<Read<T>> splits =
      (PCollection<Read<T>>)
          input.apply("Create", Create.of(this)).apply("Split", ParDo.of(new SplitFn()));
  splits.setCoder(SerializableCoder.of(new TypeDescriptor<Read<T>>() {}));

  PCollection<T> output =
      (PCollection<T>)
          splits
              .apply("Reshuffle", Reshuffle.viaRandomKey())
              .apply("Read", ParDo.of(new ReadFn()));
  output.setCoder(getCoder());
  return output;
}
 
Example 3
Source File: BatchViewOverrides.java    From beam with Apache License 2.0
@Override
public PCollection<KV<Integer, Iterable<KV<W, WindowedValue<T>>>>> expand(
    PCollection<T> input) {
  @SuppressWarnings("unchecked")
  Coder<W> windowCoder = (Coder<W>) input.getWindowingStrategy().getWindowFn().windowCoder();
  PCollection<KV<Integer, KV<W, WindowedValue<T>>>> rval =
      input.apply(
          ParDo.of(new UseWindowHashAsKeyAndWindowAsSortKeyDoFn<T, W>(ismCoderForHash)));
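  // Explicitly build the coder for the DoFn output: a window-hash key paired with
  // (window, full windowed value), using the window coder and the input's element coder.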
  rval.setCoder(
      KvCoder.of(
          VarIntCoder.of(),
          KvCoder.of(windowCoder, FullWindowedValueCoder.of(input.getCoder(), windowCoder))));
  return rval.apply(new GroupByKeyAndSortValuesOnly<>());
}
 
Example 4
Source File: LazyAvroCoderTest.java    From components with Apache License 2.0
/**
 * Basic use of the LazyAvroCoder with the default schema supplier.
 */
@Test
public void testBasic() {
    // Create a PCollection of simple records, and assign it to be encoded with a LazyAvroCoder.
    PCollection<IndexedRecord> a = p.apply("a", RowGeneratorIO.read().withSchema(SampleSchemas.recordSimple()));
    a.setCoder(LazyAvroCoder.of());

    // Construct a job that looks like this (a and c are collections of IndexedRecords):
    //
    // a ----> b ----> c ----> d
    // |
    // \-> b2

    // Trigger a transformation that requires the data to be shuffled, and run the pipeline.
    PCollection<KV<IndexedRecord, Long>> b = a.apply("b", Count.<IndexedRecord> perElement());
    PCollection<IndexedRecord> c = b.apply("c", Keys.<IndexedRecord> create());
    c.setCoder(LazyAvroCoder.of());
    PCollection<KV<IndexedRecord, Long>> d = c.apply("d", Count.<IndexedRecord> perElement());

    PCollection<KV<IndexedRecord, Long>> b2 = a.apply("b2", Count.<IndexedRecord> perElement());

    p.run().waitUntilFinish();

    // No exception should have occurred.

    assertThat(LazyAvroCoder.StaticSchemaHolderSupplier.getSchemas(), hasSize(2));
    assertThat(LazyAvroCoder.StaticSchemaHolderSupplier.getSchemas(),
            contains(SampleSchemas.recordSimple(), SampleSchemas.recordSimple()));

    // Check that the reset cleans the supplier.
    LazyAvroCoder.resetSchemaSupplier();
    assertThat(LazyAvroCoder.StaticSchemaHolderSupplier.getSchemas(), emptyIterable());
}
 
Example 5
Source File: SimpleRecordFormatParquetIO.java    From components with Apache License 2.0
@Override
public PDone write(PCollection<IndexedRecord> in) {
    ParquetHdfsFileSink sink = new ParquetHdfsFileSink(doAs, path, overwrite, mergeOutput);
    sink.getExtraHadoopConfiguration().addFrom(getExtraHadoopConfiguration());

    PCollection<KV<Void, IndexedRecord>> pc1 = in.apply(ParDo.of(new FormatParquet()));
    pc1 = pc1.setCoder(KvCoder.of(VoidCoder.of(), LazyAvroCoder.of()));
    if (in.isBounded() == PCollection.IsBounded.BOUNDED) {
        return pc1.apply(Write.to(sink));
    } else {
        return pc1.apply(UnboundedWrite.of(sink));
    }
}
 
Example 6
Source File: LazyAvroCoderTest.java    From components with Apache License 2.0
/**
 * Exactly the same test as {@link #testBasic()} but reusing the LazyAvroCoder.
 */
@Test
public void testBasicReuse() {
    LazyAvroCoder lac = LazyAvroCoder.of();

    // Create a PCollection of simple records, and assign it to be encoded with a LazyAvroCoder.
    PCollection<IndexedRecord> a = p.apply("a", RowGeneratorIO.read().withSchema(SampleSchemas.recordSimple()));
    a.setCoder(lac);

    // Construct a job that looks like this (a and c are collections of IndexedRecords):
    //
    // a ----> b ----> c ----> d
    // |
    // \-> b2

    // Trigger a transformation that requires the data to be shuffled, and run the pipeline.
    PCollection<KV<IndexedRecord, Long>> b = a.apply("b", Count.<IndexedRecord> perElement());
    PCollection<IndexedRecord> c = b.apply("c", Keys.<IndexedRecord> create());
    c.setCoder(lac);
    PCollection<KV<IndexedRecord, Long>> d = c.apply("d", Count.<IndexedRecord> perElement());

    PCollection<KV<IndexedRecord, Long>> b2 = a.apply("b2", Count.<IndexedRecord> perElement());

    p.run().waitUntilFinish();

    // No exception should have occurred.

    // Only one schema was registered.
    assertThat(LazyAvroCoder.StaticSchemaHolderSupplier.getSchemas(), hasSize(1));
    assertThat(LazyAvroCoder.StaticSchemaHolderSupplier.getSchemas(), contains(SampleSchemas.recordSimple()));
}
 
Example 7
Source File: SimpleRecordFormatAvroIO.java    From components with Apache License 2.0
@Override
public PDone write(PCollection<IndexedRecord> in) {
    LazyAvroKeyWrapper lakw = LazyAvroKeyWrapper.of();
    AvroHdfsFileSink sink = new AvroHdfsFileSink(doAs, path, overwrite, mergeOutput);
    sink.getExtraHadoopConfiguration().addFrom(getExtraHadoopConfiguration());

    PCollection<KV<AvroKey<IndexedRecord>, NullWritable>> pc1 = in.apply(ParDo.of(new FormatAvro()));
    pc1 = pc1.setCoder(KvCoder.of(lakw, WritableCoder.of(NullWritable.class)));

    if (in.isBounded() == PCollection.IsBounded.BOUNDED) {
        return pc1.apply(Write.to(sink));
    } else {
        return pc1.apply(UnboundedWrite.of(sink));
    }
}
 
Example 8
Source File: ForwardingPTransform.java    From beam with Apache License 2.0
@Override
public OutputT expand(InputT input) {
  OutputT res = delegate().expand(input);
  if (res instanceof PCollection) {
    PCollection pc = (PCollection) res;
    try {
      pc.setCoder(delegate().getDefaultOutputCoder(input, pc));
    } catch (CannotProvideCoderException e) {
      // Let coder inference happen later.
    }
  }
  return res;
}
 
Example 9
Source File: SnowflakeIO.java    From beam with Apache License 2.0
@Override
public PDone expand(PCollection<T> input) {
  checkArguments();

  String stagingBucketDir = String.format("%s/%s/", getStagingBucketName(), WRITE_TMP_PATH);

  PCollection<String> out = write(input, stagingBucketDir);
  out.setCoder(StringUtf8Coder.of());

  return PDone.in(out.getPipeline());
}
 
Example 10
Source File: SnowflakeIO.java    From beam with Apache License 2.0
@Override
public PCollection<T> expand(PBegin input) {
  checkArguments();

  String tmpDirName = makeTmpDirName();
  String stagingBucketDir = String.format("%s/%s/", getStagingBucketName(), tmpDirName);

  PCollection<Void> emptyCollection = input.apply(Create.of((Void) null));
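  // This single placeholder element seeds the pipeline: it drives the copy-into-stage
  // step below and, via Wait.on(output), the temporary-file cleanup at the end.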

  PCollection<T> output =
      emptyCollection
          .apply(
              ParDo.of(
                  new CopyIntoStageFn(
                      getDataSourceProviderFn(),
                      getQuery(),
                      getTable(),
                      getStorageIntegrationName(),
                      stagingBucketDir,
                      getSnowflakeService())))
          .apply(Reshuffle.viaRandomKey())
          .apply(FileIO.matchAll())
          .apply(FileIO.readMatches())
          .apply(readFiles())
          .apply(ParDo.of(new MapCsvToStringArrayFn()))
          .apply(ParDo.of(new MapStringArrayToUserDataFn<>(getCsvMapper())));

  output.setCoder(getCoder());

  emptyCollection
      .apply(Wait.on(output))
      .apply(ParDo.of(new CleanTmpFilesFromGcsFn(stagingBucketDir)));
  return output;
}
 
Example 11
Source File: FlattenEvaluatorFactoryTest.java    From beam with Apache License 2.0
@Test
public void testFlattenInMemoryEvaluatorWithEmptyPCollectionList() throws Exception {
  PCollectionList<Integer> list = PCollectionList.empty(p);

  PCollection<Integer> flattened = list.apply(Flatten.pCollections());
  flattened.setCoder(VarIntCoder.of());

  EvaluationContext evaluationContext = mock(EvaluationContext.class);
  when(evaluationContext.createBundle(flattened))
      .thenReturn(bundleFactory.createBundle(flattened));

  FlattenEvaluatorFactory factory = new FlattenEvaluatorFactory(evaluationContext);
  AppliedPTransform<?, ?, ?> flattendProducer = DirectGraphs.getProducer(flattened);
  TransformEvaluator<Integer> emptyEvaluator =
      factory.forApplication(
          flattendProducer,
          bundleFactory.createRootBundle().commit(BoundedWindow.TIMESTAMP_MAX_VALUE));

  TransformResult<Integer> leftSideResult = emptyEvaluator.finishBundle();

  CommittedBundle<?> outputBundle =
      Iterables.getOnlyElement(leftSideResult.getOutputBundles()).commit(Instant.now());
  assertThat(outputBundle.getElements(), emptyIterable());
  assertThat(
      leftSideResult.getTransform(),
      Matchers.<AppliedPTransform<?, ?, ?>>equalTo(flattendProducer));
}
 
Example 12
Source File: PubsubIO.java    From beam with Apache License 2.0
@Override
public PCollection<T> expand(PBegin input) {
  if (getTopicProvider() == null && getSubscriptionProvider() == null) {
    throw new IllegalStateException(
        "Need to set either the topic or the subscription for " + "a PubsubIO.Read transform");
  }
  if (getTopicProvider() != null && getSubscriptionProvider() != null) {
    throw new IllegalStateException(
        "Can't set both the topic and the subscription for " + "a PubsubIO.Read transform");
  }

  @Nullable
  ValueProvider<TopicPath> topicPath =
      getTopicProvider() == null
          ? null
          : NestedValueProvider.of(getTopicProvider(), new TopicPathTranslator());
  @Nullable
  ValueProvider<SubscriptionPath> subscriptionPath =
      getSubscriptionProvider() == null
          ? null
          : NestedValueProvider.of(getSubscriptionProvider(), new SubscriptionPathTranslator());
  PubsubUnboundedSource source =
      new PubsubUnboundedSource(
          getClock(),
          getPubsubClientFactory(),
          null /* always get project from runtime PipelineOptions */,
          topicPath,
          subscriptionPath,
          getTimestampAttribute(),
          getIdAttribute(),
          getNeedsAttributes(),
          getNeedsMessageId());
  PCollection<T> read =
      input.apply(source).apply(MapElements.into(new TypeDescriptor<T>() {}).via(getParseFn()));
  return read.setCoder(getCoder());
}
 
Example 13
Source File: AvroUtilsTest.java    From beam with Apache License 2.0
@Test
public void testAvroSchemaCoders() {
  Pipeline pipeline = Pipeline.create();
  org.apache.avro.Schema schema =
      org.apache.avro.Schema.createRecord(
          "TestSubRecord",
          "TestSubRecord doc",
          "org.apache.beam.sdk.schemas.utils",
          false,
          getAvroSubSchemaFields());
  GenericRecord record =
      new GenericRecordBuilder(getAvroSubSchema("simple"))
          .set("bool", true)
          .set("int", 42)
          .build();

  PCollection<GenericRecord> records =
      pipeline.apply(Create.of(record).withCoder(AvroCoder.of(schema)));
  assertFalse(records.hasSchema());
  records.setCoder(AvroUtils.schemaCoder(schema));
  assertTrue(records.hasSchema());
  CoderProperties.coderSerializable(records.getCoder());

  AvroGeneratedUser user = new AvroGeneratedUser("foo", 42, "green");
  PCollection<AvroGeneratedUser> users =
      pipeline.apply(Create.of(user).withCoder(AvroCoder.of(AvroGeneratedUser.class)));
  assertFalse(users.hasSchema());
  users.setCoder(AvroUtils.schemaCoder((AvroCoder<AvroGeneratedUser>) users.getCoder()));
  assertTrue(users.hasSchema());
  CoderProperties.coderSerializable(users.getCoder());
}
 
Example 14
Source File: ParDo.java    From beam with Apache License 2.0
@Override
public PCollection<OutputT> expand(PCollection<? extends InputT> input) {
  SchemaRegistry schemaRegistry = input.getPipeline().getSchemaRegistry();
  CoderRegistry coderRegistry = input.getPipeline().getCoderRegistry();
  finishSpecifyingStateSpecs(fn, coderRegistry, schemaRegistry, input.getCoder());
  TupleTag<OutputT> mainOutput = new TupleTag<>(MAIN_OUTPUT_TAG);
  PCollection<OutputT> res =
      input.apply(withOutputTags(mainOutput, TupleTagList.empty())).get(mainOutput);

  TypeDescriptor<OutputT> outputTypeDescriptor = getFn().getOutputTypeDescriptor();
  try {
    res.setSchema(
        schemaRegistry.getSchema(outputTypeDescriptor),
        outputTypeDescriptor,
        schemaRegistry.getToRowFunction(outputTypeDescriptor),
        schemaRegistry.getFromRowFunction(outputTypeDescriptor));
  } catch (NoSuchSchemaException e) {
    try {
      res.setCoder(
          coderRegistry.getCoder(
              outputTypeDescriptor,
              getFn().getInputTypeDescriptor(),
              ((PCollection<InputT>) input).getCoder()));
    } catch (CannotProvideCoderException e2) {
      // Ignore and leave coder unset.
    }
  }

  return res;
}
 
Example 15
Source File: WithKeys.java    From beam with Apache License 2.0
@Override
public PCollection<KV<K, V>> expand(PCollection<V> in) {
  PCollection<KV<K, V>> result =
      in.apply(
          "AddKeys",
          MapElements.via(
              new SimpleFunction<V, KV<K, V>>() {
                @Override
                public KV<K, V> apply(V element) {
                  return KV.of(fn.apply(element), element);
                }
              }));

  try {
    Coder<K> keyCoder;
    CoderRegistry coderRegistry = in.getPipeline().getCoderRegistry();
    if (keyType == null) {
      keyCoder = coderRegistry.getOutputCoder(fn, in.getCoder());
    } else {
      keyCoder = coderRegistry.getCoder(keyType);
    }
    // TODO: Remove when we can set the coder inference context.
    result.setCoder(KvCoder.of(keyCoder, in.getCoder()));
  } catch (CannotProvideCoderException exc) {
    // let lazy coder inference have a try
  }

  return result;
}
 
Example 16
Source File: Deduplicate.java    From beam with Apache License 2.0
@Override
public PCollection<T> expand(PCollection<T> input) {
  WithKeys<IdT, T> withKeys = WithKeys.of(fn);
  if (type != null) {
    withKeys = withKeys.withKeyType(type);
  }
  PCollection<KV<IdT, T>> inputWithKey = input.apply(withKeys);
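  // If an explicit id coder was supplied, set the KV coder directly; otherwise
  // leave the coder to be inferred.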
  if (coder != null) {
    inputWithKey.setCoder(KvCoder.of(coder, input.getCoder()));
  }
  return inputWithKey
      .apply(new KeyedValues<>(timeDomain, duration))
      .apply(org.apache.beam.sdk.transforms.Values.create());
}
 
Example 17
Source File: StreamingWriteTables.java    From beam with Apache License 2.0
private <T> PCollection<T> writeAndGetErrors(
    PCollection<KV<TableDestination, ElementT>> input,
    TupleTag<T> failedInsertsTag,
    AtomicCoder<T> coder,
    ErrorContainer<T> errorContainer) {
  BigQueryOptions options = input.getPipeline().getOptions().as(BigQueryOptions.class);
  int numShards = options.getNumStreamingKeys();

  // A naive implementation would be to simply stream data directly to BigQuery.
  // However, this could occasionally lead to duplicated data, e.g., when
  // a VM that runs this code is restarted and the code is re-run.

  // The above risk is mitigated in this implementation by relying on
  // BigQuery's built-in best-effort de-duplication mechanism.

  // To use this mechanism, each input TableRow is tagged with a generated
  // unique id, which is then passed to BigQuery and used to ignore duplicates.
  // We create 50 keys per BigQuery table to generate output on. This is few enough that we
  // get good batching into BigQuery's insert calls, and enough that we can max out the
  // streaming insert quota.
  PCollection<KV<ShardedKey<String>, TableRowInfo<ElementT>>> tagged =
      input
          .apply("ShardTableWrites", ParDo.of(new GenerateShardedTable<>(numShards)))
          .setCoder(KvCoder.of(ShardedKeyCoder.of(StringUtf8Coder.of()), elementCoder))
          .apply("TagWithUniqueIds", ParDo.of(new TagWithUniqueIds<>()))
          .setCoder(
              KvCoder.of(
                  ShardedKeyCoder.of(StringUtf8Coder.of()), TableRowInfoCoder.of(elementCoder)));

  TupleTag<Void> mainOutputTag = new TupleTag<>("mainOutput");

  // To prevent having the same TableRow processed more than once with regenerated
  // different unique ids, this implementation relies on "checkpointing", which is
  // achieved as a side effect of having StreamingWriteFn immediately follow a GBK,
  // performed by Reshuffle.
  PCollectionTuple tuple =
      tagged
          .apply(Reshuffle.of())
          // Put in the global window to ensure that DynamicDestinations side inputs are accessed
          // correctly.
          .apply(
              "GlobalWindow",
              Window.<KV<ShardedKey<String>, TableRowInfo<ElementT>>>into(new GlobalWindows())
                  .triggering(DefaultTrigger.of())
                  .discardingFiredPanes())
          .apply(
              "StreamingWrite",
              ParDo.of(
                      new StreamingWriteFn<>(
                          bigQueryServices,
                          retryPolicy,
                          failedInsertsTag,
                          errorContainer,
                          skipInvalidRows,
                          ignoreUnknownValues,
                          ignoreInsertIds,
                          toTableRow))
                  .withOutputTags(mainOutputTag, TupleTagList.of(failedInsertsTag)));
  PCollection<T> failedInserts = tuple.get(failedInsertsTag);
  failedInserts.setCoder(coder);
  return failedInserts;
}
 
Example 18
Source File: ParDo.java    From beam with Apache License 2.0
@Override
public PCollectionTuple expand(PCollection<? extends InputT> input) {
  // SplittableDoFn should be forbidden on the runner-side.
  validateWindowType(input, fn);

  // Use coder registry to determine coders for all StateSpec defined in the fn signature.
  CoderRegistry coderRegistry = input.getPipeline().getCoderRegistry();
  SchemaRegistry schemaRegistry = input.getPipeline().getSchemaRegistry();
  finishSpecifyingStateSpecs(fn, coderRegistry, schemaRegistry, input.getCoder());

  DoFnSignature signature = DoFnSignatures.getSignature(fn.getClass());
  if (signature.usesState() || signature.usesTimers()) {
    validateStateApplicableForInput(fn, input);
  }

  validateSideInputTypes(sideInputs, fn);

  // TODO: We should validate that OutputReceiver<Row> is only used if the output PCollection
  // has a schema. However, coder/schema inference may not have happened yet at this point.
  // Need to figure out where to validate this.

  PCollectionTuple outputs =
      PCollectionTuple.ofPrimitiveOutputsInternal(
          input.getPipeline(),
          TupleTagList.of(mainOutputTag).and(additionalOutputTags.getAll()),
          // TODO
          Collections.emptyMap(),
          input.getWindowingStrategy(),
          input.isBounded().and(signature.isBoundedPerElement()));
  @SuppressWarnings("unchecked")
  Coder<InputT> inputCoder = ((PCollection<InputT>) input).getCoder();
  for (PCollection<?> out : outputs.getAll().values()) {
    try {
      out.setCoder(
          (Coder)
              coderRegistry.getCoder(
                  out.getTypeDescriptor(), getFn().getInputTypeDescriptor(), inputCoder));
    } catch (CannotProvideCoderException e) {
      // Ignore and let coder inference happen later.
    }
  }

  // The fn will likely be an instance of an anonymous subclass
  // such as DoFn<Integer, String> { }, thus will have a high-fidelity
  // TypeDescriptor for the output type.
  outputs.get(mainOutputTag).setTypeDescriptor(getFn().getOutputTypeDescriptor());

  return outputs;
}
 
Example 19
Source File: BatchViewOverrides.java    From beam with Apache License 2.0
private static <K, V, W extends BoundedWindow, ViewT> PCollection<?> applyForMapLike(
    DataflowRunner runner,
    PCollection<KV<K, V>> input,
    PCollectionView<ViewT> view,
    boolean uniqueKeysExpected)
    throws NonDeterministicException {

  @SuppressWarnings("unchecked")
  Coder<W> windowCoder = (Coder<W>) input.getWindowingStrategy().getWindowFn().windowCoder();

  @SuppressWarnings({"rawtypes", "unchecked"})
  KvCoder<K, V> inputCoder = (KvCoder) input.getCoder();

  // If our key coder is deterministic, we can use the key portion of each KV
  // as part of a composite key containing the window, key, and index.
  inputCoder.getKeyCoder().verifyDeterministic();

  IsmRecordCoder<WindowedValue<V>> ismCoder =
      coderForMapLike(windowCoder, inputCoder.getKeyCoder(), inputCoder.getValueCoder());

  // Create the various output tags representing the main output containing the data stream
  // and the additional outputs containing the metadata about the size and entry set.
  TupleTag<IsmRecord<WindowedValue<V>>> mainOutputTag = new TupleTag<>();
  TupleTag<KV<Integer, KV<W, Long>>> outputForSizeTag = new TupleTag<>();
  TupleTag<KV<Integer, KV<W, K>>> outputForEntrySetTag = new TupleTag<>();

  // Process all the elements grouped by key hash, and sorted by key and then window
  // outputting to all the outputs defined above.
  PCollectionTuple outputTuple =
      input
          .apply("GBKaSVForData", new GroupByKeyHashAndSortByKeyAndWindow<K, V, W>(ismCoder))
          .apply(
              ParDo.of(
                      new ToIsmRecordForMapLikeDoFn<>(
                          outputForSizeTag,
                          outputForEntrySetTag,
                          windowCoder,
                          inputCoder.getKeyCoder(),
                          ismCoder,
                          uniqueKeysExpected))
                  .withOutputTags(
                      mainOutputTag,
                      TupleTagList.of(
                          ImmutableList.of(outputForSizeTag, outputForEntrySetTag))));

  // Set the coder on the main data output.
  PCollection<IsmRecord<WindowedValue<V>>> perHashWithReifiedWindows =
      outputTuple.get(mainOutputTag);
  perHashWithReifiedWindows.setCoder(ismCoder);

  // Set the coder on the metadata output for size and process the entries
  // producing a [META, Window, 0L] record per window storing the number of unique keys
  // for each window.
  PCollection<KV<Integer, KV<W, Long>>> outputForSize = outputTuple.get(outputForSizeTag);
  outputForSize.setCoder(
      KvCoder.of(VarIntCoder.of(), KvCoder.of(windowCoder, VarLongCoder.of())));
  PCollection<IsmRecord<WindowedValue<V>>> windowMapSizeMetadata =
      outputForSize
          .apply("GBKaSVForSize", new GroupByKeyAndSortValuesOnly<>())
          .apply(ParDo.of(new ToIsmMetadataRecordForSizeDoFn<K, V, W>(windowCoder)));
  windowMapSizeMetadata.setCoder(ismCoder);

  // Set the coder on the metadata output destined to build the entry set and process the
  // entries producing a [META, Window, Index] record per window key pair storing the key.
  PCollection<KV<Integer, KV<W, K>>> outputForEntrySet = outputTuple.get(outputForEntrySetTag);
  outputForEntrySet.setCoder(
      KvCoder.of(VarIntCoder.of(), KvCoder.of(windowCoder, inputCoder.getKeyCoder())));
  PCollection<IsmRecord<WindowedValue<V>>> windowMapKeysMetadata =
      outputForEntrySet
          .apply("GBKaSVForKeys", new GroupByKeyAndSortValuesOnly<>())
          .apply(
              ParDo.of(
                  new ToIsmMetadataRecordForKeyDoFn<K, V, W>(
                      inputCoder.getKeyCoder(), windowCoder)));
  windowMapKeysMetadata.setCoder(ismCoder);

  // Set that all these outputs should be materialized using an indexed format.
  runner.addPCollectionRequiringIndexedFormat(perHashWithReifiedWindows);
  runner.addPCollectionRequiringIndexedFormat(windowMapSizeMetadata);
  runner.addPCollectionRequiringIndexedFormat(windowMapKeysMetadata);

  PCollectionList<IsmRecord<WindowedValue<V>>> outputs =
      PCollectionList.of(
          ImmutableList.of(
              perHashWithReifiedWindows, windowMapSizeMetadata, windowMapKeysMetadata));

  PCollection<IsmRecord<WindowedValue<V>>> flattenedOutputs =
      Pipeline.applyTransform(outputs, Flatten.pCollections());
  flattenedOutputs.apply(CreateDataflowView.forBatch(view));
  return flattenedOutputs;
}
 
Example 20
Source File: View.java    From beam with Apache License 2.0
@Override
public PCollection<KV<Void, T>> expand(PCollection<T> input) {
  PCollection output = input.apply(ParDo.of(new VoidKeyToMultimapMaterializationDoFn<>()));
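  // The output is a raw PCollection, so explicitly set a coder pairing a Void key
  // with the input's element coder.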
  output.setCoder(KvCoder.of(VoidCoder.of(), input.getCoder()));
  return output;
}