Java Code Examples for org.apache.beam.sdk.io.FileIO#Write

The following examples show how to use org.apache.beam.sdk.io.FileIO#Write. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: Write.java    From gcp-ingestion with Mozilla Public License 2.0 5 votes vote down vote up
/**
 * Windows the incoming messages and writes them through a dynamic {@code FileIO} write.
 *
 * <p>A path template is built from {@code outputPrefix}. Its static prefix is handed to
 * {@code to()}, while the dynamic remainder is resolved per destination inside
 * {@code withNaming()} from the placeholder values extracted in {@code by()}.
 *
 * @param input the messages to write
 * @return a result pairing {@code PDone} with the collection produced by
 *         {@code EmptyErrors.in} for the same pipeline
 */
@Override
public WithFailures.Result<PDone, PubsubMessage> expand(PCollection<PubsubMessage> input) {
  ValueProvider<DynamicPathTemplate> template = NestedValueProvider.of(outputPrefix,
      DynamicPathTemplate::new);
  ValueProvider<String> prefix = NestedValueProvider.of(template, t -> t.staticPrefix);

  // MapCoder isn't deterministic, so the attribute map itself cannot serve as the destination
  // passed to by(). The destination is instead an ordered list of the placeholder values the
  // template needs; withNaming() later receives that list to determine the output location.
  FileIO.Write<List<String>, PubsubMessage> dynamicWrite = FileIO
      .<List<String>, PubsubMessage>writeDynamic()
      .by(msg -> template.get()
          .extractValuesFrom(DerivedAttributesMap.of(msg.getAttributeMap())))
      .withDestinationCoder(ListCoder.of(StringUtf8Coder.of()))
      .withCompression(compression)
      .via(Contextful.fn(format::encodeSingleMessage), TextIO.sink())
      .to(prefix)
      .withNaming(values -> NoColonFileNaming
          .defaultNaming(template.get().replaceDynamicPart(values), format.suffix()));

  // Passing a ValueProvider to withNumShards disables runner-determined sharding, so it is
  // supplied only for streaming input (where runner-determined sharding is not an option).
  FileIO.Write<List<String>, PubsubMessage> write = inputType == InputType.pubsub
      ? dynamicWrite.withNumShards(numShards)
      : dynamicWrite;

  // Lateness is allowed up to the maximum Cloud Pub/Sub retention of 7 days documented in
  // https://cloud.google.com/pubsub/docs/subscriber
  Window<PubsubMessage> windowing = Window
      .<PubsubMessage>into(FixedWindows.of(windowDuration))
      .withAllowedLateness(Duration.standardDays(7))
      .discardingFiredPanes();

  input.apply(windowing).apply(write);

  return WithFailures.Result.of(
      PDone.in(input.getPipeline()),
      EmptyErrors.in(input.getPipeline()));
}
 
Example 2
Source File: Write.java    From gcp-ingestion with Mozilla Public License 2.0 4 votes vote down vote up
/**
 * Encodes message payloads via an {@code AvroEncoder} ParDo and writes the successfully
 * encoded messages through a dynamic {@code FileIO} write.
 *
 * <p>The path template must expose the {@code document_namespace}, {@code document_type} and
 * {@code document_version} placeholders; this is checked when the transform is expanded.
 *
 * @param input the messages to encode and write
 * @return a result pairing {@code PDone} with the ParDo's error-tag output
 * @throws RuntimeException if the path template lacks any of the required placeholders
 */
@Override
public WithFailures.Result<PDone, PubsubMessage> expand(PCollection<PubsubMessage> input) {
  ValueProvider<String> prefix = NestedValueProvider.of(pathTemplate, t -> t.staticPrefix);

  List<String> requiredNames = Arrays.asList("document_namespace", "document_type",
      "document_version");
  List<String> availableNames = pathTemplate.get().getPlaceholderNames();
  boolean hasAllRequired = availableNames.containsAll(requiredNames);
  if (!hasAllRequired) {
    throw new RuntimeException(
        "Path template must contain document namespace, type, and version");
  }

  AvroEncoder avroEncoder = new AvroEncoder();

  // A plain ParDo is used rather than a PTransform extending MapElementsWithErrors.
  // Errors then have to be routed manually through output tags, but in exchange the
  // singleton SchemaStore PCollection can be supplied as a side input.
  ParDo.MultiOutput<PubsubMessage, PubsubMessage> encodeStep = ParDo.of(avroEncoder)
      .withOutputTags(successTag, TupleTagList.of(errorTag));

  FileIO.Write<List<String>, PubsubMessage> dynamicWrite = FileIO
      .<List<String>, PubsubMessage>writeDynamic()
      .by(msg -> pathTemplate.get().extractValuesFrom(msg.getAttributeMap()))
      .withDestinationCoder(ListCoder.of(StringUtf8Coder.of()))
      .withCompression(compression)
      .via(Contextful.fn(avroEncoder::getSink))
      .to(prefix)
      .withNaming(values -> NoColonFileNaming
          .defaultNaming(pathTemplate.get().replaceDynamicPart(values), ".avro"));

  // Passing a ValueProvider to withNumShards disables runner-determined sharding, so it is
  // supplied only for streaming input (where runner-determined sharding is not an option).
  FileIO.Write<List<String>, PubsubMessage> write = inputType == InputType.pubsub
      ? dynamicWrite.withNumShards(numShards)
      : dynamicWrite;

  // Without this, we may run into `Inputs to Flatten had incompatible window windowFns`.
  // Lateness is allowed up to the maximum Cloud Pub/Sub retention of 7 days documented in
  // https://cloud.google.com/pubsub/docs/subscriber
  Window<PubsubMessage> windowing = Window
      .<PubsubMessage>into(FixedWindows.of(windowDuration))
      .withAllowedLateness(Duration.standardDays(7))
      .discardingFiredPanes();

  PCollectionTuple encodeOutputs = input.apply("encodePayloadAsAvro", encodeStep);
  PCollection<PubsubMessage> encoded = encodeOutputs.get(successTag);
  encoded.apply(windowing).apply(write);

  return WithFailures.Result.of(PDone.in(input.getPipeline()), encodeOutputs.get(errorTag));
}