Java Code Examples for org.apache.beam.sdk.io.FileBasedSink

The following examples show how to use org.apache.beam.sdk.io.FileBasedSink. These examples are extracted from open source projects. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source Project: incubator-nemo   Source File: WriteOneFilePerWindow.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Builds the output resource for one shard of one window.
 *
 * <p>The file name is formatted as {@code <window prefix>-<shard>-of-<numShards><suffix>} and is
 * resolved as a file against {@code baseFilename.getCurrentDirectory()}.
 *
 * @param shardNumber index of the shard being named
 * @param numShards total number of shards
 * @param window window of the shard; it is cast to {@link IntervalWindow} without a type check
 * @param paneInfo pane information (not used here)
 * @param outputFileHints provides the suggested filename suffix
 * @return the resource the shard should be written to
 */
@Override
public ResourceId windowedFilename(
  final int shardNumber,
  final int numShards,
  final BoundedWindow window,
  final PaneInfo paneInfo,
  final FileBasedSink.OutputFileHints outputFileHints) {
  System.out.println("Windowd file name: " + window);
  final String windowPrefix = filenamePrefixForWindow((IntervalWindow) window);
  final String suffix = outputFileHints.getSuggestedFilenameSuffix();
  final String shardFileName =
    String.format("%s-%s-of-%s%s", windowPrefix, shardNumber, numShards, suffix);
  final ResourceId outputDirectory = baseFilename.getCurrentDirectory();
  return outputDirectory.resolve(
    shardFileName, ResolveOptions.StandardResolveOptions.RESOLVE_FILE);
}
 
Example 2
Source Project: dbeam   Source File: JdbcAvroIO.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Creates a {@link WriteFiles} transform backed by a {@code JdbcAvroSink}.
 *
 * <p>Trailing slashes are stripped from {@code filenamePrefix} and {@code "/part"} is appended, so
 * output files are named {@code <prefix>/part...}.
 *
 * @param filenamePrefix output location prefix; trailing slashes are ignored
 * @param filenameSuffix suffix passed to the default filename policy
 * @param schema Avro schema handed to the constant destinations
 * @param jdbcAvroArgs arguments forwarded to the sink
 * @return the transform writing through the sink
 */
public static PTransform<PCollection<String>, WriteFilesResult<Void>> createWrite(
    String filenamePrefix, String filenameSuffix, Schema schema, JdbcAvroArgs jdbcAvroArgs) {
  final String partPrefix = filenamePrefix.replaceAll("/+$", "") + "/part";
  final ValueProvider<ResourceId> prefixProvider =
      StaticValueProvider.of(FileBasedSink.convertToFileResourceIfPossible(partPrefix));
  final FileBasedSink.FilenamePolicy shardedPolicy =
      DefaultFilenamePolicy.fromStandardParameters(
          prefixProvider, DEFAULT_SHARD_TEMPLATE, filenameSuffix, false);

  final DynamicAvroDestinations<String, Void, String> constantDestinations =
      AvroIO.constantDestinations(
          shardedPolicy,
          schema,
          ImmutableMap.of(),
          // since Beam does not support zstandard
          CodecFactory.nullCodec(),
          SerializableFunctions.identity());
  final FileBasedSink<String, Void, String> jdbcAvroSink =
      new JdbcAvroSink<>(prefixProvider, constantDestinations, jdbcAvroArgs);
  return WriteFiles.to(jdbcAvroSink);
}
 
Example 3
Source Project: beam   Source File: WriteFilesTranslationTest.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Checks that the payload built from {@code writeFiles} carries the same sharding mode, windowed
 * flag and sink as the transform it was built from.
 */
@Test
public void testEncodedProto() throws Exception {
  SdkComponents sdkComponents = SdkComponents.create();
  sdkComponents.registerEnvironment(Environments.createDockerEnvironment("java"));
  RunnerApi.WriteFilesPayload encoded =
      WriteFilesTranslation.payloadForWriteFiles(writeFiles, sdkComponents);

  // Sharding is runner-determined exactly when neither a shard provider nor a shard
  // computation is configured on the transform.
  assertThat(
      encoded.getRunnerDeterminedSharding(),
      equalTo(
          writeFiles.getNumShardsProvider() == null && writeFiles.getComputeNumShards() == null));

  assertThat(encoded.getWindowedWrites(), equalTo(writeFiles.getWindowedWrites()));

  // The sink must survive the round trip through its proto representation.
  FileBasedSink<String, Void, String> decodedSink =
      (FileBasedSink<String, Void, String>)
          WriteFilesTranslation.sinkFromProto(encoded.getSink());
  assertThat(decodedSink, equalTo(writeFiles.getSink()));
}
 
Example 4
Source Project: nomulus   Source File: InvoicingPipeline.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Returns an IO transform that writes detail reports to registrar-tld keyed CSV files.
 *
 * <p>Reports go under the invoices directory of {@code billingBucketUrl}; events without a
 * destination use the params built for {@code <billingBucketUrl>/errors}; temporary files are
 * placed under {@code <beamBucketUrl>/temporary}.
 *
 * @param yearMonthProvider runtime provider passed to the destination function
 */
private TextIO.TypedWrite<BillingEvent, Params> writeDetailReports(
    ValueProvider<String> yearMonthProvider) {
  String invoicesDirectory =
      String.format("%s/%s", billingBucketUrl, BillingModule.INVOICES_DIRECTORY);
  String errorsDirectory = billingBucketUrl + "/errors";
  String temporaryDirectory = beamBucketUrl + "/temporary";

  return TextIO.<BillingEvent>writeCustomType()
      .to(
          InvoicingUtils.makeDestinationFunction(invoicesDirectory, yearMonthProvider),
          InvoicingUtils.makeEmptyDestinationParams(errorsDirectory))
      .withFormatFunction(BillingEvent::toCsv)
      .withoutSharding()
      .withTempDirectory(FileBasedSink.convertToFileResourceIfPossible(temporaryDirectory))
      .withHeader(BillingEvent.getHeader())
      .withSuffix(".csv");
}
 
Example 5
Source Project: nomulus   Source File: InvoicingUtils.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Returns a function mapping from {@code BillingEvent} to filename {@code Params}.
 *
 * <p>Beam uses this to determine which file a given {@code BillingEvent} should get placed into.
 * The resulting params use an empty shard template, a {@code .csv} suffix, and a base filename
 * of {@code <outputBucket>/<yearMonth>/<billingEvent.toFilename(yearMonth)>}.
 *
 * @param outputBucket the GCS bucket we're outputting reports to
 * @param yearMonthProvider a runtime provider for the yyyy-MM we're generating the invoice for
 */
static SerializableFunction<BillingEvent, Params> makeDestinationFunction(
    String outputBucket, ValueProvider<String> yearMonthProvider) {
  return billingEvent -> {
    return new Params()
        .withShardTemplate("")
        .withSuffix(".csv")
        .withBaseFilename(
            NestedValueProvider.of(
                yearMonthProvider,
                yearMonth -> {
                  // The path is only built once the year-month value is read from the provider.
                  String reportPath =
                      String.format(
                          "%s/%s/%s",
                          outputBucket, yearMonth, billingEvent.toFilename(yearMonth));
                  return FileBasedSink.convertToFileResourceIfPossible(reportPath);
                }));
  };
}
 
Example 6
Source Project: nomulus   Source File: InvoicingUtilsTest.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Checks that the destination function yields params with an empty shard template, a
 * {@code .csv} suffix and a base filename of {@code <bucket>/<yearMonth>/<event filename>}.
 */
@Test
public void testDestinationFunction_generatesProperFileParams() {
  SerializableFunction<BillingEvent, Params> destinationFunction =
      InvoicingUtils.makeDestinationFunction("my/directory", StaticValueProvider.of("2017-10"));

  // We mock BillingEvent to make the test independent of the implementation of toFilename()
  BillingEvent mockedEvent = mock(BillingEvent.class);
  when(mockedEvent.toFilename(any())).thenReturn("invoice_details_2017-10_registrar_tld");

  Params actual = destinationFunction.apply(mockedEvent);
  Params expected =
      new Params()
          .withShardTemplate("")
          .withSuffix(".csv")
          .withBaseFilename(
              FileBasedSink.convertToFileResourceIfPossible(
                  "my/directory/2017-10/invoice_details_2017-10_registrar_tld"));
  assertThat(actual).isEqualTo(expected);
}
 
Example 7
/**
 * Extracts the value of every {@code KV} element and writes the values with {@link TextIO} using
 * windowed writes.
 *
 * <p>Output names come from a {@code PerWindowFiles} policy built on {@code filenamePrefix};
 * temporary files go to that prefix's current directory. A shard count is applied only when
 * {@code numShards} is non-null.
 *
 * @param input keyed records; only the values are written, the key is used for logging
 * @return the result of applying the text write
 */
@Override
public PDone expand(PCollection<KV<String, String>> input) {

  PCollection<String> contents =
      input.apply(
          ParDo.of(
              new DoFn<KV<String, String>, String>() {
                @ProcessElement
                public void processElement(ProcessContext c) {
                  // Build the keyed prefix in a local. Reassigning the enclosing transform's
                  // filenamePrefix field here made the string grow with every element processed
                  // and mutated shared state from inside a DoFn; the write below reads the field
                  // on its own, so it never depended on that reassignment.
                  String keyedPrefix =
                      String.format("%s%s", filenamePrefix, c.element().getKey());
                  LOG.info("File Prefix {}", keyedPrefix);

                  c.output(c.element().getValue());
                }
              }));

  ResourceId resource = FileBasedSink.convertToFileResourceIfPossible(filenamePrefix);
  TextIO.Write write =
      TextIO.write()
          .to(new PerWindowFiles(resource))
          .withTempDirectory(resource.getCurrentDirectory())
          .withWindowedWrites();

  // Only set an explicit shard count when one was configured.
  if (numShards != null) {
    write = write.withNumShards(numShards);
  }

  return contents.apply(write);
}
 
Example 8
/**
 * Writes the input strings with {@link TextIO} using windowed writes.
 *
 * <p>Output names come from a {@code PerWindowFiles} policy built on {@code filenamePrefix};
 * temporary files go to that prefix's current directory. A shard count is applied only when
 * {@code numShards} is non-null.
 */
@Override
public PDone expand(PCollection<String> input) {
  final ResourceId prefixResource = FileBasedSink.convertToFileResourceIfPossible(filenamePrefix);
  final TextIO.Write windowedWrite =
      TextIO.write()
          .to(new PerWindowFiles(prefixResource))
          .withTempDirectory(prefixResource.getCurrentDirectory())
          .withWindowedWrites();

  // Only set an explicit shard count when one was configured.
  final TextIO.Write configuredWrite =
      (numShards == null) ? windowedWrite : windowedWrite.withNumShards(numShards);

  return input.apply(configuredWrite);
}
 
Example 9
Source Project: incubator-nemo   Source File: WriteOneFilePerWindow.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Applies a windowed {@link TextIO} write to the input.
 *
 * <p>File names are produced by a {@code PerWindowFiles} policy created from
 * {@code filenamePrefix}, and the prefix's current directory is used for temporary files.
 * {@code numShards} is applied only if it is not null.
 */
@Override
public PDone expand(final PCollection<String> input) {
  final ResourceId outputPrefix = FileBasedSink.convertToFileResourceIfPossible(filenamePrefix);
  final TextIO.Write perWindowWrite =
    TextIO.write()
      .to(new PerWindowFiles(outputPrefix))
      .withTempDirectory(outputPrefix.getCurrentDirectory())
      .withWindowedWrites();
  if (numShards == null) {
    // No shard count configured: apply the write as built.
    return input.apply(perWindowWrite);
  }
  return input.apply(perWindowWrite.withNumShards(numShards));
}
 
Example 10
Source Project: deployment-examples   Source File: WriteToText.java    License: MIT License 5 votes vote down vote up
/**
 * Writes the input strings with {@link TextIO} using windowed writes and three shards.
 *
 * <p>Output names come from a {@code PerWindowFiles} policy built on {@code filenamePrefix};
 * temporary files go to that prefix's current directory.
 *
 * @param input strings to write; its window coder must equal the {@link IntervalWindow} coder
 * @return the result of applying the text write
 * @throws IllegalArgumentException if the input's window coder is not the interval window coder
 *     (assuming {@code checkArgument} is the usual Preconditions-style check)
 */
@Override
public PDone expand(PCollection<String> input) {
  // Verify that the input has a compatible window type. Coders are compared with equals()
  // rather than ==, so an equal coder that is not the very same instance is accepted too.
  checkArgument(
      IntervalWindow.getCoder()
          .equals(input.getWindowingStrategy().getWindowFn().windowCoder()));

  ResourceId resource = FileBasedSink.convertToFileResourceIfPossible(filenamePrefix);

  return input.apply(
      TextIO.write()
          .to(new PerWindowFiles(resource))
          .withTempDirectory(resource.getCurrentDirectory())
          .withWindowedWrites()
          .withNumShards(3));
}
 
Example 11
Source Project: deployment-examples   Source File: WriteOneFilePerWindow.java    License: MIT License 5 votes vote down vote up
/**
 * Writes the input with a windowed {@link TextIO} write.
 *
 * <p>A {@code PerWindowFiles} policy built from {@code filenamePrefix} names the files and the
 * prefix's current directory holds temporary files. When {@code numShards} is non-null it is set
 * on the write.
 */
@Override
public PDone expand(PCollection<String> input) {
  final ResourceId prefixResource = FileBasedSink.convertToFileResourceIfPossible(filenamePrefix);
  final TextIO.Write baseWrite =
      TextIO.write()
          .to(new PerWindowFiles(prefixResource))
          .withTempDirectory(prefixResource.getCurrentDirectory())
          .withWindowedWrites();
  final TextIO.Write finalWrite =
      (numShards != null) ? baseWrite.withNumShards(numShards) : baseWrite;
  return input.apply(finalWrite);
}
 
Example 12
Source Project: DataflowTemplates   Source File: WriteToGCSText.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Converts the keyed records to their string values and writes them as text files.
 *
 * @param kafkaRecords key/value records; only the value of each record is written
 * @return the result of applying the text write
 */
@Override
public PDone expand(PCollection<KV<String, String>> kafkaRecords) {
  // Step 1: reduce each KV<String, String> record to its value.
  final PCollection<String> recordValues =
      kafkaRecords.apply(
          "Converting to String",
          ParDo.of(
              new DoFn<KV<String, String>, String>() {
                @ProcessElement
                public void processElement(ProcessContext context) {
                  context.output(context.element().getValue());
                }
              }));

  // Step 2: configure the TextIO write.
  //   - WindowedFilenamePolicy decides the output file path.
  //   - withTempDirectory uses the current directory of tempLocation() for temporary files.
  //   - withNumShards applies the shard count returned by numShards().
  final TextIO.Write textWrite =
      TextIO.write()
          .to(
              new WindowedFilenamePolicy(
                  outputDirectory(),
                  outputFilenamePrefix(),
                  WriteToGCSUtility.SHARD_TEMPLATE,
                  WriteToGCSUtility.FILE_SUFFIX_MAP.get(WriteToGCSUtility.FileFormat.TEXT)))
          .withTempDirectory(
              FileBasedSink.convertToFileResourceIfPossible(tempLocation())
                  .getCurrentDirectory())
          .withWindowedWrites()
          .withNumShards(numShards());

  return recordValues.apply("Writing as Text", textWrite);
}
 
Example 13
Source Project: DataflowTemplates   Source File: WriteToGCSAvro.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Converts the keyed records to {@code GenericRecord}s and writes them as Avro files.
 *
 * @param kafkaRecords key/value records to convert and write
 * @return the result of applying the Avro write
 */
@Override
public PDone expand(PCollection<KV<String, String>> kafkaRecords) {
  // Step 1: turn each KV<String, String> into a GenericRecord via KeyValueToGenericRecordFn and
  // set the matching Avro coder.
  final PCollection<GenericRecord> genericRecords =
      kafkaRecords
          .apply("Create GenericRecord(s)", ParDo.of(new KeyValueToGenericRecordFn()))
          .setCoder(AvroCoder.of(GenericRecord.class, KeyValueToGenericRecordFn.SCHEMA));

  // Step 2: configure the AvroIO write.
  //   - WindowedFilenamePolicy decides the output file path.
  //   - withTempDirectory uses the current directory of tempLocation() for temporary files.
  //   - withNumShards applies the shard count returned by numShards().
  final AvroIO.Write<GenericRecord> avroWrite =
      AvroIO.writeGenericRecords(KeyValueToGenericRecordFn.SCHEMA)
          .to(
              new WindowedFilenamePolicy(
                  outputDirectory(),
                  outputFilenamePrefix(),
                  WriteToGCSUtility.SHARD_TEMPLATE,
                  WriteToGCSUtility.FILE_SUFFIX_MAP.get(WriteToGCSUtility.FileFormat.AVRO)))
          .withTempDirectory(
              FileBasedSink.convertToFileResourceIfPossible(tempLocation())
                  .getCurrentDirectory())
          .withWindowedWrites()
          .withNumShards(numShards());

  return genericRecords.apply("Writing as Avro", avroWrite);
}
 
Example 14
Source Project: DataflowTemplates   Source File: PubsubToText.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Builds the pipeline from the supplied options and runs it.
 *
 * <p>The pipeline reads string messages from the Pub/Sub input topic, places them in fixed
 * windows of the configured duration, and writes each window to text files.
 *
 * @param options The execution parameters to the pipeline.
 * @return The result of the pipeline execution.
 */
public static PipelineResult run(Options options) {
  Pipeline pipeline = Pipeline.create(options);

  // Source: string messages from the configured Pub/Sub topic.
  PubsubIO.Read<String> readMessages =
      PubsubIO.readStrings().fromTopic(options.getInputTopic());

  pipeline
      .apply("Read PubSub Events", readMessages)
      // Fixed windows whose length is parsed from the window duration option.
      .apply(
          options.getWindowDuration() + " Window",
          Window.into(FixedWindows.of(DurationUtils.parseDuration(options.getWindowDuration()))))
      // Windowed file writes. The temp directory goes through a NestedValueProvider because
      // its ResourceId is derived from an option value that is read at runtime.
      .apply(
          "Write File(s)",
          TextIO.write()
              .withWindowedWrites()
              .withNumShards(options.getNumShards())
              .to(
                  new WindowedFilenamePolicy(
                      options.getOutputDirectory(),
                      options.getOutputFilenamePrefix(),
                      options.getOutputShardTemplate(),
                      options.getOutputFilenameSuffix()))
              .withTempDirectory(
                  NestedValueProvider.of(
                      maybeUseUserTempLocation(
                          options.getUserTempLocation(), options.getOutputDirectory()),
                      (SerializableFunction<String, ResourceId>)
                          FileBasedSink::convertToFileResourceIfPossible)));

  return pipeline.run();
}
 
Example 15
Source Project: DataflowTemplates   Source File: ExportTransform.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Returns the filename policy for one destination.
 *
 * <p>The policy's base file is {@code <baseDir>/<uniqueId>/<destination>.avro}, where the unique
 * id is read from the {@code uniqueIdView} side input. Shard template and suffix are passed as
 * {@code null} and windowed writes are disabled.
 *
 * @param destination destination name used as the file's base name
 */
@Override
public FileBasedSink.FilenamePolicy getFilenamePolicy(final String destination) {
  final String exportId = sideInput(uniqueIdView);
  // Resolution happens inside the function, i.e. when the nested provider's value is read.
  final SerializableFunction<ResourceId, ResourceId> toAvroFile =
      directory ->
          directory.resolve(
              GcsUtil.joinPath(exportId, destination + ".avro"),
              ResolveOptions.StandardResolveOptions.RESOLVE_FILE);
  final ValueProvider<ResourceId> avroFile =
      ValueProvider.NestedValueProvider.of(baseDir, toAvroFile);
  return DefaultFilenamePolicy.fromStandardParameters(avroFile, null, null, false);
}
 
Example 16
Source Project: dbeam   Source File: JdbcAvroIO.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Creates the write operation for {@code sink} and stores the dynamic destinations and JDBC
 * arguments on the instance.
 *
 * @param sink sink passed to the superclass constructor
 * @param dynamicDestinations destinations kept in {@code this.dynamicDestinations}
 * @param jdbcAvroArgs arguments kept in {@code this.jdbcAvroArgs}
 */
private JdbcAvroWriteOperation(
    FileBasedSink<?, Void, String> sink,
    DynamicAvroDestinations<?, Void, String> dynamicDestinations,
    JdbcAvroArgs jdbcAvroArgs) {
  super(sink);
  this.jdbcAvroArgs = jdbcAvroArgs;
  this.dynamicDestinations = dynamicDestinations;
}
 
Example 17
Source Project: dbeam   Source File: JdbcAvroIO.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Creates a writer for {@code writeOperation} with the binary MIME type, stores the dynamic
 * destinations and JDBC arguments, and creates a fresh {@code JdbcAvroMetering} instance.
 *
 * @param writeOperation operation passed to the superclass constructor
 * @param dynamicDestinations destinations kept in {@code this.dynamicDestinations}
 * @param jdbcAvroArgs arguments kept in {@code this.jdbcAvroArgs}
 */
JdbcAvroWriter(
    FileBasedSink.WriteOperation<Void, String> writeOperation,
    DynamicAvroDestinations<?, Void, String> dynamicDestinations,
    JdbcAvroArgs jdbcAvroArgs) {
  super(writeOperation, MimeTypes.BINARY);
  this.jdbcAvroArgs = jdbcAvroArgs;
  this.dynamicDestinations = dynamicDestinations;
  this.metering = JdbcAvroMetering.create();
}
 
Example 18
Source Project: beam   Source File: WriteToText.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Writes the input strings with {@link TextIO} using windowed writes and three shards.
 *
 * <p>Output names come from a {@code PerWindowFiles} policy built on {@code filenamePrefix};
 * temporary files go to that prefix's current directory.
 *
 * @param input strings to write; its window coder must equal the {@link IntervalWindow} coder
 * @return the result of applying the text write
 * @throws IllegalArgumentException if the input's window coder is not the interval window coder
 *     (assuming {@code checkArgument} is the usual Preconditions-style check)
 */
@Override
public PDone expand(PCollection<String> input) {
  // Verify that the input has a compatible window type. Coders are compared with equals()
  // rather than ==, so an equal coder that is not the very same instance is accepted too.
  checkArgument(
      IntervalWindow.getCoder()
          .equals(input.getWindowingStrategy().getWindowFn().windowCoder()));

  ResourceId resource = FileBasedSink.convertToFileResourceIfPossible(filenamePrefix);

  return input.apply(
      TextIO.write()
          .to(new PerWindowFiles(resource))
          .withTempDirectory(resource.getCurrentDirectory())
          .withWindowedWrites()
          .withNumShards(3));
}
 
Example 19
Source Project: beam   Source File: WriteOneFilePerWindow.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Applies a windowed {@link TextIO} write to the input.
 *
 * <p>File names are produced by a {@code PerWindowFiles} policy created from
 * {@code filenamePrefix}, and the prefix's current directory is used for temporary files.
 * {@code numShards} is applied only if it is not null.
 */
@Override
public PDone expand(PCollection<String> input) {
  final ResourceId outputPrefix = FileBasedSink.convertToFileResourceIfPossible(filenamePrefix);
  final TextIO.Write perWindowWrite =
      TextIO.write()
          .to(new PerWindowFiles(outputPrefix))
          .withTempDirectory(outputPrefix.getCurrentDirectory())
          .withWindowedWrites();
  if (numShards == null) {
    // No shard count configured: apply the write as built.
    return input.apply(perWindowWrite);
  }
  return input.apply(perWindowWrite.withNumShards(numShards));
}
 
Example 20
Source Project: beam   Source File: WriteFilesTranslation.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Deserializes a {@link FileBasedSink} from the payload of a function spec.
 *
 * @param sinkProto spec whose URN must equal {@code CUSTOM_JAVA_FILE_BASED_SINK_URN}
 * @return the sink deserialized from the spec's payload bytes
 * @throws IOException declared by the signature; no visible statement here throws it directly
 */
@VisibleForTesting
static FileBasedSink<?, ?, ?> sinkFromProto(FunctionSpec sinkProto) throws IOException {
  final String sinkTypeName = FileBasedSink.class.getSimpleName();

  // Reject specs that were not produced for a custom Java file-based sink.
  checkArgument(
      sinkProto.getUrn().equals(CUSTOM_JAVA_FILE_BASED_SINK_URN),
      "Cannot extract %s instance from %s with URN %s",
      sinkTypeName,
      FunctionSpec.class.getSimpleName(),
      sinkProto.getUrn());

  final byte[] payloadBytes = sinkProto.getPayload().toByteArray();
  final Object decoded = SerializableUtils.deserializeFromByteArray(payloadBytes, sinkTypeName);
  return (FileBasedSink<?, ?, ?>) decoded;
}
 
Example 21
Source Project: beam   Source File: WriteFilesTranslation.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Extracts the {@link FileBasedSink} of an applied write-files transform.
 *
 * <p>The sink spec is taken from the transform's write-files payload and decoded with
 * {@code sinkFromProto}; the result is cast to the caller's type parameters without a check.
 *
 * @param transform the applied transform to read the sink from
 * @return the decoded sink
 * @throws IOException propagated from reading the payload or decoding the sink
 */
public static <UserT, DestinationT, OutputT> FileBasedSink<UserT, DestinationT, OutputT> getSink(
    AppliedPTransform<
            PCollection<UserT>,
            WriteFilesResult<DestinationT>,
            ? extends PTransform<PCollection<UserT>, WriteFilesResult<DestinationT>>>
        transform)
    throws IOException {
  final FunctionSpec sinkSpec = getWriteFilesPayload(transform).getSink();
  final FileBasedSink<?, ?, ?> decodedSink = sinkFromProto(sinkSpec);
  return (FileBasedSink<UserT, DestinationT, OutputT>) decodedSink;
}
 
Example 22
Source Project: beam   Source File: PTransformMatchersTest.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Checks that the runner-determined-sharding matcher accepts a write without sharding
 * configuration and rejects writes with a fixed shard count or a custom sharding view.
 */
@Test
public void writeWithRunnerDeterminedSharding() {
  ResourceId outputDir = LocalResources.fromString("/foo/bar", true /* isDirectory */);
  FilenamePolicy unwindowedPolicy =
      DefaultFilenamePolicy.fromStandardParameters(
          StaticValueProvider.of(outputDir),
          DefaultFilenamePolicy.DEFAULT_UNWINDOWED_SHARD_TEMPLATE,
          "",
          false);
  WriteFiles<Integer, Void, Integer> unshardedWrite =
      WriteFiles.to(
          new FileBasedSink<Integer, Void, Integer>(
              StaticValueProvider.of(outputDir),
              DynamicFileDestinations.constant(unwindowedPolicy)) {
            @Override
            public WriteOperation<Void, Integer> createWriteOperation() {
              return null;
            }
          });

  // No sharding configured: the matcher should accept the write.
  boolean matchesUnsharded =
      PTransformMatchers.writeWithRunnerDeterminedSharding().matches(appliedWrite(unshardedWrite));
  assertThat(matchesUnsharded, is(true));

  // Fixed shard count: the matcher should reject the write.
  WriteFiles<Integer, Void, Integer> fixedShardWrite = unshardedWrite.withNumShards(3);
  boolean matchesFixedShards =
      PTransformMatchers.writeWithRunnerDeterminedSharding()
          .matches(appliedWrite(fixedShardWrite));
  assertThat(matchesFixedShards, is(false));

  // Custom sharding view: the matcher should reject the write.
  WriteFiles<Integer, Void, Integer> customShardWrite =
      unshardedWrite.withSharding(Sum.integersGlobally().asSingletonView());
  boolean matchesCustomSharding =
      PTransformMatchers.writeWithRunnerDeterminedSharding()
          .matches(appliedWrite(customShardWrite));
  assertThat(matchesCustomSharding, is(false));
}
 
Example 23
Source Project: beam   Source File: PTransformMatchersTest.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Always fails: this implementation rejects every call.
 *
 * @throws UnsupportedOperationException on every invocation
 */
@Override
public ResourceId windowedFilename(
    final int shardNumber,
    final int numShards,
    final BoundedWindow window,
    final PaneInfo paneInfo,
    final FileBasedSink.OutputFileHints outputFileHints) {
  throw new UnsupportedOperationException("should not be called");
}
 
Example 24
Source Project: beam   Source File: DataflowRunner.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Replaces a {@link WriteFiles} application with an equivalent one that has a fixed shard count.
 *
 * <p>The replacement reuses the original sink and dynamic-destination side inputs, keeps
 * windowed writes when the original used them, and sets the shard count to twice the maximum
 * number of workers, else twice the number of workers, else {@code DEFAULT_NUM_SHARDS}.
 *
 * @param transform the applied {@link WriteFiles} transform to replace
 * @return the replacement, applied to the original transform's single main input
 * @throws RuntimeException wrapping any exception raised while rebuilding the transform
 */
@Override
public PTransformReplacement<PCollection<UserT>, WriteFilesResult<DestinationT>>
    getReplacementTransform(
        AppliedPTransform<
                PCollection<UserT>,
                WriteFilesResult<DestinationT>,
                WriteFiles<UserT, DestinationT, OutputT>>
            transform) {
  // By default, if numShards is not set WriteFiles will produce one file per bundle. In
  // streaming, there are large numbers of small bundles, resulting in many tiny files.
  // Instead we pick max workers * 2 to ensure full parallelism, but prevent too-many files.
  // (current_num_workers * 2 might be a better choice, but that value is not easily available
  // today).
  // If the user does not set either numWorkers or maxNumWorkers, default to 10 shards.
  int numShards;
  if (options.getMaxNumWorkers() > 0) {
    numShards = options.getMaxNumWorkers() * 2;
  } else if (options.getNumWorkers() > 0) {
    numShards = options.getNumWorkers() * 2;
  } else {
    numShards = DEFAULT_NUM_SHARDS;
  }

  try {
    List<PCollectionView<?>> sideInputs =
        WriteFilesTranslation.getDynamicDestinationSideInputs(transform);
    // Parameterized instead of raw, so WriteFiles.to(sink) is a checked call and the
    // replacement's type arguments are verified by the compiler.
    FileBasedSink<UserT, DestinationT, OutputT> sink = WriteFilesTranslation.getSink(transform);
    WriteFiles<UserT, DestinationT, OutputT> replacement =
        WriteFiles.to(sink).withSideInputs(sideInputs);
    if (WriteFilesTranslation.isWindowedWrites(transform)) {
      replacement = replacement.withWindowedWrites();
    }
    return PTransformReplacement.of(
        PTransformReplacements.getSingletonMainInput(transform),
        replacement.withNumShards(numShards));
  } catch (Exception e) {
    throw new RuntimeException(e);
  }
}
 
Example 25
Source Project: beam   Source File: WriteWithShardingFactoryTest.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Checks that the factory returns a transform that is not equal to the original when the
 * original write has no sharding specified.
 */
@Test
public void withNoShardingSpecifiedReturnsNewTransform() {
  ResourceId outputDir = LocalResources.fromString("/foo", true /* isDirectory */);

  // The sink is never expected to create a write operation in this test.
  PTransform<PCollection<Object>, WriteFilesResult<Void>> original =
      WriteFiles.to(
          new FileBasedSink<Object, Void, Object>(
              StaticValueProvider.of(outputDir),
              DynamicFileDestinations.constant(new FakeFilenamePolicy())) {
            @Override
            public WriteOperation<Void, Object> createWriteOperation() {
              throw new IllegalArgumentException("Should not be used");
            }
          });
  @SuppressWarnings("unchecked")
  PCollection<Object> emptyInput = (PCollection) p.apply(Create.empty(VoidCoder.of()));

  AppliedPTransform<
          PCollection<Object>,
          WriteFilesResult<Void>,
          PTransform<PCollection<Object>, WriteFilesResult<Void>>>
      appliedOriginal =
          AppliedPTransform.of("write", emptyInput.expand(), Collections.emptyMap(), original, p);

  Object replacement = factory.getReplacementTransform(appliedOriginal).getTransform();
  assertThat(replacement, not(equalTo((Object) original)));
}
 
Example 26
Source Project: beam   Source File: WriteWithShardingFactoryTest.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Always fails: this implementation rejects every call.
 *
 * @throws IllegalArgumentException on every invocation
 */
@Override
public ResourceId windowedFilename(
    final int shardNumber,
    final int numShards,
    final BoundedWindow window,
    final PaneInfo paneInfo,
    final FileBasedSink.OutputFileHints outputFileHints) {
  throw new IllegalArgumentException("Should not be used");
}
 
Example 27
Source Project: nomulus   Source File: InvoicingUtils.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Returns the default filename parameters for an unmappable {@code BillingEvent}.
 *
 * <p>The base filename is {@code <outputBucket>/FAILURES}. The "failed" file should only be
 * populated when an error occurs, which warrants further investigation.
 *
 * @param outputBucket location under which the failures file is placed
 */
static Params makeEmptyDestinationParams(String outputBucket) {
  Params emptyParams = new Params();
  String failuresPath = String.format("%s/%s", outputBucket, "FAILURES");
  return emptyParams.withBaseFilename(
      FileBasedSink.convertToFileResourceIfPossible(failuresPath));
}
 
Example 28
Source Project: nomulus   Source File: InvoicingUtilsTest.java    License: Apache License 2.0 5 votes vote down vote up
/** Checks that the empty-destination params point at {@code <bucket>/FAILURES}. */
@Test
public void testEmptyDestinationParams() {
  Params actual = InvoicingUtils.makeEmptyDestinationParams("my/directory");
  Params expected =
      new Params()
          .withBaseFilename(
              FileBasedSink.convertToFileResourceIfPossible("my/directory/FAILURES"));
  assertThat(actual).isEqualTo(expected);
}
 
Example 29
Source Project: incubator-nemo   Source File: WriteOneFilePerWindow.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Always fails: this implementation rejects every call.
 *
 * @throws UnsupportedOperationException on every invocation
 */
@Override
public ResourceId unwindowedFilename(
  final int shardNumber,
  final int numShards,
  final FileBasedSink.OutputFileHints outputFileHints) {
  throw new UnsupportedOperationException("Unsupported.");
}
 
Example 30
Source Project: DataflowTemplates   Source File: PubsubToAvro.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Builds the pipeline from the supplied options and runs it.
 *
 * <p>The pipeline reads Pub/Sub messages with attributes, maps them with
 * {@code PubsubMessageToArchiveDoFn}, places them in fixed windows of the configured duration,
 * and writes each window to Avro files.
 *
 * @param options The execution parameters to the pipeline.
 * @return The result of the pipeline execution.
 */
public static PipelineResult run(Options options) {
  Pipeline pipeline = Pipeline.create(options);

  pipeline
      // Source: messages (with attributes) from the configured Pub/Sub topic.
      .apply(
          "Read PubSub Events",
          PubsubIO.readMessagesWithAttributes().fromTopic(options.getInputTopic()))
      .apply("Map to Archive", ParDo.of(new PubsubMessageToArchiveDoFn()))
      // Fixed windows whose length is parsed from the window duration option.
      .apply(
          options.getWindowDuration() + " Window",
          Window.into(FixedWindows.of(DurationUtils.parseDuration(options.getWindowDuration()))))
      // Windowed file writes. The temp directory goes through a NestedValueProvider because
      // its ResourceId is derived from an option value that is read at runtime.
      .apply(
          "Write File(s)",
          AvroIO.write(AvroPubsubMessageRecord.class)
              .to(
                  new WindowedFilenamePolicy(
                      options.getOutputDirectory(),
                      options.getOutputFilenamePrefix(),
                      options.getOutputShardTemplate(),
                      options.getOutputFilenameSuffix()))
              .withTempDirectory(
                  NestedValueProvider.of(
                      options.getAvroTempDirectory(),
                      (SerializableFunction<String, ResourceId>)
                          FileBasedSink::convertToFileResourceIfPossible))
              /*.withTempDirectory(FileSystems.matchNewResource(
                  options.getAvroTempDirectory(),
                  Boolean.TRUE))
                  */
              .withWindowedWrites()
              .withNumShards(options.getNumShards()));

  return pipeline.run();
}