org.apache.beam.sdk.io.WriteFiles Java Examples

The following examples show how to use org.apache.beam.sdk.io.WriteFiles, drawn from open-source projects. The source file and originating project are noted above each example.
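
Before the individual examples, here is a minimal, hedged sketch of the core pattern most of them elaborate: build a FileBasedSink, wrap it with WriteFiles.to(...), and apply it to a PCollection. The core SDK ships no public concrete FileBasedSink, so the anonymous sink below is a stub whose write operation is deliberately left unimplemented; the pipeline, output directory, suffix, and shard count are illustrative placeholders, not taken from any example on this page.

Pipeline pipeline = Pipeline.create();
ResourceId outputDirectory = LocalResources.fromString("/tmp/out", true /* isDirectory */);
FileBasedSink.FilenamePolicy policy =
    DefaultFilenamePolicy.fromStandardParameters(
        StaticValueProvider.of(outputDirectory),
        DefaultFilenamePolicy.DEFAULT_UNWINDOWED_SHARD_TEMPLATE,
        "" /* filenameSuffix */,
        false /* windowedWrites */);
FileBasedSink<String, Void, String> sink =
    new FileBasedSink<String, Void, String>(
        StaticValueProvider.of(outputDirectory), DynamicFileDestinations.constant(policy)) {
      @Override
      public WriteOperation<Void, String> createWriteOperation() {
        throw new UnsupportedOperationException("stub sink, for illustration only");
      }
    };
WriteFilesResult<Void> result =
    pipeline
        .apply(Create.of("hello", "world"))
        .apply(WriteFiles.to(sink).withNumShards(2)); // omit withNumShards for runner-determined sharding
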
Example #1
Source File: JdbcAvroIO.java    From dbeam with Apache License 2.0
public static PTransform<PCollection<String>, WriteFilesResult<Void>> createWrite(
    String filenamePrefix, String filenameSuffix, Schema schema, JdbcAvroArgs jdbcAvroArgs) {
  filenamePrefix = filenamePrefix.replaceAll("/+$", "") + "/part";
  ValueProvider<ResourceId> prefixProvider =
      StaticValueProvider.of(FileBasedSink.convertToFileResourceIfPossible(filenamePrefix));
  FileBasedSink.FilenamePolicy filenamePolicy =
      DefaultFilenamePolicy.fromStandardParameters(
          prefixProvider, DEFAULT_SHARD_TEMPLATE, filenameSuffix, false);

  final DynamicAvroDestinations<String, Void, String> destinations =
      AvroIO.constantDestinations(
          filenamePolicy,
          schema,
          ImmutableMap.of(),
          // use the null codec here, since Beam does not support zstandard
          CodecFactory.nullCodec(),
          SerializableFunctions.identity());
  final FileBasedSink<String, Void, String> sink =
      new JdbcAvroSink<>(prefixProvider, destinations, jdbcAvroArgs);
  return WriteFiles.to(sink);
}
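
A hedged usage sketch of the transform built above; queryResults (a PCollection<String> of extracted rows), the output path, and the jdbcAvroArgs instance are hypothetical placeholders:

WriteFilesResult<Void> written =
    queryResults.apply(
        "WriteJdbcAvro",
        createWrite("gs://my-bucket/output", ".avro", schema, jdbcAvroArgs));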
 
Example #2
Source File: WriteFilesTranslation.java    From beam with Apache License 2.0
@Override
public Map<TupleTag<?>, PValue> getAdditionalInputs() {
  Map<TupleTag<?>, PValue> additionalInputs = new HashMap<>();
  for (Map.Entry<String, SideInput> sideInputEntry : payload.getSideInputsMap().entrySet()) {
    try {
      additionalInputs.put(
          new TupleTag<>(sideInputEntry.getKey()),
          rehydratedComponents.getPCollection(
              protoTransform.getInputsOrThrow(sideInputEntry.getKey())));
    } catch (IOException exc) {
      throw new IllegalStateException(
          String.format(
              "Could not find input with name %s for %s transform",
              sideInputEntry.getKey(), WriteFiles.class.getSimpleName()),
          exc); // preserve the underlying IOException as the cause
    }
  }
  return additionalInputs;
}
 
Example #3
Source File: WriteFilesTranslationTest.java    From beam with Apache License 2.0
@Test
public void testExtractionDirectFromTransform() throws Exception {
  PCollection<String> input = p.apply(Create.of("hello"));
  WriteFilesResult<Void> output = input.apply(writeFiles);

  AppliedPTransform<PCollection<String>, WriteFilesResult<Void>, WriteFiles<String, Void, String>>
      appliedPTransform =
          AppliedPTransform.of("foo", input.expand(), output.expand(), writeFiles, p);

  assertThat(
      WriteFilesTranslation.isRunnerDeterminedSharding(appliedPTransform),
      equalTo(
          writeFiles.getNumShardsProvider() == null && writeFiles.getComputeNumShards() == null));

  assertThat(
      WriteFilesTranslation.isWindowedWrites(appliedPTransform),
      equalTo(writeFiles.getWindowedWrites()));
  assertThat(
      WriteFilesTranslation.<String, Void, String>getSink(appliedPTransform),
      equalTo(writeFiles.getSink()));
}
 
Example #4
Source File: FlinkStreamingPipelineTranslator.java    From beam with Apache License 2.0
/**
 * {@link PTransformMatcher} which decides if {@link StreamingShardedWriteFactory} should be
 * applied.
 */
static PTransformMatcher writeFilesNeedsOverrides() {
  return application -> {
    if (WRITE_FILES_TRANSFORM_URN.equals(
        PTransformTranslation.urnForTransformOrNull(application.getTransform()))) {
      try {
        FlinkPipelineOptions options =
            application.getPipeline().getOptions().as(FlinkPipelineOptions.class);
        ShardingFunction shardingFn =
            ((WriteFiles<?, ?, ?>) application.getTransform()).getShardingFunction();
        return WriteFilesTranslation.isRunnerDeterminedSharding((AppliedPTransform) application)
            || (options.isAutoBalanceWriteFilesShardingEnabled() && shardingFn == null);
      } catch (IOException exc) {
        throw new RuntimeException(
            String.format(
                "Transform with URN %s failed to parse: %s",
                WRITE_FILES_TRANSFORM_URN, application.getTransform()),
            exc);
      }
    }
    return false;
  };
}
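
As a hedged sketch of how such a matcher is typically wired up (modeled on how runner translators register overrides; the options and pipeline variables are assumed to be in scope), the matcher is paired with the replacement factory and handed to Pipeline#replaceAll:

List<PTransformOverride> overrides =
    ImmutableList.of(
        PTransformOverride.of(
            writeFilesNeedsOverrides(), new StreamingShardedWriteFactory<>(options)));
pipeline.replaceAll(overrides);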
 
Example #5
Source File: FlinkTransformOverridesTest.java    From beam with Apache License 2.0
@Test
public void testRunnerDeterminedSharding() {
  FlinkPipelineOptions options = PipelineOptionsFactory.as(FlinkPipelineOptions.class);
  options.setRunner(TestFlinkRunner.class);
  options.setFlinkMaster("[auto]");
  options.setParallelism(5);

  TestPipeline p = TestPipeline.fromOptions(options);

  StreamingShardedWriteFactory<Object, Void, Object> factory =
      new StreamingShardedWriteFactory<>(p.getOptions());

  WriteFiles<Object, Void, Object> original = WriteFiles.to(new TestSink(tmpFolder.toString()));
  @SuppressWarnings("unchecked")
  PCollection<Object> objs = (PCollection) p.apply(Create.empty(VoidCoder.of()));
  AppliedPTransform<PCollection<Object>, WriteFilesResult<Void>, WriteFiles<Object, Void, Object>>
      originalApplication =
          AppliedPTransform.of("writefiles", objs.expand(), Collections.emptyMap(), original, p);

  WriteFiles<Object, Void, Object> replacement =
      (WriteFiles<Object, Void, Object>)
          factory.getReplacementTransform(originalApplication).getTransform();

  assertThat(replacement, not(equalTo((Object) original)));
  assertThat(replacement.getNumShardsProvider().get(), is(10));
}
 
Example #6
Source File: DataflowRunnerTest.java    From beam with Apache License 2.0
private void testStreamingWriteOverride(PipelineOptions options, int expectedNumShards) {
  TestPipeline p = TestPipeline.fromOptions(options);

  StreamingShardedWriteFactory<Object, Void, Object> factory =
      new StreamingShardedWriteFactory<>(p.getOptions());
  WriteFiles<Object, Void, Object> original = WriteFiles.to(new TestSink(tmpFolder.toString()));
  @SuppressWarnings("unchecked")
  PCollection<Object> objs = (PCollection) p.apply(Create.empty(VoidCoder.of()));
  AppliedPTransform<PCollection<Object>, WriteFilesResult<Void>, WriteFiles<Object, Void, Object>>
      originalApplication =
          AppliedPTransform.of("writefiles", objs.expand(), Collections.emptyMap(), original, p);

  WriteFiles<Object, Void, Object> replacement =
      (WriteFiles<Object, Void, Object>)
          factory.getReplacementTransform(originalApplication).getTransform();
  assertThat(replacement, not(equalTo((Object) original)));
  assertThat(replacement.getNumShardsProvider().get(), equalTo(expectedNumShards));

  WriteFilesResult<Void> originalResult = objs.apply(original);
  WriteFilesResult<Void> replacementResult = objs.apply(replacement);
  Map<PValue, ReplacementOutput> res =
      factory.mapOutputs(originalResult.expand(), replacementResult);
  assertEquals(1, res.size());
  assertEquals(
      originalResult.getPerDestinationOutputFilenames(),
      res.get(replacementResult.getPerDestinationOutputFilenames()).getOriginal().getValue());
}
 
Example #7
Source File: WriteWithShardingFactory.java    From beam with Apache License 2.0
@Override
public PTransformReplacement<PCollection<InputT>, WriteFilesResult<DestinationT>>
    getReplacementTransform(
        AppliedPTransform<
                PCollection<InputT>,
                WriteFilesResult<DestinationT>,
                PTransform<PCollection<InputT>, WriteFilesResult<DestinationT>>>
            transform) {
  try {
    WriteFiles<InputT, DestinationT, ?> replacement =
        WriteFiles.to(WriteFilesTranslation.getSink(transform))
            .withSideInputs(WriteFilesTranslation.getDynamicDestinationSideInputs(transform))
            .withSharding(new LogElementShardsWithDrift<>());
    if (WriteFilesTranslation.isWindowedWrites(transform)) {
      replacement = replacement.withWindowedWrites();
    }
    return PTransformReplacement.of(
        PTransformReplacements.getSingletonMainInput(transform), replacement);
  } catch (IOException e) {
    throw new RuntimeException(e);
  }
}
 
Example #8
Source File: WriteFilesTranslation.java    From beam with Apache License 2.0
@Override
public FunctionSpec translate(
    AppliedPTransform<?, ?, WriteFiles<?, ?, ?>> transform, SdkComponents components)
    throws IOException {
  return FunctionSpec.newBuilder()
      .setUrn(getUrn(transform.getTransform()))
      .setPayload(payloadForWriteFiles(transform.getTransform(), components).toByteString())
      .build();
}
 
Example #9
Source File: WriteFilesTranslationTest.java    From beam with Apache License 2.0
@Parameters(name = "{index}: {0}")
public static Iterable<WriteFiles<Object, Void, Object>> data() {
  return ImmutableList.of(
      WriteFiles.to(new DummySink()),
      WriteFiles.to(new DummySink()).withWindowedWrites(),
      WriteFiles.to(new DummySink()).withNumShards(17),
      WriteFiles.to(new DummySink()).withWindowedWrites().withNumShards(42));
}
 
Example #10
Source File: PTransformMatchersTest.java    From beam with Apache License 2.0
@Test
public void writeWithRunnerDeterminedSharding() {
  ResourceId outputDirectory = LocalResources.fromString("/foo/bar", true /* isDirectory */);
  FilenamePolicy policy =
      DefaultFilenamePolicy.fromStandardParameters(
          StaticValueProvider.of(outputDirectory),
          DefaultFilenamePolicy.DEFAULT_UNWINDOWED_SHARD_TEMPLATE,
          "",
          false);
  WriteFiles<Integer, Void, Integer> write =
      WriteFiles.to(
          new FileBasedSink<Integer, Void, Integer>(
              StaticValueProvider.of(outputDirectory), DynamicFileDestinations.constant(policy)) {
            @Override
            public WriteOperation<Void, Integer> createWriteOperation() {
              return null;
            }
          });
  assertThat(
      PTransformMatchers.writeWithRunnerDeterminedSharding().matches(appliedWrite(write)),
      is(true));

  WriteFiles<Integer, Void, Integer> withStaticSharding = write.withNumShards(3);
  assertThat(
      PTransformMatchers.writeWithRunnerDeterminedSharding()
          .matches(appliedWrite(withStaticSharding)),
      is(false));

  WriteFiles<Integer, Void, Integer> withCustomSharding =
      write.withSharding(Sum.integersGlobally().asSingletonView());
  assertThat(
      PTransformMatchers.writeWithRunnerDeterminedSharding()
          .matches(appliedWrite(withCustomSharding)),
      is(false));
}
 
Example #11
Source File: DataflowRunner.java    From beam with Apache License 2.0
@Override
public PTransformReplacement<PCollection<UserT>, WriteFilesResult<DestinationT>>
    getReplacementTransform(
        AppliedPTransform<
                PCollection<UserT>,
                WriteFilesResult<DestinationT>,
                WriteFiles<UserT, DestinationT, OutputT>>
            transform) {
  // By default, if numShards is not set, WriteFiles will produce one file per bundle. In
  // streaming, there are large numbers of small bundles, resulting in many tiny files.
  // Instead we pick max workers * 2 to ensure full parallelism while preventing too many
  // files. (current_num_workers * 2 might be a better choice, but that value is not easily
  // available today.)
  // If the user does not set either numWorkers or maxNumWorkers, default to 10 shards.
  int numShards;
  if (options.getMaxNumWorkers() > 0) {
    numShards = options.getMaxNumWorkers() * 2;
  } else if (options.getNumWorkers() > 0) {
    numShards = options.getNumWorkers() * 2;
  } else {
    numShards = DEFAULT_NUM_SHARDS;
  }

  try {
    List<PCollectionView<?>> sideInputs =
        WriteFilesTranslation.getDynamicDestinationSideInputs(transform);
    FileBasedSink sink = WriteFilesTranslation.getSink(transform);
    WriteFiles<UserT, DestinationT, OutputT> replacement =
        WriteFiles.to(sink).withSideInputs(sideInputs);
    if (WriteFilesTranslation.isWindowedWrites(transform)) {
      replacement = replacement.withWindowedWrites();
    }
    return PTransformReplacement.of(
        PTransformReplacements.getSingletonMainInput(transform),
        replacement.withNumShards(numShards));
  } catch (Exception e) {
    throw new RuntimeException(e);
  }
}
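
To make the arithmetic above concrete: with maxNumWorkers set to 25 the replacement is built with withNumShards(50); with only numWorkers set to 3 it gets 6 shards; and with neither option set it falls back to DEFAULT_NUM_SHARDS. Example #6 above exercises exactly this override and asserts the resulting shard count.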
 
Example #12
Source File: WriteWithShardingFactoryTest.java    From beam with Apache License 2.0
@Test
public void withNoShardingSpecifiedReturnsNewTransform() {
  ResourceId outputDirectory = LocalResources.fromString("/foo", true /* isDirectory */);

  PTransform<PCollection<Object>, WriteFilesResult<Void>> original =
      WriteFiles.to(
          new FileBasedSink<Object, Void, Object>(
              StaticValueProvider.of(outputDirectory),
              DynamicFileDestinations.constant(new FakeFilenamePolicy())) {
            @Override
            public WriteOperation<Void, Object> createWriteOperation() {
              throw new IllegalArgumentException("Should not be used");
            }
          });
  @SuppressWarnings("unchecked")
  PCollection<Object> objs = (PCollection) p.apply(Create.empty(VoidCoder.of()));

  AppliedPTransform<
          PCollection<Object>,
          WriteFilesResult<Void>,
          PTransform<PCollection<Object>, WriteFilesResult<Void>>>
      originalApplication =
          AppliedPTransform.of("write", objs.expand(), Collections.emptyMap(), original, p);

  assertThat(
      factory.getReplacementTransform(originalApplication).getTransform(),
      not(equalTo((Object) original)));
}
 
Example #13
Source File: WriteFilesTranslation.java    From beam with Apache License 2.0
@Override
public String getUrn(WriteFiles<?, ?, ?> transform) {
  return WRITE_FILES_TRANSFORM_URN;
}
 
Example #14
Source File: WriteFilesTranslation.java    From beam with Apache License 2.0
@Override
public Map<Class<? extends PTransform>, TransformPayloadTranslator>
    getTransformPayloadTranslators() {
  return Collections.singletonMap(WriteFiles.CONCRETE_CLASS, new WriteFilesTranslator());
}
 
Example #15
Source File: PTransformMatchersTest.java    From beam with Apache License 2.0
private AppliedPTransform<?, ?, ?> appliedWrite(WriteFiles<Integer, Void, Integer> write) {
  return AppliedPTransform.of(
      "WriteFiles", Collections.emptyMap(), Collections.emptyMap(), write, p);
}
 
Example #16
Source File: FlinkStreamingPipelineTranslator.java    From beam with Apache License 2.0
@Override
public PTransformReplacement<PCollection<UserT>, WriteFilesResult<DestinationT>>
    getReplacementTransform(
        AppliedPTransform<
                PCollection<UserT>,
                WriteFilesResult<DestinationT>,
                WriteFiles<UserT, DestinationT, OutputT>>
            transform) {
  // By default, if numShards is not set, WriteFiles will produce one file per bundle. In
  // streaming, there are large numbers of small bundles, resulting in many tiny files.
  // Instead we pick parallelism * 2 to ensure full parallelism while preventing too many files.
  Integer jobParallelism = options.getParallelism();

  Preconditions.checkArgument(
      jobParallelism > 0,
      "Parallelism of a job should be greater than 0. Currently set: %s",
      jobParallelism);
  int numShards = jobParallelism * 2;

  try {
    List<PCollectionView<?>> sideInputs =
        WriteFilesTranslation.getDynamicDestinationSideInputs(transform);
    FileBasedSink sink = WriteFilesTranslation.getSink(transform);

    @SuppressWarnings("unchecked")
    WriteFiles<UserT, DestinationT, OutputT> replacement =
        WriteFiles.to(sink).withSideInputs(sideInputs);
    if (WriteFilesTranslation.isWindowedWrites(transform)) {
      replacement = replacement.withWindowedWrites();
    }

    if (WriteFilesTranslation.isRunnerDeterminedSharding(transform)) {
      replacement = replacement.withNumShards(numShards);
    } else {
      if (transform.getTransform().getNumShardsProvider() != null) {
        replacement =
            replacement.withNumShards(transform.getTransform().getNumShardsProvider());
      }
      if (transform.getTransform().getComputeNumShards() != null) {
        replacement = replacement.withSharding(transform.getTransform().getComputeNumShards());
      }
    }

    if (options.isAutoBalanceWriteFilesShardingEnabled()) {
      replacement =
          replacement.withShardingFunction(
              new FlinkAutoBalancedShardKeyShardingFunction<>(
                  jobParallelism,
                  options.getMaxParallelism(),
                  sink.getDynamicDestinations().getDestinationCoder()));
    }

    return PTransformReplacement.of(
        PTransformReplacements.getSingletonMainInput(transform), replacement);
  } catch (Exception e) {
    throw new RuntimeException(e);
  }
}
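
A closing note on this last override: when auto-balanced sharding is enabled, the replacement swaps in a sharding function built from the job's parallelism, its max parallelism, and the sink's destination coder, as the constructor arguments above show. This is the same configuration the matcher in Example #4 checks for; that matcher triggers this factory whenever the option is set and no custom sharding function has been supplied.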