org.apache.beam.sdk.io.WriteFiles Java Examples
The following examples show how to use
org.apache.beam.sdk.io.WriteFiles.
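WriteFiles is the low-level transform behind Beam's file-based write IOs: given a FileBasedSink, it turns a PCollection of records into a set of output files and reports the written filenames as a WriteFilesResult. As a quick orientation before the examples, here is a minimal sketch (not taken from any of the projects below) of the builder API; the output directory and the no-op sink, which mirrors the placeholder sinks in Examples #10 and #12, are assumptions for illustration and cannot actually write files, so the transform is built but not applied to a pipeline.

// Minimal sketch, assuming a placeholder sink; a real sink must return a
// functioning WriteOperation from createWriteOperation().
import org.apache.beam.sdk.io.DefaultFilenamePolicy;
import org.apache.beam.sdk.io.DynamicFileDestinations;
import org.apache.beam.sdk.io.FileBasedSink;
import org.apache.beam.sdk.io.LocalResources;
import org.apache.beam.sdk.io.WriteFiles;
import org.apache.beam.sdk.io.fs.ResourceId;
import org.apache.beam.sdk.options.ValueProvider.StaticValueProvider;

public class WriteFilesSketch {
  public static void main(String[] args) {
    ResourceId outputDirectory =
        LocalResources.fromString("/tmp/writefiles-demo", true /* isDirectory */);
    FileBasedSink.FilenamePolicy policy =
        DefaultFilenamePolicy.fromStandardParameters(
            StaticValueProvider.of(outputDirectory),
            DefaultFilenamePolicy.DEFAULT_UNWINDOWED_SHARD_TEMPLATE,
            "",
            false);

    // Placeholder sink, as in the matcher and factory tests below.
    FileBasedSink<String, Void, String> sink =
        new FileBasedSink<String, Void, String>(
            StaticValueProvider.of(outputDirectory), DynamicFileDestinations.constant(policy)) {
          @Override
          public WriteOperation<Void, String> createWriteOperation() {
            return null;
          }
        };

    // WriteFiles.to(sink) builds the transform; sharding is either fixed with
    // withNumShards, computed with withSharding, or left runner-determined by
    // setting neither. Applying the transform to a PCollection<String> would
    // yield a WriteFilesResult<Void> naming the files that were written.
    WriteFiles<String, Void, String> write = WriteFiles.to(sink).withNumShards(2);
    System.out.println(write.getNumShardsProvider().get()); // prints 2
  }
}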
Example #1
Source File: JdbcAvroIO.java From dbeam with Apache License 2.0
public static PTransform<PCollection<String>, WriteFilesResult<Void>> createWrite(
    String filenamePrefix, String filenameSuffix, Schema schema, JdbcAvroArgs jdbcAvroArgs) {
  filenamePrefix = filenamePrefix.replaceAll("/+$", "") + "/part";
  ValueProvider<ResourceId> prefixProvider =
      StaticValueProvider.of(FileBasedSink.convertToFileResourceIfPossible(filenamePrefix));
  FileBasedSink.FilenamePolicy filenamePolicy =
      DefaultFilenamePolicy.fromStandardParameters(
          prefixProvider, DEFAULT_SHARD_TEMPLATE, filenameSuffix, false);
  final DynamicAvroDestinations<String, Void, String> destinations =
      AvroIO.constantDestinations(
          filenamePolicy,
          schema,
          ImmutableMap.of(),
          // since Beam does not support zstandard
          CodecFactory.nullCodec(),
          SerializableFunctions.identity());
  final FileBasedSink<String, Void, String> sink =
      new JdbcAvroSink<>(prefixProvider, destinations, jdbcAvroArgs);
  return WriteFiles.to(sink);
}
Example #2
Source File: WriteFilesTranslation.java From beam with Apache License 2.0
@Override
public Map<TupleTag<?>, PValue> getAdditionalInputs() {
  Map<TupleTag<?>, PValue> additionalInputs = new HashMap<>();
  for (Map.Entry<String, SideInput> sideInputEntry : payload.getSideInputsMap().entrySet()) {
    try {
      additionalInputs.put(
          new TupleTag<>(sideInputEntry.getKey()),
          rehydratedComponents.getPCollection(
              protoTransform.getInputsOrThrow(sideInputEntry.getKey())));
    } catch (IOException exc) {
      throw new IllegalStateException(
          String.format(
              "Could not find input with name %s for %s transform",
              sideInputEntry.getKey(), WriteFiles.class.getSimpleName()));
    }
  }
  return additionalInputs;
}
Example #3
Source File: WriteFilesTranslationTest.java From beam with Apache License 2.0
@Test
public void testExtractionDirectFromTransform() throws Exception {
  PCollection<String> input = p.apply(Create.of("hello"));
  WriteFilesResult<Void> output = input.apply(writeFiles);

  AppliedPTransform<
          PCollection<String>, WriteFilesResult<Void>, WriteFiles<String, Void, String>>
      appliedPTransform =
          AppliedPTransform.of("foo", input.expand(), output.expand(), writeFiles, p);

  assertThat(
      WriteFilesTranslation.isRunnerDeterminedSharding(appliedPTransform),
      equalTo(
          writeFiles.getNumShardsProvider() == null && writeFiles.getComputeNumShards() == null));

  assertThat(
      WriteFilesTranslation.isWindowedWrites(appliedPTransform),
      equalTo(writeFiles.getWindowedWrites()));

  assertThat(
      WriteFilesTranslation.<String, Void, String>getSink(appliedPTransform),
      equalTo(writeFiles.getSink()));
}
Example #4
Source File: FlinkStreamingPipelineTranslator.java From beam with Apache License 2.0
/**
 * {@link PTransformMatcher} which decides if {@link StreamingShardedWriteFactory} should be
 * applied.
 */
static PTransformMatcher writeFilesNeedsOverrides() {
  return application -> {
    if (WRITE_FILES_TRANSFORM_URN.equals(
        PTransformTranslation.urnForTransformOrNull(application.getTransform()))) {
      try {
        FlinkPipelineOptions options =
            application.getPipeline().getOptions().as(FlinkPipelineOptions.class);
        ShardingFunction shardingFn =
            ((WriteFiles<?, ?, ?>) application.getTransform()).getShardingFunction();
        return WriteFilesTranslation.isRunnerDeterminedSharding((AppliedPTransform) application)
            || (options.isAutoBalanceWriteFilesShardingEnabled() && shardingFn == null);
      } catch (IOException exc) {
        throw new RuntimeException(
            String.format(
                "Transform with URN %s failed to parse: %s",
                WRITE_FILES_TRANSFORM_URN, application.getTransform()),
            exc);
      }
    }
    return false;
  };
}
Example #5
Source File: FlinkTransformOverridesTest.java From beam with Apache License 2.0
@Test
public void testRunnerDeterminedSharding() {
  FlinkPipelineOptions options = PipelineOptionsFactory.as(FlinkPipelineOptions.class);
  options.setRunner(TestFlinkRunner.class);
  options.setFlinkMaster("[auto]");
  options.setParallelism(5);

  TestPipeline p = TestPipeline.fromOptions(options);

  StreamingShardedWriteFactory<Object, Void, Object> factory =
      new StreamingShardedWriteFactory<>(p.getOptions());

  WriteFiles<Object, Void, Object> original = WriteFiles.to(new TestSink(tmpFolder.toString()));

  @SuppressWarnings("unchecked")
  PCollection<Object> objs = (PCollection) p.apply(Create.empty(VoidCoder.of()));

  AppliedPTransform<PCollection<Object>, WriteFilesResult<Void>, WriteFiles<Object, Void, Object>>
      originalApplication =
          AppliedPTransform.of("writefiles", objs.expand(), Collections.emptyMap(), original, p);

  WriteFiles<Object, Void, Object> replacement =
      (WriteFiles<Object, Void, Object>)
          factory.getReplacementTransform(originalApplication).getTransform();

  assertThat(replacement, not(equalTo((Object) original)));
  assertThat(replacement.getNumShardsProvider().get(), is(10));
}
Example #6
Source File: DataflowRunnerTest.java From beam with Apache License 2.0
private void testStreamingWriteOverride(PipelineOptions options, int expectedNumShards) {
  TestPipeline p = TestPipeline.fromOptions(options);

  StreamingShardedWriteFactory<Object, Void, Object> factory =
      new StreamingShardedWriteFactory<>(p.getOptions());
  WriteFiles<Object, Void, Object> original = WriteFiles.to(new TestSink(tmpFolder.toString()));
  PCollection<Object> objs = (PCollection) p.apply(Create.empty(VoidCoder.of()));
  AppliedPTransform<PCollection<Object>, WriteFilesResult<Void>, WriteFiles<Object, Void, Object>>
      originalApplication =
          AppliedPTransform.of("writefiles", objs.expand(), Collections.emptyMap(), original, p);

  WriteFiles<Object, Void, Object> replacement =
      (WriteFiles<Object, Void, Object>)
          factory.getReplacementTransform(originalApplication).getTransform();

  assertThat(replacement, not(equalTo((Object) original)));
  assertThat(replacement.getNumShardsProvider().get(), equalTo(expectedNumShards));

  WriteFilesResult<Void> originalResult = objs.apply(original);
  WriteFilesResult<Void> replacementResult = objs.apply(replacement);
  Map<PValue, ReplacementOutput> res =
      factory.mapOutputs(originalResult.expand(), replacementResult);
  assertEquals(1, res.size());
  assertEquals(
      originalResult.getPerDestinationOutputFilenames(),
      res.get(replacementResult.getPerDestinationOutputFilenames()).getOriginal().getValue());
}
Example #7
Source File: WriteWithShardingFactory.java From beam with Apache License 2.0
@Override
public PTransformReplacement<PCollection<InputT>, WriteFilesResult<DestinationT>>
    getReplacementTransform(
        AppliedPTransform<
                PCollection<InputT>,
                WriteFilesResult<DestinationT>,
                PTransform<PCollection<InputT>, WriteFilesResult<DestinationT>>>
            transform) {
  try {
    WriteFiles<InputT, DestinationT, ?> replacement =
        WriteFiles.to(WriteFilesTranslation.getSink(transform))
            .withSideInputs(WriteFilesTranslation.getDynamicDestinationSideInputs(transform))
            .withSharding(new LogElementShardsWithDrift<>());
    if (WriteFilesTranslation.isWindowedWrites(transform)) {
      replacement = replacement.withWindowedWrites();
    }
    return PTransformReplacement.of(
        PTransformReplacements.getSingletonMainInput(transform), replacement);
  } catch (IOException e) {
    throw new RuntimeException(e);
  }
}
Example #8
Source File: WriteFilesTranslation.java From beam with Apache License 2.0
@Override
public FunctionSpec translate(
    AppliedPTransform<?, ?, WriteFiles<?, ?, ?>> transform, SdkComponents components)
    throws IOException {
  return FunctionSpec.newBuilder()
      .setUrn(getUrn(transform.getTransform()))
      .setPayload(payloadForWriteFiles(transform.getTransform(), components).toByteString())
      .build();
}
Example #9
Source File: WriteFilesTranslationTest.java From beam with Apache License 2.0
@Parameters(name = "{index}: {0}")
public static Iterable<WriteFiles<Object, Void, Object>> data() {
  return ImmutableList.of(
      WriteFiles.to(new DummySink()),
      WriteFiles.to(new DummySink()).withWindowedWrites(),
      WriteFiles.to(new DummySink()).withNumShards(17),
      WriteFiles.to(new DummySink()).withWindowedWrites().withNumShards(42));
}
Example #10
Source File: PTransformMatchersTest.java From beam with Apache License 2.0
@Test
public void writeWithRunnerDeterminedSharding() {
  ResourceId outputDirectory = LocalResources.fromString("/foo/bar", true /* isDirectory */);
  FilenamePolicy policy =
      DefaultFilenamePolicy.fromStandardParameters(
          StaticValueProvider.of(outputDirectory),
          DefaultFilenamePolicy.DEFAULT_UNWINDOWED_SHARD_TEMPLATE,
          "",
          false);
  WriteFiles<Integer, Void, Integer> write =
      WriteFiles.to(
          new FileBasedSink<Integer, Void, Integer>(
              StaticValueProvider.of(outputDirectory), DynamicFileDestinations.constant(policy)) {
            @Override
            public WriteOperation<Void, Integer> createWriteOperation() {
              return null;
            }
          });
  assertThat(
      PTransformMatchers.writeWithRunnerDeterminedSharding().matches(appliedWrite(write)),
      is(true));

  WriteFiles<Integer, Void, Integer> withStaticSharding = write.withNumShards(3);
  assertThat(
      PTransformMatchers.writeWithRunnerDeterminedSharding()
          .matches(appliedWrite(withStaticSharding)),
      is(false));

  WriteFiles<Integer, Void, Integer> withCustomSharding =
      write.withSharding(Sum.integersGlobally().asSingletonView());
  assertThat(
      PTransformMatchers.writeWithRunnerDeterminedSharding()
          .matches(appliedWrite(withCustomSharding)),
      is(false));
}
Example #11
Source File: DataflowRunner.java From beam with Apache License 2.0
@Override
public PTransformReplacement<PCollection<UserT>, WriteFilesResult<DestinationT>>
    getReplacementTransform(
        AppliedPTransform<
                PCollection<UserT>,
                WriteFilesResult<DestinationT>,
                WriteFiles<UserT, DestinationT, OutputT>>
            transform) {
  // By default, if numShards is not set WriteFiles will produce one file per bundle. In
  // streaming, there are large numbers of small bundles, resulting in many tiny files.
  // Instead we pick max workers * 2 to ensure full parallelism, but prevent too-many files.
  // (current_num_workers * 2 might be a better choice, but that value is not easily available
  // today).
  // If the user does not set either numWorkers or maxNumWorkers, default to 10 shards.
  int numShards;
  if (options.getMaxNumWorkers() > 0) {
    numShards = options.getMaxNumWorkers() * 2;
  } else if (options.getNumWorkers() > 0) {
    numShards = options.getNumWorkers() * 2;
  } else {
    numShards = DEFAULT_NUM_SHARDS;
  }

  try {
    List<PCollectionView<?>> sideInputs =
        WriteFilesTranslation.getDynamicDestinationSideInputs(transform);
    FileBasedSink sink = WriteFilesTranslation.getSink(transform);
    WriteFiles<UserT, DestinationT, OutputT> replacement =
        WriteFiles.to(sink).withSideInputs(sideInputs);
    if (WriteFilesTranslation.isWindowedWrites(transform)) {
      replacement = replacement.withWindowedWrites();
    }
    return PTransformReplacement.of(
        PTransformReplacements.getSingletonMainInput(transform),
        replacement.withNumShards(numShards));
  } catch (Exception e) {
    throw new RuntimeException(e);
  }
}
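The shard-count rule in the comment above is simple arithmetic; the standalone sketch below merely restates it with assumed worker counts so the branches are easy to see (the fallback of 10 comes from the comment in the source, not from an API call).

// Illustration only; mirrors the numShards selection in the example above.
public class ShardCountRule {
  static final int DEFAULT_NUM_SHARDS = 10; // the fallback named in the comment above

  static int numShards(int maxNumWorkers, int numWorkers) {
    if (maxNumWorkers > 0) {
      return maxNumWorkers * 2; // full parallelism at the maximum worker count
    } else if (numWorkers > 0) {
      return numWorkers * 2; // full parallelism at the fixed worker count
    }
    return DEFAULT_NUM_SHARDS; // neither numWorkers nor maxNumWorkers was set
  }

  public static void main(String[] args) {
    System.out.println(numShards(25, 0)); // maxNumWorkers=25 -> 50 shards
    System.out.println(numShards(0, 5));  // numWorkers=5     -> 10 shards
    System.out.println(numShards(0, 0));  // neither set      -> 10 shards
  }
}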
Example #12
Source File: WriteWithShardingFactoryTest.java From beam with Apache License 2.0
@Test
public void withNoShardingSpecifiedReturnsNewTransform() {
  ResourceId outputDirectory = LocalResources.fromString("/foo", true /* isDirectory */);

  PTransform<PCollection<Object>, WriteFilesResult<Void>> original =
      WriteFiles.to(
          new FileBasedSink<Object, Void, Object>(
              StaticValueProvider.of(outputDirectory),
              DynamicFileDestinations.constant(new FakeFilenamePolicy())) {
            @Override
            public WriteOperation<Void, Object> createWriteOperation() {
              throw new IllegalArgumentException("Should not be used");
            }
          });
  @SuppressWarnings("unchecked")
  PCollection<Object> objs = (PCollection) p.apply(Create.empty(VoidCoder.of()));

  AppliedPTransform<
          PCollection<Object>,
          WriteFilesResult<Void>,
          PTransform<PCollection<Object>, WriteFilesResult<Void>>>
      originalApplication =
          AppliedPTransform.of("write", objs.expand(), Collections.emptyMap(), original, p);

  assertThat(
      factory.getReplacementTransform(originalApplication).getTransform(),
      not(equalTo((Object) original)));
}
Example #13
Source File: WriteFilesTranslation.java From beam with Apache License 2.0
@Override
public String getUrn(WriteFiles<?, ?, ?> transform) {
  return WRITE_FILES_TRANSFORM_URN;
}
Example #14
Source File: WriteFilesTranslation.java From beam with Apache License 2.0
@Override
public Map<Class<? extends PTransform>, TransformPayloadTranslator>
    getTransformPayloadTranslators() {
  return Collections.singletonMap(WriteFiles.CONCRETE_CLASS, new WriteFilesTranslator());
}
Example #15
Source File: PTransformMatchersTest.java From beam with Apache License 2.0
private AppliedPTransform<?, ?, ?> appliedWrite(WriteFiles<Integer, Void, Integer> write) {
  return AppliedPTransform.of(
      "WriteFiles", Collections.emptyMap(), Collections.emptyMap(), write, p);
}
Example #16
Source File: FlinkStreamingPipelineTranslator.java From beam with Apache License 2.0
@Override
public PTransformReplacement<PCollection<UserT>, WriteFilesResult<DestinationT>>
    getReplacementTransform(
        AppliedPTransform<
                PCollection<UserT>,
                WriteFilesResult<DestinationT>,
                WriteFiles<UserT, DestinationT, OutputT>>
            transform) {
  // By default, if numShards is not set WriteFiles will produce one file per bundle. In
  // streaming, there are large numbers of small bundles, resulting in many tiny files.
  // Instead we pick parallelism * 2 to ensure full parallelism, but prevent too-many files.
  Integer jobParallelism = options.getParallelism();

  Preconditions.checkArgument(
      jobParallelism > 0,
      "Parallelism of a job should be greater than 0. Currently set: %s",
      jobParallelism);
  int numShards = jobParallelism * 2;

  try {
    List<PCollectionView<?>> sideInputs =
        WriteFilesTranslation.getDynamicDestinationSideInputs(transform);
    FileBasedSink sink = WriteFilesTranslation.getSink(transform);

    @SuppressWarnings("unchecked")
    WriteFiles<UserT, DestinationT, OutputT> replacement =
        WriteFiles.to(sink).withSideInputs(sideInputs);
    if (WriteFilesTranslation.isWindowedWrites(transform)) {
      replacement = replacement.withWindowedWrites();
    }

    if (WriteFilesTranslation.isRunnerDeterminedSharding(transform)) {
      replacement = replacement.withNumShards(numShards);
    } else {
      if (transform.getTransform().getNumShardsProvider() != null) {
        replacement = replacement.withNumShards(transform.getTransform().getNumShardsProvider());
      }
      if (transform.getTransform().getComputeNumShards() != null) {
        replacement = replacement.withSharding(transform.getTransform().getComputeNumShards());
      }
    }

    if (options.isAutoBalanceWriteFilesShardingEnabled()) {
      replacement =
          replacement.withShardingFunction(
              new FlinkAutoBalancedShardKeyShardingFunction<>(
                  jobParallelism,
                  options.getMaxParallelism(),
                  sink.getDynamicDestinations().getDestinationCoder()));
    }

    return PTransformReplacement.of(
        PTransformReplacements.getSingletonMainInput(transform), replacement);
  } catch (Exception e) {
    throw new RuntimeException(e);
  }
}
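Note how this override lines up with Example #5: with the Flink parallelism of 5 configured there and a WriteFiles transform whose sharding is runner-determined, this factory computes parallelism * 2 = 10 shards, which is exactly the value testRunnerDeterminedSharding asserts on the replacement transform.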