org.apache.beam.sdk.transforms.DoFn Java Examples
The following examples show how to use
org.apache.beam.sdk.transforms.DoFn.
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: HllCount.java From beam with Apache License 2.0 | 6 votes |
/**
 * Maps each serialized sketch to the value returned by
 * {@code HyperLogLogPlusPlus.forProto(sketch).result()}.
 *
 * <p>A {@code null} sketch and a zero-length sketch both produce {@code 0L}; a {@code null}
 * additionally triggers a warning.
 */
@Override
public PCollection<Long> expand(PCollection<byte[]> input) {
  return input.apply(
      ParDo.of(
          new DoFn<byte[], Long>() {
            @ProcessElement
            public void processElement(@Element byte[] sketch, OutputReceiver<Long> receiver) {
              // Non-empty sketches are the only ones that need decoding.
              if (sketch != null && sketch.length != 0) {
                receiver.output(HyperLogLogPlusPlus.forProto(sketch).result());
                return;
              }
              if (sketch == null) {
                LOG.warn(
                    "Received a null and treated it as an empty sketch. "
                        + "Consider replacing nulls with empty byte arrays (byte[0]) "
                        + "in upstream transforms for better space-efficiency and safety.");
              }
              // Null and empty sketches both count as zero.
              receiver.output(0L);
            }
          }));
}
Example #2
Source File: SelectEvent.java From beam with Apache License 2.0 | 6 votes |
/**
 * Projects every input element onto the nested row found at the index returned by
 * {@code getNestedIndex(input.getSchema())}.
 *
 * @throws RuntimeException if {@code input} has no schema
 */
@Override
public PCollection<Row> expand(PCollection<Event> input) {
  if (!input.hasSchema()) {
    throw new RuntimeException("Input PCollection must have a schema!");
  }
  int nestedIndex = getNestedIndex(input.getSchema());

  DoFn<Event, Row> selectNested =
      new DoFn<Event, Row>() {
        @ProcessElement
        public void processElement(@Element Row row, OutputReceiver<Row> out) {
          out.output(row.getRow(nestedIndex));
        }
      };

  // The output schema is the row schema of the selected nested field.
  return input
      .apply(ParDo.of(selectNested))
      .setRowSchema(input.getSchema().getField(nestedIndex).getType().getRowSchema());
}
Example #3
Source File: ByteBuddyOnTimerInvokerFactory.java From beam with Apache License 2.0 | 6 votes |
/**
 * Builds an {@link OnTimerInvoker} for the given {@link DoFn} instance and timer id by looking up
 * a constructor in {@code constructorCache} and invoking it with {@code fn}.
 *
 * @throws RuntimeException wrapping any reflective or cache-lookup failure
 */
@Override
public <InputT, OutputT> OnTimerInvoker<InputT, OutputT> forTimer(
    DoFn<InputT, OutputT> fn, String timerId) {

  @SuppressWarnings("unchecked")
  Class<? extends DoFn<?, ?>> doFnType = (Class<? extends DoFn<?, ?>>) fn.getClass();

  try {
    // The cache is keyed by the (DoFn class, timer id) pair.
    OnTimerMethodSpecifier cacheKey = OnTimerMethodSpecifier.forClassAndTimerId(doFnType, timerId);
    Constructor<?> invokerConstructor = constructorCache.get(cacheKey);
    return (OnTimerInvoker<InputT, OutputT>) invokerConstructor.newInstance(fn);
  } catch (InstantiationException
      | IllegalAccessException
      | IllegalArgumentException
      | InvocationTargetException
      | SecurityException
      | ExecutionException e) {
    String message =
        String.format(
            "Unable to construct @%s invoker for %s",
            OnTimer.class.getSimpleName(), fn.getClass().getName());
    throw new RuntimeException(message, e);
  }
}
Example #4
Source File: StatefulDoFnRunnerTest.java From beam with Apache License 2.0 | 6 votes |
/**
 * Creates a {@link SimpleDoFnRunner} around {@code fn} using this test's fixtures.
 *
 * @param fn the DoFn to run
 * @param outputManager where outputs go; when {@code null}, the result of
 *     {@code discardingOutputManager()} is used instead
 */
private DoFnRunner<KV<String, Integer>, Integer> getDoFnRunner(
    DoFn<KV<String, Integer>, Integer> fn, @Nullable OutputManager outputManager) {
  OutputManager effectiveOutputManager =
      MoreObjects.firstNonNull(outputManager, discardingOutputManager());

  return new SimpleDoFnRunner<>(
      null,
      fn,
      NullSideInputReader.empty(),
      effectiveOutputManager,
      outputTag,
      Collections.emptyList(),
      mockStepContext,
      null,
      Collections.emptyMap(),
      WINDOWING_STRATEGY,
      DoFnSchemaInformation.create(),
      Collections.emptyMap());
}
Example #5
Source File: Group.java From beam with Apache License 2.0 | 6 votes |
/**
 * Groups the input and emits one {@link Row} per group, made of two fields: the key row (named
 * by {@code getKeyField()}) and an iterable of the grouped rows (named by
 * {@code getValueField()}).
 */
@Override
public PCollection<Row> expand(PCollection<InputT> input) {
  Schema inputSchema = input.getSchema();
  Schema groupedSchema =
      Schema.builder()
          .addRowField(getKeyField(), getKeySchema(inputSchema))
          .addIterableField(getValueField(), FieldType.row(inputSchema))
          .build();

  PCollection<Row> rows =
      input
          .apply("ToKvs", getToKvs())
          .apply(
              "ToRow",
              ParDo.of(
                  new DoFn<KV<Row, Iterable<Row>>, Row>() {
                    @ProcessElement
                    public void process(
                        @Element KV<Row, Iterable<Row>> group, OutputReceiver<Row> out) {
                      // Field order must follow groupedSchema: key first, then values.
                      out.output(
                          Row.withSchema(groupedSchema)
                              .attachValues(
                                  Lists.newArrayList(group.getKey(), group.getValue())));
                    }
                  }));
  return rows.setRowSchema(groupedSchema);
}
Example #6
Source File: BatchStatefulParDoOverrides.java From beam with Apache License 2.0 | 6 votes |
/**
 * Applies the original stateful ParDo after validating its DoFn and the input's windowing
 * strategy.
 *
 * <p>When {@code isFnApi} is set, the input is reshuffled and the original ParDo is applied
 * unchanged. Otherwise the DoFn is wrapped in a {@code BatchStatefulDoFn} and applied after
 * {@code GbkBeforeStatefulParDo}, keeping the original side inputs and output tags.
 */
@Override
public PCollectionTuple expand(PCollection<KV<K, InputT>> input) {
  DoFn<KV<K, InputT>, OutputT> statefulFn = originalParDo.getFn();
  verifyFnIsStateful(statefulFn);
  DataflowRunner.verifyDoFnSupportedBatch(statefulFn);
  DataflowRunner.verifyStateSupportForWindowingStrategy(input.getWindowingStrategy());

  if (!isFnApi) {
    PTransform<
            PCollection<? extends KV<K, Iterable<KV<Instant, WindowedValue<KV<K, InputT>>>>>>,
            PCollectionTuple>
        batchParDo =
            ParDo.of(new BatchStatefulDoFn<>(statefulFn))
                .withSideInputs(originalParDo.getSideInputs())
                .withOutputTags(
                    originalParDo.getMainOutputTag(), originalParDo.getAdditionalOutputTags());
    return input.apply(new GbkBeforeStatefulParDo<>()).apply(batchParDo);
  }

  return input.apply(Reshuffle.of()).apply(originalParDo);
}
Example #7
Source File: CsvImport.java From cloud-bigtable-examples with Apache License 2.0 | 6 votes |
/**
 * Converts one comma-separated input line into a single {@code Put} and outputs it.
 *
 * <p>The first value is used as the row key. Each remaining value is written to the column
 * whose qualifier is the header at the same position, in column family {@code FAMILY}, with
 * the current wall-clock time as the cell timestamp.
 *
 * @param c process context supplying the input line and the configured header list
 * @throws IllegalArgumentException if the line's value count differs from the header count
 * @throws Exception any failure is logged together with the offending input and rethrown
 */
@ProcessElement
public void processElement(DoFn<String, Mutation>.ProcessContext c) throws Exception {
  try {
    String[] headers =
        c.getPipelineOptions().as(BigtableCsvOptions.class).getHeaders().split(",");
    String[] values = c.element().split(",");
    // NOTE(review): String.split(",") drops trailing empty fields, so a line ending in ","
    // fails this check. Confirm whether such lines should be accepted.
    Preconditions.checkArgument(
        headers.length == values.length,
        "Expected %s comma-separated values to match the configured headers but found %s",
        headers.length,
        values.length);

    Put row = new Put(Bytes.toBytes(values[0]));
    long timestamp = System.currentTimeMillis();
    // Position 0 is the row key, so only headers from position 1 onward name a column.
    for (int i = 1; i < values.length; i++) {
      row.addColumn(FAMILY, Bytes.toBytes(headers[i]), timestamp, Bytes.toBytes(values[i]));
    }
    c.output(row);
  } catch (Exception e) {
    LOG.error("Failed to process input {}", c.element(), e);
    throw e;
  }
}
Example #8
Source File: ByteBuddyDoFnInvokerFactory.java From beam with Apache License 2.0 | 6 votes |
@Override public ByteCodeAppender appender(final Target implementationTarget) { return (methodVisitor, implementationContext, instrumentedMethod) -> { StackManipulation.Size size = new StackManipulation.Compound( // Load the this reference MethodVariableAccess.REFERENCE.loadFrom(0), // Load the delegate argument MethodVariableAccess.REFERENCE.loadFrom(1), // Invoke the super constructor (default constructor of Object) MethodInvocation.invoke( new TypeDescription.ForLoadedType(clazz) .getDeclaredMethods() .filter( ElementMatchers.isConstructor() .and(ElementMatchers.takesArguments(DoFn.class))) .getOnly()), // Return void. MethodReturn.VOID) .apply(methodVisitor, implementationContext); return new ByteCodeAppender.Size(size.getMaximalSize(), instrumentedMethod.getStackSize()); }; }
Example #9
Source File: GroupByKeyAndWindowDoFnTransform.java From incubator-nemo with Apache License 2.0 | 6 votes |
/** * This creates a new DoFn that groups elements by key and window. * * @param doFn original doFn. * @return GroupAlsoByWindowViaWindowSetNewDoFn */ @Override protected DoFn wrapDoFn(final DoFn doFn) { final Map<K, StateAndTimerForKey> map = new HashMap<>(); this.inMemoryStateInternalsFactory = new InMemoryStateInternalsFactory(map); this.inMemoryTimerInternalsFactory = new InMemoryTimerInternalsFactory(map); // This function performs group by key and window operation return GroupAlsoByWindowViaWindowSetNewDoFn.create( getWindowingStrategy(), inMemoryStateInternalsFactory, inMemoryTimerInternalsFactory, null, // GBK has no sideinput. reduceFn, getOutputManager(), getMainOutputTag()); }
Example #10
Source File: TestBoundedTable.java From beam with Apache License 2.0 | 6 votes |
/**
 * Attaches a writer that adds every incoming row to {@code CONTENT} and clears
 * {@code CONTENT} on teardown.
 *
 * @return a {@link PDone} for the input's pipeline
 */
@Override
public POutput buildIOWriter(PCollection<Row> input) {
  DoFn<Row, Void> collectRows =
      new DoFn<Row, Void>() {
        @ProcessElement
        public void processElement(ProcessContext context) {
          CONTENT.add(context.element());
        }

        @Teardown
        public void close() {
          CONTENT.clear();
        }
      };

  input.apply(ParDo.of(collectRows));
  return PDone.in(input.getPipeline());
}
Example #11
Source File: DoFnSignaturesTest.java From beam with Apache License 2.0 | 6 votes |
/**
 * Expects signature analysis to fail with an {@link IllegalArgumentException} for a DoFn whose
 * process method declares two {@code Timer} parameters annotated with the same
 * {@code @TimerId}.
 */
@Test
public void testTimerParameterDuplicate() throws Exception {
  thrown.expect(IllegalArgumentException.class);
  // The message must name the problem, the timer id, the offending method and the parameter
  // position ("index 2" presumably refers to the second Timer parameter; confirm against the
  // error reporter).
  thrown.expectMessage("duplicate");
  thrown.expectMessage("my-id");
  thrown.expectMessage("myProcessElement");
  thrown.expectMessage("index 2");
  // The message must not match the mentionsState() matcher.
  thrown.expectMessage(not(mentionsState()));
  DoFnSignatures.getSignature(
      new DoFn<KV<String, Integer>, Long>() {
        @TimerId("my-id")
        private final TimerSpec myfield = TimerSpecs.timer(TimeDomain.PROCESSING_TIME);

        // Both Timer parameters reuse the id "my-id".
        @ProcessElement
        public void myProcessElement(
            ProcessContext context, @TimerId("my-id") Timer one, @TimerId("my-id") Timer two) {}

        @OnTimer("my-id")
        public void onWhatever() {}
      }.getClass());
}
Example #12
Source File: LocalSpannerIO.java From DataflowTemplates with Apache License 2.0 | 6 votes |
@DoFn.ProcessElement public void processElement(ProcessContext c) { MutationGroup mg = c.element(); if (mg.primary().getOperation() == Op.DELETE && !isPointDelete(mg.primary())) { // Ranged deletes are not batchable. c.output(unbatchableMutationsTag, Arrays.asList(mg)); unBatchableMutationGroupsCounter.inc(); return; } SpannerSchema spannerSchema = c.sideInput(schemaView); long groupSize = MutationSizeEstimator.sizeOf(mg); long groupCells = MutationCellCounter.countOf(spannerSchema, mg); long groupRows = Iterables.size(mg); if (groupSize >= batchSizeBytes || groupCells >= maxNumMutations || groupRows >= maxNumRows) { c.output(unbatchableMutationsTag, Arrays.asList(mg)); unBatchableMutationGroupsCounter.inc(); } else { c.output(mg); batchableMutationGroupsCounter.inc(); } }
Example #13
Source File: DoFnSignaturesTest.java From beam with Apache License 2.0 | 6 votes |
/**
 * Expects signature analysis to fail with an {@link IllegalArgumentException} for a DoFn that
 * declares a {@code @TimerId} field but has no {@code @OnTimer} method.
 */
@Test
public void testTimerIdNoCallback() throws Exception {
  thrown.expect(IllegalArgumentException.class);
  thrown.expectMessage("No callback registered");
  thrown.expectMessage("my-id");
  // The message must match mentionsTimers() and must not match mentionsState().
  thrown.expectMessage(not(mentionsState()));
  thrown.expectMessage(mentionsTimers());
  DoFnSignatures.getSignature(
      new DoFn<KV<String, Integer>, Long>() {
        // Timer declared here, but the class has no @OnTimer("my-id") method.
        @TimerId("my-id")
        private final TimerSpec myfield1 = TimerSpecs.timer(TimeDomain.EVENT_TIME);

        @ProcessElement
        public void foo(ProcessContext context) {}
      }.getClass());
}
Example #14
Source File: DoFnSignaturesTest.java From beam with Apache License 2.0 | 6 votes |
/**
 * Expects signature analysis to fail with an {@link IllegalArgumentException} for a DoFn whose
 * {@code @TimerId} field is a {@code String} rather than a {@code TimerSpec}.
 */
@Test
public void testTimerIdWithWrongType() throws Exception {
  thrown.expect(IllegalArgumentException.class);
  // The message must name the annotation, the required type and the offending field.
  thrown.expectMessage("TimerId");
  thrown.expectMessage("TimerSpec");
  thrown.expectMessage("bizzle");
  // The message must not match the mentionsState() matcher.
  thrown.expectMessage(not(mentionsState()));
  DoFnSignatures.getSignature(
      new DoFn<String, String>() {
        // Wrong field type on purpose: String instead of TimerSpec.
        @TimerId("foo")
        private final String bizzle = "bazzle";

        @ProcessElement
        public void foo(ProcessContext context) {}
      }.getClass());
}
Example #15
Source File: DoFnSignaturesSplittableDoFnTest.java From beam with Apache License 2.0 | 6 votes |
/**
 * Expects signature analysis to reject a DoFn whose
 * {@code @GetWatermarkEstimatorStateCoder} method is declared to return {@code KvCoder}, with
 * a message saying that this is not a subtype of {@code Coder<Void>}.
 */
@Test
public void testGetWatermarkEstimatorStateCoderReturnsWrongType() throws Exception {
  class BadFn extends DoFn<Integer, String> {
    @ProcessElement
    public void process(
        ProcessContext context, RestrictionTracker<SomeRestriction, Void> tracker) {}

    @GetInitialRestriction
    public SomeRestriction getInitialRestriction(@Element Integer element) {
      return null;
    }

    // The declared return type is the defect under test; the returned value is irrelevant.
    @GetWatermarkEstimatorStateCoder
    public KvCoder getWatermarkEstimatorStateCoder() {
      return null;
    }
  }

  // Only the message is pinned here; no exception class is registered in this method.
  thrown.expectMessage(
      "getWatermarkEstimatorStateCoder() returns KvCoder which is not a subtype of Coder<Void>");
  DoFnSignatures.getSignature(BadFn.class);
}
Example #16
Source File: DoFnSignaturesSplittableDoFnTest.java From beam with Apache License 2.0 | 5 votes |
/**
 * Checks that a DoFn whose process method takes only a {@code ProcessContext} is reported as
 * {@code BOUNDED} per element.
 */
@Test
public void testUnsplittableIsBounded() throws Exception {
  class UnsplittableFn extends DoFn<Integer, String> {
    @ProcessElement
    public void process(ProcessContext context) {}
  }

  DoFnSignature signature = DoFnSignatures.getSignature(UnsplittableFn.class);

  assertEquals(PCollection.IsBounded.BOUNDED, signature.isBoundedPerElement());
}
Example #17
Source File: CombineShardsFn.java From dataflow-java with Apache License 2.0 | 5 votes |
/**
 * Outputs the result of {@code combineShards} for the current element, passing it the
 * pipeline options and the {@code shards} and {@code eofContents} side inputs.
 */
@ProcessElement
public void processElement(DoFn<String, String>.ProcessContext c) throws Exception {
  c.output(
      combineShards(
          c.getPipelineOptions().as(Options.class),
          c.element(),
          c.sideInput(shards),
          c.sideInput(eofContents)));
}
Example #18
Source File: ParDoTranslation.java From beam with Apache License 2.0 | 5 votes |
/**
 * Deserializes a {@link DoFnWithExecutionInformation} from the payload of a function spec.
 *
 * @param fnSpec spec whose URN must equal {@code CUSTOM_JAVA_DO_FN_URN}
 * @return the object deserialized from the spec's payload bytes
 * @throws IllegalArgumentException if the URN does not match
 */
public static DoFnWithExecutionInformation doFnWithExecutionInformationFromProto(
    FunctionSpec fnSpec) {
  String urn = fnSpec.getUrn();
  checkArgument(
      urn.equals(CUSTOM_JAVA_DO_FN_URN),
      "Expected %s to be %s with URN %s, but URN was %s",
      DoFn.class.getSimpleName(),
      FunctionSpec.class.getSimpleName(),
      CUSTOM_JAVA_DO_FN_URN,
      urn);

  // NOTE(review): the payload goes through SerializableUtils.deserializeFromByteArray;
  // confirm it only ever comes from a trusted source.
  Object deserialized =
      SerializableUtils.deserializeFromByteArray(
          fnSpec.getPayload().toByteArray(), "Custom DoFn With Execution Info");
  return (DoFnWithExecutionInformation) deserialized;
}
Example #19
Source File: BeamAggregationRel.java From beam with Apache License 2.0 | 5 votes |
/**
 * Returns a DoFn that flattens a two-part row into a single output row.
 *
 * @param outputSchema schema of the emitted rows
 * @param windowStartFieldIndex position at which to insert the window start, or {@code -1} to
 *     insert nothing
 * @param ignoreValues when {@code true}, only the fields of nested row 0 are kept
 * @param verifyRowValues when {@code true}, the row is built with {@code addValues(...).build()};
 *     otherwise with {@code attachValues(...)}
 */
static DoFn<Row, Row> mergeRecord(
    Schema outputSchema,
    int windowStartFieldIndex,
    boolean ignoreValues,
    boolean verifyRowValues) {
  return new DoFn<Row, Row>() {
    @ProcessElement
    public void processElement(
        @Element Row kvRow, BoundedWindow window, OutputReceiver<Row> out) {
      Row keyRow = kvRow.getRow(0);

      int capacity = keyRow.getFieldCount();
      if (!ignoreValues) {
        capacity += kvRow.getRow(1).getFieldCount();
      }

      List<Object> merged = Lists.newArrayListWithCapacity(capacity);
      merged.addAll(keyRow.getBaseValues());
      if (!ignoreValues) {
        merged.addAll(kvRow.getRow(1).getBaseValues());
      }
      if (windowStartFieldIndex != -1) {
        // The cast assumes the element's window is an IntervalWindow.
        merged.add(windowStartFieldIndex, ((IntervalWindow) window).start());
      }

      Row result;
      if (verifyRowValues) {
        result = Row.withSchema(outputSchema).addValues(merged).build();
      } else {
        result = Row.withSchema(outputSchema).attachValues(merged);
      }
      out.output(result);
    }
  };
}
Example #20
Source File: PTransformMatchers.java From beam with Apache License 2.0 | 5 votes |
/**
 * Creates a {@link PTransformMatcher} that accepts only {@link ParDo.MultiOutput} transforms
 * whose {@link DoFn} uses state or timers, according to {@link DoFnSignature#usesState()} and
 * {@link DoFnSignature#usesTimers()}.
 */
public static PTransformMatcher stateOrTimerParDoMulti() {
  return new PTransformMatcher() {
    @Override
    public boolean matches(AppliedPTransform<?, ?, ?> application) {
      PTransform<?, ?> candidate = application.getTransform();
      if (!(candidate instanceof ParDo.MultiOutput)) {
        return false;
      }
      DoFn<?, ?> doFn = ((ParDo.MultiOutput<?, ?>) candidate).getFn();
      DoFnSignature doFnSignature = DoFnSignatures.signatureForDoFn(doFn);
      return doFnSignature.usesState() || doFnSignature.usesTimers();
    }

    @Override
    public String toString() {
      return MoreObjects.toStringHelper("StateOrTimerParDoMultiMatcher").toString();
    }
  };
}
Example #21
Source File: GetReferencesFromHeaderFn.java From dataflow-java with Apache License 2.0 | 5 votes |
/**
 * Outputs the name of every sequence in the element's header sequence dictionary, then logs
 * the dictionary size.
 */
@ProcessElement
public void processElement(DoFn<HeaderInfo, String>.ProcessContext c) throws Exception {
  final SAMFileHeader header = c.element().header;
  header
      .getSequenceDictionary()
      .getSequences()
      .forEach(sequence -> c.output(sequence.getSequenceName()));
  LOG.info("Processed " + header.getSequenceDictionary().size() + " references");
}
Example #22
Source File: DoFnSignaturesTest.java From beam with Apache License 2.0 | 5 votes |
/**
 * Checks that a process method taking a {@code PipelineOptions} argument is reflected in the
 * signature as an extra parameter of type {@code Parameter.PipelineOptionsParameter}.
 */
@Test
public void testPipelineOptionsParameter() throws Exception {
  DoFnSignature sig =
      DoFnSignatures.getSignature(
          new DoFn<String, String>() {
            // The second argument is the one under test.
            @ProcessElement
            public void process(ProcessContext c, PipelineOptions options) {}
          }.getClass());

  // hasItem: other extra parameters may be present as well.
  assertThat(
      sig.processElement().extraParameters(),
      Matchers.hasItem(instanceOf(Parameter.PipelineOptionsParameter.class)));
}
Example #23
Source File: BigQueryInputRuntime.java From components with Apache License 2.0 | 5 votes |
/**
 * Converts the incoming table row to Avro and outputs it; {@code null} rows are skipped.
 *
 * <p>The converter is created and given its schema (parsed from {@code schemaStr}) the first
 * time a non-null row arrives.
 */
@DoFn.ProcessElement
public void processElement(ProcessContext c) throws IOException {
  TableRow row = c.element();
  if (row != null) {
    if (converter == null) {
      // Lazily build the converter on first use.
      converter = new BigQueryTableRowIndexedRecordConverter();
      converter.setSchema(new Schema.Parser().parse(schemaStr));
    }
    c.output(converter.convertToAvro(row));
  }
}
Example #24
Source File: DIBatchSimulationTest.java From component-runtime with Apache License 2.0 | 5 votes |
/**
 * Adds every incoming record to {@code RECORDS} and terminates the branch.
 *
 * @return a {@link PDone} for the input's pipeline
 */
@Override
public PDone expand(final PCollection<org.talend.sdk.component.api.record.Record> input) {
  final DoFn<org.talend.sdk.component.api.record.Record, Void> recorder =
      new DoFn<org.talend.sdk.component.api.record.Record, Void>() {

        @ProcessElement
        public void onElement(final ProcessContext context) {
          RECORDS.add(context.element());
        }
      };
  input.apply(ParDo.of(recorder));
  return PDone.in(input.getPipeline());
}
Example #25
Source File: DoFnSignaturesTest.java From beam with Apache License 2.0 | 5 votes |
/**
 * Expects signature analysis to fail with an {@link IllegalArgumentException} for a process
 * method whose {@code @Timestamp} parameter is a {@code String}.
 */
@Test
public void testWrongTimestampType() throws Exception {
  thrown.expect(IllegalArgumentException.class);
  thrown.expectMessage("@Timestamp argument must have type org.joda.time.Instant");
  // The call is expected to throw, so its result is deliberately not stored.
  DoFnSignatures.getSignature(
      new DoFn<String, String>() {
        @ProcessElement
        public void process(@Timestamp String timestamp) {}
      }.getClass());
}
Example #26
Source File: SnsIO.java From beam with Apache License 2.0 | 5 votes |
/**
 * Builds the callback that handles the outcome of a publish request.
 *
 * <p>On failure the error is logged and an {@code SnsWriteException} wrapping it is thrown. On
 * success the element and its response are paired in an {@code SnsResponse} and output through
 * {@code context}.
 *
 * @param context process context of the element being published
 */
private BiConsumer<? super PublishResponse, ? super Throwable> getPublishResponse(
    DoFn<T, SnsResponse<T>>.ProcessContext context) {
  return (response, ex) -> {
    if (ex != null) {
      LOG.error("Error while publishing request to SNS", ex);
      throw new SnsWriteException("Error while publishing request to SNS", ex);
    }
    SnsResponse<T> published = SnsResponse.of(context.element(), response);
    context.output(published);
  };
}
Example #27
Source File: CacheTest.java From beam with Apache License 2.0 | 5 votes |
/** * Test checks how the cache candidates map is populated by the runner when evaluating the * pipeline. */ @Test public void cacheCandidatesUpdaterTest() { SparkPipelineOptions options = createOptions(); Pipeline pipeline = Pipeline.create(options); PCollection<String> pCollection = pipeline.apply(Create.of("foo", "bar")); // First use of pCollection. pCollection.apply(Count.globally()); // Second use of pCollection. PCollectionView<List<String>> view = pCollection.apply(View.asList()); // Internally View.asList() creates a PCollection that underlies the PCollectionView, that // PCollection should not be cached as the SparkRunner does not access that PCollection to // access the PCollectionView. pipeline .apply(Create.of("foo", "baz")) .apply( ParDo.of( new DoFn<String, String>() { @ProcessElement public void processElement(ProcessContext processContext) { if (processContext.sideInput(view).contains(processContext.element())) { processContext.output(processContext.element()); } } }) .withSideInputs(view)); JavaSparkContext jsc = SparkContextFactory.getSparkContext(options); EvaluationContext ctxt = new EvaluationContext(jsc, pipeline, options); SparkRunner.CacheVisitor cacheVisitor = new SparkRunner.CacheVisitor(new TransformTranslator.Translator(), ctxt); pipeline.traverseTopologically(cacheVisitor); assertEquals(2L, (long) ctxt.getCacheCandidates().get(pCollection)); assertEquals(1L, ctxt.getCacheCandidates().values().stream().filter(l -> l > 1).count()); }
Example #28
Source File: SplittableParDoNaiveBounded.java From beam with Apache License 2.0 | 5 votes |
/**
 * Creates a process context bound to {@code fn} and stores the remaining arguments in the
 * corresponding fields.
 *
 * @param fn DoFn used as the enclosing instance for the superclass constructor call
 * @param outerContext context of the enclosing DoFn over element/restriction pairs
 * @param element the input element
 * @param window the window stored for this context
 * @param tracker the restriction tracker stored for this context
 * @param watermarkEstimator the watermark estimator stored for this context
 */
private NestedProcessContext(
    DoFn<InputT, OutputT> fn,
    DoFn<KV<InputT, RestrictionT>, OutputT>.ProcessContext outerContext,
    InputT element,
    BoundedWindow window,
    TrackerT tracker,
    WatermarkEstimatorT watermarkEstimator) {
  // Qualified superclass constructor call: the superclass is an inner class of DoFn.
  fn.super();
  this.outerContext = outerContext;
  this.element = element;
  this.window = window;
  this.tracker = tracker;
  this.watermarkEstimator = watermarkEstimator;
}
Example #29
Source File: BeamEnumerableConverter.java From beam with Apache License 2.0 | 5 votes |
private static PipelineResult limitRun( PipelineOptions options, BeamRelNode node, DoFn<Row, Void> doFn, Queue<Row> values, int limitCount) { options.as(DirectOptions.class).setBlockOnRun(false); Pipeline pipeline = Pipeline.create(options); PCollection<Row> resultCollection = BeamSqlRelUtils.toPCollection(pipeline, node); resultCollection.apply(ParDo.of(doFn)); PipelineResult result = pipeline.run(); State state; while (true) { // Check pipeline state in every second state = result.waitUntilFinish(Duration.standardSeconds(1)); if (state != null && state.isTerminal()) { if (PipelineResult.State.FAILED.equals(state)) { throw new RuntimeException("Pipeline failed for unknown reason"); } break; } try { if (values.size() >= limitCount) { result.cancel(); break; } } catch (IOException e) { LOG.warn(e.toString()); break; } } return result; }
Example #30
Source File: DoFnOperatorTest.java From beam with Apache License 2.0 | 5 votes |
/**
 * Builds a keyed test harness around a {@link DoFnOperator} for {@code fn}.
 *
 * <p>The operator uses the fixed step name {@code "stepName"}, has no side inputs and no
 * additional outputs, and sends its main output through a
 * {@code MultiOutputOutputManagerFactory} built from {@code outputTag} and
 * {@code outputCoder}.
 *
 * @return a {@code KeyedOneInputStreamOperatorTestHarness} wrapping the operator
 */
private <K, InT, OutT>
    OneInputStreamOperatorTestHarness<WindowedValue<InT>, WindowedValue<OutT>>
        createTestHarness(
            WindowingStrategy<Object, ?> windowingStrategy,
            DoFn<InT, OutT> fn,
            FullWindowedValueCoder<InT> inputCoder,
            FullWindowedValueCoder<OutT> outputCoder,
            Coder<?> keyCoder,
            TupleTag<OutT> outputTag,
            TypeInformation<K> keyCoderInfo,
            KeySelector<WindowedValue<InT>, K> keySelector)
            throws Exception {
  DoFnOperator<InT, OutT> operator =
      new DoFnOperator<>(
          fn,
          "stepName",
          inputCoder,
          Collections.emptyMap(),
          outputTag,
          Collections.emptyList(),
          new DoFnOperator.MultiOutputOutputManagerFactory<>(outputTag, outputCoder),
          windowingStrategy,
          /* side-input mapping */ new HashMap<>(),
          /* side inputs */ Collections.emptyList(),
          PipelineOptionsFactory.as(FlinkPipelineOptions.class),
          /* key coder */ keyCoder,
          keySelector,
          DoFnSchemaInformation.create(),
          Collections.emptyMap());

  return new KeyedOneInputStreamOperatorTestHarness<>(operator, keySelector, keyCoderInfo);
}