org.apache.beam.sdk.transforms.Combine Java Examples

The following examples show how to use org.apache.beam.sdk.transforms.Combine. Each example is taken from an open-source project; the source file and license are noted above each snippet.
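Before the project examples, here is a minimal, hedged sketch of the two most common entry points, Combine.globally and Combine.perKey. The class name CombineSketch and the step names are illustrative only and do not come from any of the projects below.

import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.transforms.Combine;
import org.apache.beam.sdk.transforms.Create;
import org.apache.beam.sdk.transforms.Sum;
import org.apache.beam.sdk.values.KV;
import org.apache.beam.sdk.values.PCollection;

public class CombineSketch {
  public static void main(String[] args) {
    Pipeline p = Pipeline.create(PipelineOptionsFactory.create());

    // Combine.globally reduces the whole PCollection to a single value.
    PCollection<Integer> sum =
        p.apply("CreateInts", Create.of(1, 2, 3, 4))
            .apply("SumAll", Combine.globally(Sum.ofIntegers()));

    // Combine.perKey reduces the values associated with each key.
    PCollection<KV<String, Integer>> sumPerKey =
        p.apply("CreateKVs", Create.of(KV.of("a", 1), KV.of("a", 2), KV.of("b", 3)))
            .apply("SumPerKey", Combine.perKey(Sum.ofIntegers()));

    p.run().waitUntilFinish();
  }
}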
Example #1
Source File: AggregatorCombiner.java    From beam with Apache License 2.0
public AggregatorCombiner(
    Combine.CombineFn<InputT, AccumT, OutputT> combineFn,
    WindowingStrategy<?, ?> windowingStrategy,
    Coder<AccumT> accumulatorCoder,
    Coder<OutputT> outputCoder) {
  this.combineFn = combineFn;
  this.windowingStrategy = (WindowingStrategy<InputT, W>) windowingStrategy;
  this.timestampCombiner = windowingStrategy.getTimestampCombiner();
  this.accumulatorCoder =
      IterableCoder.of(
          WindowedValue.FullWindowedValueCoder.of(
              accumulatorCoder, windowingStrategy.getWindowFn().windowCoder()));
  this.outputCoder =
      IterableCoder.of(
          WindowedValue.FullWindowedValueCoder.of(
              outputCoder, windowingStrategy.getWindowFn().windowCoder()));
}
 
Example #2
Source File: HBaseIOIT.java    From beam with Apache License 2.0
/** Read the test dataset from HBase and validate its contents. */
private void runRead() {
  PCollection<Result> tableRows =
      pipelineRead.apply(HBaseIO.read().withConfiguration(conf).withTableId(TABLE_NAME));

  PAssert.thatSingleton(tableRows.apply("Count All", Count.<Result>globally()))
      .isEqualTo((long) numberOfRows);

  PCollection<String> consolidatedHashcode =
      tableRows
          .apply(ParDo.of(new SelectNameFn()))
          .apply("Hash row contents", Combine.globally(new HashingFn()).withoutDefaults());

  PAssert.that(consolidatedHashcode)
      .containsInAnyOrder(TestRow.getExpectedHashForRowCount(numberOfRows));

  pipelineRead.run().waitUntilFinish();
}
 
Example #3
Source File: CassandraIOIT.java    From beam with Apache License 2.0
private void runRead() {
  PCollection<Scientist> output =
      pipelineRead.apply(
          CassandraIO.<Scientist>read()
              .withHosts(options.getCassandraHost())
              .withPort(options.getCassandraPort())
              .withMinNumberOfSplits(20)
              .withKeyspace(KEYSPACE)
              .withTable(TABLE)
              .withEntity(Scientist.class)
              .withCoder(SerializableCoder.of(Scientist.class)));

  PCollection<String> consolidatedHashcode =
      output
          .apply(ParDo.of(new SelectNameFn()))
          .apply("Hash row contents", Combine.globally(new HashingFn()).withoutDefaults());

  PAssert.thatSingleton(consolidatedHashcode)
      .isEqualTo(TestRow.getExpectedHashForRowCount(options.getNumberOfRecords()));

  pipelineRead.run().waitUntilFinish();
}
 
Example #4
Source File: HadoopFormatIOElasticTest.java    From beam with Apache License 2.0
/**
 * Test to read data from an embedded Elasticsearch instance and verify whether data is read
 * successfully.
 */
@Test
public void testHifIOWithElastic() {
  // Expected hashcode is computed once at insertion time and hardcoded here.
  String expectedHashCode = "a62a85f5f081e3840baf1028d4d6c6bc";
  Configuration conf = getConfiguration();
  PCollection<KV<Text, LinkedMapWritable>> esData =
      pipeline.apply(HadoopFormatIO.<Text, LinkedMapWritable>read().withConfiguration(conf));
  PCollection<Long> count = esData.apply(Count.globally());
  // Verify that the count of objects fetched using HIFInputFormat IO is correct.
  PAssert.thatSingleton(count).isEqualTo((long) TEST_DATA_ROW_COUNT);
  PCollection<LinkedMapWritable> values = esData.apply(Values.create());
  PCollection<String> textValues = values.apply(transformFunc);
  // Verify the output values using checksum comparison.
  PCollection<String> consolidatedHashcode =
      textValues.apply(Combine.globally(new HashingFn()).withoutDefaults());
  PAssert.that(consolidatedHashcode).containsInAnyOrder(expectedHashCode);
  pipeline.run().waitUntilFinish();
}
 
Example #5
Source File: HadoopFormatIOCassandraIT.java    From beam with Apache License 2.0
/** This test reads data from the Cassandra instance and verifies if data is read successfully. */
@Test
public void testHIFReadForCassandra() {
  // Expected hashcode is computed once at insertion time and hardcoded here.
  String expectedHashCode = "1a30ad400afe4ebf5fde75f5d2d95408";
  Long expectedRecordsCount = 1000L;
  Configuration conf = getConfiguration(options);
  PCollection<KV<Long, String>> cassandraData =
      pipeline.apply(
          HadoopFormatIO.<Long, String>read()
              .withConfiguration(conf)
              .withValueTranslation(myValueTranslate));
  PAssert.thatSingleton(cassandraData.apply("Count", Count.globally()))
      .isEqualTo(expectedRecordsCount);
  PCollection<String> textValues = cassandraData.apply(Values.create());
  // Verify the output values using checksum comparison.
  PCollection<String> consolidatedHashcode =
      textValues.apply(Combine.globally(new HashingFn()).withoutDefaults());
  PAssert.that(consolidatedHashcode).containsInAnyOrder(expectedHashCode);
  pipeline.run().waitUntilFinish();
}
 
Example #6
Source File: FlinkStreamingTransformTranslators.java    From beam with Apache License 2.0
@Override
boolean canTranslate(
    PTransform<PCollection<KV<K, InputT>>, PCollection<KV<K, OutputT>>> transform,
    FlinkStreamingTranslationContext context) {
  // If we have a merging window strategy and side inputs, we cannot
  // translate as a proper combine. We have to group and then run the combine
  // over the final grouped values.
  PCollection<KV<K, InputT>> input = context.getInput(transform);

  @SuppressWarnings("unchecked")
  WindowingStrategy<?, BoundedWindow> windowingStrategy =
      (WindowingStrategy<?, BoundedWindow>) input.getWindowingStrategy();

  return windowingStrategy.getWindowFn().isNonMerging()
      || ((Combine.PerKey) transform).getSideInputs().isEmpty();
}
 
Example #7
Source File: HadoopFormatIOCassandraTest.java    From beam with Apache License 2.0
/**
 * Test to read data from an embedded Cassandra instance and verify whether data is read
 * successfully.
 */
@Test
public void testHIFReadForCassandra() {
  // Expected hashcode is computed once at insertion time and hardcoded here.
  String expectedHashCode = "1b9780833cce000138b9afa25ba63486";
  Configuration conf = getConfiguration();
  PCollection<KV<Long, String>> cassandraData =
      p.apply(
          HadoopFormatIO.<Long, String>read()
              .withConfiguration(conf)
              .withValueTranslation(myValueTranslate));
  // Verify the count of data retrieved from Cassandra matches expected count.
  PAssert.thatSingleton(cassandraData.apply("Count", Count.globally()))
      .isEqualTo(TEST_DATA_ROW_COUNT);
  PCollection<String> textValues = cassandraData.apply(Values.create());
  // Verify the output values using checksum comparison.
  PCollection<String> consolidatedHashcode =
      textValues.apply(Combine.globally(new HashingFn()).withoutDefaults());
  PAssert.that(consolidatedHashcode).containsInAnyOrder(expectedHashCode);
  p.run().waitUntilFinish();
}
 
Example #8
Source File: SparkCombineFnTest.java    From beam with Apache License 2.0
private static Combine.CombineFn<Integer, Long, Long> getSumFn() {
  return new Combine.CombineFn<Integer, Long, Long>() {

    @Override
    public Long createAccumulator() {
      return 0L;
    }

    @Override
    public Long addInput(Long mutableAccumulator, Integer input) {
      return mutableAccumulator + input;
    }

    @Override
    public Long mergeAccumulators(Iterable<Long> accumulators) {
      return StreamSupport.stream(accumulators.spliterator(), false).mapToLong(e -> e).sum();
    }

    @Override
    public Long extractOutput(Long accumulator) {
      return accumulator;
    }
  };
}
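As a usage note not present in the original test, a CombineFn like the one above could be applied with Combine.globally. The snippet below is a hedged sketch of a method body; it assumes getSumFn() is in scope along with the usual Beam imports (Pipeline, PipelineOptionsFactory, Create, Combine, PCollection).

// Sketch only: sums Integer inputs into a Long using the CombineFn defined above.
Pipeline p = Pipeline.create(PipelineOptionsFactory.create());
PCollection<Long> total =
    p.apply(Create.of(1, 2, 3))
        .apply("SumWithCustomFn", Combine.globally(getSumFn()));
p.run().waitUntilFinish();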
 
Example #9
Source File: CombineTest.java    From beam with Apache License 2.0
@Test
public void testBinaryCombineWithSlidingWindows() {
  PCollection<Integer> input =
      pipeline
          .apply(
              Create.timestamped(
                  TimestampedValue.of(1, new Instant(1)),
                  TimestampedValue.of(3, new Instant(2)),
                  TimestampedValue.of(5, new Instant(3))))
          .apply(Window.into(SlidingWindows.of(Duration.millis(3)).every(Duration.millis(1))))
          .apply(
              Combine.globally(
                      Combine.BinaryCombineFn.of(
                          (SerializableBiFunction<Integer, Integer, Integer>)
                              (integer1, integer2) -> integer1 > integer2 ? integer1 : integer2))
                  .withoutDefaults());
  PAssert.that(input).containsInAnyOrder(1, 3, 5, 5, 5);
  pipeline.run();
}
 
Example #10
Source File: GroupByKeyTranslator.java    From beam with Apache License 2.0
@SuppressWarnings("unchecked")
private static <K, InputT, OutputT>
    SystemReduceFn<K, InputT, ?, OutputT, BoundedWindow> getSystemReduceFn(
        PTransform<PCollection<KV<K, InputT>>, PCollection<KV<K, OutputT>>> transform,
        Pipeline pipeline,
        KvCoder<K, InputT> kvInputCoder) {
  if (transform instanceof GroupByKey) {
    return (SystemReduceFn<K, InputT, ?, OutputT, BoundedWindow>)
        SystemReduceFn.buffering(kvInputCoder.getValueCoder());
  } else if (transform instanceof Combine.PerKey) {
    final CombineFnBase.GlobalCombineFn<? super InputT, ?, OutputT> combineFn =
        ((Combine.PerKey) transform).getFn();
    return SystemReduceFn.combining(
        kvInputCoder.getKeyCoder(),
        AppliedCombineFn.withInputCoder(combineFn, pipeline.getCoderRegistry(), kvInputCoder));
  } else {
    throw new RuntimeException("Transform " + transform + " cannot be translated as GroupByKey.");
  }
}
 
Example #11
Source File: CombineTranslation.java    From beam with Apache License 2.0
@Override
public FunctionSpec translate(
    AppliedPTransform<?, ?, Combine.PerKey<?, ?, ?>> transform, SdkComponents components)
    throws IOException {
  if (transform.getTransform().getSideInputs().isEmpty()) {
    GlobalCombineFn<?, ?, ?> combineFn = transform.getTransform().getFn();
    Coder<?> accumulatorCoder =
        extractAccumulatorCoder(combineFn, (AppliedPTransform) transform);
    return FunctionSpec.newBuilder()
        .setUrn(getUrn(transform.getTransform()))
        .setPayload(combinePayload(combineFn, accumulatorCoder, components).toByteString())
        .build();
  } else {
    // Combines with side inputs are translated as generic composites, which have a blank
    // FunctionSpec.
    return null;
  }
}
 
Example #12
Source File: CombineTranslation.java    From beam with Apache License 2.0
private static <K, InputT, AccumT> Coder<AccumT> extractAccumulatorCoder(
    GlobalCombineFn<InputT, AccumT, ?> combineFn,
    AppliedPTransform<
            PCollection<KV<K, Iterable<InputT>>>, ?, Combine.GroupedValues<K, InputT, ?>>
        transform)
    throws IOException {
  try {
    @SuppressWarnings("unchecked")
    PCollection<KV<K, Iterable<InputT>>> mainInput =
        (PCollection<KV<K, Iterable<InputT>>>)
            Iterables.getOnlyElement(TransformInputs.nonAdditionalInputs(transform));
    KvCoder<K, Iterable<InputT>> kvCoder = (KvCoder<K, Iterable<InputT>>) mainInput.getCoder();
    IterableCoder<InputT> iterCoder = (IterableCoder<InputT>) kvCoder.getValueCoder();
    return combineFn.getAccumulatorCoder(
        transform.getPipeline().getCoderRegistry(), iterCoder.getElemCoder());
  } catch (CannotProvideCoderException e) {
    throw new IOException("Could not obtain a Coder for the accumulator", e);
  }
}
 
Example #13
Source File: CombineTest.java    From beam with Apache License 2.0
@Test
public void testCombineGloballyPreservesWindowing() {
  PCollection<Integer> input =
      pipeline
          .apply(
              Create.timestamped(
                  TimestampedValue.of(1, new Instant(1)),
                  TimestampedValue.of(2, new Instant(2)),
                  TimestampedValue.of(3, new Instant(11)),
                  TimestampedValue.of(4, new Instant(3)),
                  TimestampedValue.of(5, new Instant(11)),
                  TimestampedValue.of(6, new Instant(12))))
          .apply(Window.into(FixedWindows.of(Duration.millis(10))))
          .apply(Combine.globally(Sum.ofIntegers()).withoutDefaults());
  PAssert.that(input).containsInAnyOrder(7, 14);
}
 
Example #14
Source File: WindowTest.java    From beam with Apache License 2.0
@Test
@Category({ValidatesRunner.class, UsesCustomWindowMerging.class})
public void testMergingCustomWindows() {
  Instant startInstant = new Instant(0L);
  PCollection<String> inputCollection =
      pipeline.apply(
          Create.timestamped(
              TimestampedValue.of("big", startInstant.plus(Duration.standardSeconds(10))),
              TimestampedValue.of("small1", startInstant.plus(Duration.standardSeconds(20))),
              // This one will be outside of bigWindow thus not merged
              TimestampedValue.of("small2", startInstant.plus(Duration.standardSeconds(39)))));
  PCollection<String> windowedCollection =
      inputCollection.apply(Window.into(new CustomWindowFn<>()));
  PCollection<Long> count =
      windowedCollection.apply(Combine.globally(Count.<String>combineFn()).withoutDefaults());
  // "small1" and "big" elements merged into bigWindow "small2" not merged
  // because timestamp is not in bigWindow
  PAssert.that("Wrong number of elements in output collection", count).containsInAnyOrder(2L, 1L);
  pipeline.run();
}
 
Example #15
Source File: WindowTest.java    From beam with Apache License 2.0
@Test
@Category({ValidatesRunner.class, UsesCustomWindowMerging.class})
public void testMergingCustomWindowsKeyedCollection() {
  Instant startInstant = new Instant(0L);
  PCollection<KV<Integer, String>> inputCollection =
      pipeline.apply(
          Create.timestamped(
              TimestampedValue.of(
                  KV.of(0, "big"), startInstant.plus(Duration.standardSeconds(10))),
              TimestampedValue.of(
                  KV.of(1, "small1"), startInstant.plus(Duration.standardSeconds(20))),
              // This element is not contained within the bigWindow and not merged
              TimestampedValue.of(
                  KV.of(2, "small2"), startInstant.plus(Duration.standardSeconds(39)))));
  PCollection<KV<Integer, String>> windowedCollection =
      inputCollection.apply(Window.into(new CustomWindowFn<>()));
  PCollection<Long> count =
      windowedCollection.apply(
          Combine.globally(Count.<KV<Integer, String>>combineFn()).withoutDefaults());
  // "small1" and "big" elements merged into bigWindow "small2" not merged
  // because it is not contained in bigWindow
  PAssert.that("Wrong number of elements in output collection", count).containsInAnyOrder(2L, 1L);
  pipeline.run();
}
 
Example #16
Source File: CombineTranslation.java    From beam with Apache License 2.0
@Override
public FunctionSpec translate(
    AppliedPTransform<?, ?, Combine.Globally<?, ?>> transform, SdkComponents components)
    throws IOException {
  if (transform.getTransform().getSideInputs().isEmpty()) {
    return FunctionSpec.newBuilder()
        .setUrn(getUrn(transform.getTransform()))
        .setPayload(
            payloadForCombineGlobally((AppliedPTransform) transform, components).toByteString())
        .build();
  } else {
    // Combines with side inputs are translated as generic composites, which have a blank
    // FunctionSpec.
    return null;
  }
}
 
Example #17
Source File: CombineRunnersTest.java    From beam with Apache License 2.0
@Before
public void createPipeline() throws Exception {
  // Create pipeline with an input pCollection, combine, and output pCollection.
  TestCombineFn combineFn = new TestCombineFn();
  Combine.PerKey<String, String, Integer> combine = Combine.perKey(combineFn);

  Pipeline p = Pipeline.create();
  PCollection<KV<String, String>> inputPCollection = p.apply(Create.of(KV.of("unused", "0")));
  inputPCollection.setCoder(KvCoder.of(StringUtf8Coder.of(), StringUtf8Coder.of()));
  PCollection<KV<String, Integer>> outputPCollection =
      inputPCollection.apply(TEST_COMBINE_ID, combine);
  outputPCollection.setCoder(KvCoder.of(StringUtf8Coder.of(), BigEndianIntegerCoder.of()));

  // Create FnApi protos needed for the runner.
  SdkComponents sdkComponents = SdkComponents.create(p.getOptions());
  pProto = PipelineTranslation.toProto(p, sdkComponents);
  inputPCollectionId = sdkComponents.registerPCollection(inputPCollection);
  outputPCollectionId = sdkComponents.registerPCollection(outputPCollection);
  pTransform = pProto.getComponents().getTransformsOrThrow(TEST_COMBINE_ID);
}
 
Example #18
Source File: HCatalogIOIT.java    From beam with Apache License 2.0
@Test
public void writeAndReadAll() {
  pipelineWrite
      .apply("Generate sequence", Create.of(buildHCatRecords(options.getNumberOfRecords())))
      .apply(
          HCatalogIO.write()
              .withConfigProperties(configProperties)
              .withDatabase(options.getHCatalogHiveDatabaseName())
              .withTable(tableName));
  pipelineWrite.run().waitUntilFinish();

  PCollection<String> testRecords =
      pipelineRead
          .apply(
              HCatalogIO.read()
                  .withConfigProperties(configProperties)
                  .withDatabase(options.getHCatalogHiveDatabaseName())
                  .withTable(tableName))
          .apply(ParDo.of(new CreateHCatFn()));

  PCollection<String> consolidatedHashcode =
      testRecords.apply("Calculate hashcode", Combine.globally(new HashingFn()));

  String expectedHash = getHashForRecordCount(options.getNumberOfRecords(), EXPECTED_HASHES);
  PAssert.thatSingleton(consolidatedHashcode).isEqualTo(expectedHash);

  pipelineRead.run().waitUntilFinish();
}
 
Example #19
Source File: BucketingFunction.java    From beam with Apache License 2.0
public BucketingFunction(
    long bucketWidthMs,
    int numSignificantBuckets,
    int numSignificantSamples,
    Combine.BinaryCombineLongFn function) {
  this.bucketWidthMs = bucketWidthMs;
  this.numSignificantBuckets = numSignificantBuckets;
  this.numSignificantSamples = numSignificantSamples;
  this.function = function;
  this.buckets = new HashMap<>();
}
 
Example #20
Source File: AggregateRuntime.java    From components with Apache License 2.0
@Override
public PCollection<IndexedRecord> expand(PCollection<IndexedRecord> indexedRecordPCollection) {
    // Return an empty result if there are no operations in the list. This is normally not a permitted operation.
    if (operationFieldPathList.size() == 0)
        return (PCollection<IndexedRecord>) (PCollection) indexedRecordPCollection.getPipeline().apply(
                Create.empty(AvroCoder.of(AvroUtils.createEmptySchema())));

    PCollection<KV<IndexedRecord, IndexedRecord>> kv = indexedRecordPCollection
            .apply(ParDo.of(new ExtractKVFn(new ArrayList<>(groupByFieldPathList),
                    new ArrayList<>(operationFieldPathList))))
            .setCoder(KvCoder.of(LazyAvroCoder.of(), LazyAvroCoder.of()));

    PCollection<KV<IndexedRecord, IndexedRecord>> aggregateResult = kv
            .apply(Combine.<IndexedRecord, IndexedRecord, IndexedRecord> perKey(new AggregateCombineFn(properties)))
            .setCoder(KvCoder.of(LazyAvroCoder.of(), NullableCoder.of(LazyAvroCoder.of())));

    PCollection<IndexedRecord> result = aggregateResult
            .apply(ParDo.of(new DoFn<KV<IndexedRecord, IndexedRecord>, KV<IndexedRecord, IndexedRecord>>() {

                @ProcessElement
                public void processElement(ProcessContext c) {
                    /**
                     * Filter out null values emitted when AggregateCombineFn has nothing to output; see {@link
                     * org.talend.components.processing.runtime.aggregate.AggregateCombineFn#extractOutput(AggregateCombineFn.AggregateAccumulator)}
                     */
                    if (c.element().getValue() != null) {
                        c.output(c.element());
                    }
                }
            }))
            .apply(ParDo.of(new MergeKVFn()))
            .setCoder(LazyAvroCoder.of());

    return result;
}
 
Example #21
Source File: KinesisIOIT.java    From beam with Apache License 2.0
/** Read test dataset from Kinesis stream. */
private void runRead() {
  PCollection<KinesisRecord> output =
      pipelineRead.apply(
          KinesisIO.read()
              .withStreamName(options.getAwsKinesisStream())
              .withAWSClientsProvider(
                  options.getAwsAccessKey(),
                  options.getAwsSecretKey(),
                  Regions.fromName(options.getAwsKinesisRegion()))
              .withMaxNumRecords(numberOfRows)
              // to prevent endless running in case of error
              .withMaxReadTime(Duration.standardMinutes(10))
              .withInitialPositionInStream(InitialPositionInStream.AT_TIMESTAMP)
              .withInitialTimestampInStream(now)
              .withRequestRecordsLimit(1000));

  PAssert.thatSingleton(output.apply("Count All", Count.globally()))
      .isEqualTo((long) numberOfRows);

  PCollection<String> consolidatedHashcode =
      output
          .apply(ParDo.of(new ExtractDataValues()))
          .apply("Hash row contents", Combine.globally(new HashingFn()).withoutDefaults());

  PAssert.that(consolidatedHashcode)
      .containsInAnyOrder(TestRow.getExpectedHashForRowCount(numberOfRows));

  pipelineRead.run().waitUntilFinish();
}
 
Example #22
Source File: FlinkStateInternals.java    From beam with Apache License 2.0
FlinkCombiningState(
    KeyedStateBackend<ByteBuffer> flinkStateBackend,
    String stateId,
    Combine.CombineFn<InputT, AccumT, OutputT> combineFn,
    StateNamespace namespace,
    Coder<AccumT> accumCoder) {

  this.namespace = namespace;
  this.stateId = stateId;
  this.combineFn = combineFn;
  this.flinkStateBackend = flinkStateBackend;

  flinkStateDescriptor =
      new ValueStateDescriptor<>(stateId, new CoderTypeSerializer<>(accumCoder));
}
 
Example #23
Source File: SamzaPublishViewTransformOverride.java    From beam with Apache License 2.0
@Override
public PCollection<ElemT> expand(PCollection<ElemT> input) {
  // This actually creates a branch in the graph that publishes the view but then returns
  // the original input. This is copied from the Flink runner.
  input
      .apply(Combine.globally(new Concatenate<ElemT>()).withoutDefaults())
      .apply(new SamzaPublishView<>(view));
  return input;
}
 
Example #24
Source File: CombineTranslationTest.java    From beam with Apache License 2.0
@Test
public void testToProto() throws Exception {
  PCollection<Integer> input = pipeline.apply(Create.of(1, 2, 3));
  input.apply(Combine.globally(combineFn));
  final AtomicReference<AppliedPTransform<?, ?, Combine.Globally<?, ?>>> combine =
      new AtomicReference<>();
  pipeline.traverseTopologically(
      new PipelineVisitor.Defaults() {
        @Override
        public void leaveCompositeTransform(Node node) {
          if (node.getTransform() instanceof Combine.Globally) {
            checkState(combine.get() == null);
            combine.set((AppliedPTransform) node.toAppliedPTransform(getPipeline()));
          }
        }
      });
  checkState(combine.get() != null);
  assertEquals(combineFn, combine.get().getTransform().getFn());

  SdkComponents sdkComponents = SdkComponents.create();
  sdkComponents.registerEnvironment(Environments.createDockerEnvironment("java"));
  CombinePayload combineProto =
      CombineTranslation.CombineGloballyPayloadTranslator.payloadForCombineGlobally(
          (AppliedPTransform) combine.get(), sdkComponents);
  RunnerApi.Components componentsProto = sdkComponents.toComponents();

  assertEquals(
      combineFn.getAccumulatorCoder(pipeline.getCoderRegistry(), input.getCoder()),
      getAccumulatorCoder(combineProto, RehydratedComponents.forComponents(componentsProto)));
  assertEquals(
      combineFn,
      SerializableUtils.deserializeFromByteArray(
          combineProto.getCombineFn().getPayload().toByteArray(), "CombineFn"));
}
 
Example #25
Source File: Group.java    From beam with Apache License 2.0
@Override
public PCollection<Row> expand(PCollection<InputT> input) {
  SchemaAggregateFn.Inner fn = schemaAggregateFn.withSchema(input.getSchema());
  return input
      .apply("toRows", Convert.toRows())
      .apply("Global Combine", Combine.globally(fn))
      .setRowSchema(fn.getOutputSchema());
}
 
Example #26
Source File: CombineTranslation.java    From beam with Apache License 2.0
/** Produces a {@link RunnerApi.CombinePayload} from a {@link Combine.Globally}. */
@VisibleForTesting
static <InputT, OutputT> CombinePayload payloadForCombineGlobally(
    final AppliedPTransform<
            PCollection<InputT>, PCollection<OutputT>, Combine.Globally<InputT, OutputT>>
        transform,
    final SdkComponents components)
    throws IOException {
  GlobalCombineFn<?, ?, ?> combineFn = transform.getTransform().getFn();
  Coder<?> accumulatorCoder = extractAccumulatorCoder(combineFn, (AppliedPTransform) transform);
  return combinePayload(combineFn, accumulatorCoder, components);
}
 
Example #27
Source File: KafkaIOIT.java    From beam with Apache License 2.0
@Test
public void testKafkaIOReadsAndWritesCorrectly() throws IOException {
  writePipeline
      .apply("Generate records", Read.from(new SyntheticBoundedSource(sourceOptions)))
      .apply("Measure write time", ParDo.of(new TimeMonitor<>(NAMESPACE, WRITE_TIME_METRIC_NAME)))
      .apply("Write to Kafka", writeToKafka());

  PCollection<String> hashcode =
      readPipeline
          .apply("Read from Kafka", readFromKafka())
          .apply(
              "Measure read time", ParDo.of(new TimeMonitor<>(NAMESPACE, READ_TIME_METRIC_NAME)))
          .apply("Map records to strings", MapElements.via(new MapKafkaRecordsToStrings()))
          .apply("Calculate hashcode", Combine.globally(new HashingFn()).withoutDefaults());

  PAssert.thatSingleton(hashcode).isEqualTo(expectedHashcode);

  PipelineResult writeResult = writePipeline.run();
  writeResult.waitUntilFinish();

  PipelineResult readResult = readPipeline.run();
  PipelineResult.State readState =
      readResult.waitUntilFinish(Duration.standardSeconds(options.getReadTimeout()));

  cancelIfTimeouted(readResult, readState);

  Set<NamedTestResult> metrics = readMetrics(writeResult, readResult);
  IOITMetrics.publish(options.getBigQueryDataset(), options.getBigQueryTable(), metrics);
  IOITMetrics.publishToInflux(TEST_ID, TIMESTAMP, metrics, settings);
}
 
Example #28
Source File: CombineValuesFnFactoryTest.java    From beam with Apache License 2.0
@Test
public void testCombineValuesFnAll() throws Exception {
  TestReceiver receiver = new TestReceiver();

  Combine.CombineFn<Integer, CountSum, String> combiner = (new MeanInts());

  ParDoFn combineParDoFn =
      createCombineValuesFn(
          CombinePhase.ALL,
          combiner,
          StringUtf8Coder.of(),
          BigEndianIntegerCoder.of(),
          new CountSumCoder(),
          WindowingStrategy.globalDefault());

  combineParDoFn.startBundle(receiver);
  combineParDoFn.processElement(
      WindowedValue.valueInGlobalWindow(KV.of("a", Arrays.asList(5, 6, 7))));
  combineParDoFn.processElement(
      WindowedValue.valueInGlobalWindow(KV.of("b", Arrays.asList(1, 3, 7))));
  combineParDoFn.processElement(
      WindowedValue.valueInGlobalWindow(KV.of("c", Arrays.asList(3, 6, 8, 9))));
  combineParDoFn.finishBundle();

  Object[] expectedReceivedElems = {
    WindowedValue.valueInGlobalWindow(KV.of("a", String.format("%.1f", 6.0))),
    WindowedValue.valueInGlobalWindow(KV.of("b", String.format("%.1f", 3.7))),
    WindowedValue.valueInGlobalWindow(KV.of("c", String.format("%.1f", 6.5))),
  };
  assertArrayEquals(expectedReceivedElems, receiver.receivedElems.toArray());
}
 
Example #29
Source File: DataflowPipelineTranslator.java    From beam with Apache License 2.0
private <K, InputT, OutputT> void translateHelper(
    final CombineGroupedValues<K, InputT, OutputT> primitiveTransform,
    TranslationContext context) {
  Combine.GroupedValues<K, InputT, OutputT> originalTransform =
      primitiveTransform.getOriginalCombine();
  StepTranslationContext stepContext =
      context.addStep(primitiveTransform, "CombineValues");
  translateInputs(
      stepContext,
      context.getInput(primitiveTransform),
      originalTransform.getSideInputs(),
      context);

  AppliedCombineFn<? super K, ? super InputT, ?, OutputT> fn =
      originalTransform.getAppliedFn(
          context.getInput(primitiveTransform).getPipeline().getCoderRegistry(),
          context.getInput(primitiveTransform).getCoder(),
          context.getInput(primitiveTransform).getWindowingStrategy());

  stepContext.addEncodingInput(fn.getAccumulatorCoder());

  List<String> experiments = context.getPipelineOptions().getExperiments();
  boolean isFnApi = experiments != null && experiments.contains("beam_fn_api");

  if (isFnApi) {
    String ptransformId =
        context.getSdkComponents().getPTransformIdOrThrow(context.getCurrentParent());
    stepContext.addInput(PropertyNames.SERIALIZED_FN, ptransformId);
  } else {
    stepContext.addInput(
        PropertyNames.SERIALIZED_FN, byteArrayToJsonString(serializeToByteArray(fn)));
  }

  stepContext.addOutput(PropertyNames.OUTPUT, context.getOutput(primitiveTransform));
}
 
Example #30
Source File: MultiStepCombineTest.java    From beam with Apache License 2.0
@Test
public void testMultiStepCombineWindowed() {
  SlidingWindows windowFn = SlidingWindows.of(Duration.millis(6L)).every(Duration.millis(3L));
  PCollection<KV<String, Long>> combined =
      pipeline
          .apply(
              Create.timestamped(
                  TimestampedValue.of(KV.of("foo", 1L), new Instant(1L)),
                  TimestampedValue.of(KV.of("bar", 2L), new Instant(2L)),
                  TimestampedValue.of(KV.of("bizzle", 3L), new Instant(3L)),
                  TimestampedValue.of(KV.of("bar", 4L), new Instant(4L)),
                  TimestampedValue.of(KV.of("bizzle", 11L), new Instant(11L))))
          .apply(Window.into(windowFn))
          .apply(Combine.perKey(new MultiStepCombineFn()));

  PAssert.that("Windows should combine only elements in their windows", combined)
      .inWindow(new IntervalWindow(new Instant(0L), Duration.millis(6L)))
      .containsInAnyOrder(KV.of("foo", 1L), KV.of("bar", 6L), KV.of("bizzle", 3L));
  PAssert.that("Elements should appear in all the windows they are assigned to", combined)
      .inWindow(new IntervalWindow(new Instant(-3L), Duration.millis(6L)))
      .containsInAnyOrder(KV.of("foo", 1L), KV.of("bar", 2L));
  PAssert.that(combined)
      .inWindow(new IntervalWindow(new Instant(6L), Duration.millis(6L)))
      .containsInAnyOrder(KV.of("bizzle", 11L));
  PAssert.that(combined)
      .containsInAnyOrder(
          KV.of("foo", 1L),
          KV.of("foo", 1L),
          KV.of("bar", 6L),
          KV.of("bar", 2L),
          KV.of("bar", 4L),
          KV.of("bizzle", 11L),
          KV.of("bizzle", 11L),
          KV.of("bizzle", 3L),
          KV.of("bizzle", 3L));
  pipeline.run();
}