org.apache.beam.sdk.transforms.GroupByKey Java Examples

The following examples show how to use org.apache.beam.sdk.transforms.GroupByKey. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: DataflowRunner.java    From beam with Apache License 2.0 6 votes vote down vote up
/**
 * Groups all values per key and re-emits each group as a sequence of batches.
 *
 * @param input the keyed elements to batch
 * @return one output element per key and non-empty batch produced by {@code
 *     Iterators.partition} with {@code batchSize} as the partition size
 * @throws ArithmeticException at processing time if {@code batchSize} does not fit in an {@code
 *     int}
 */
@Override
public PCollection<KV<K, Iterable<V>>> expand(PCollection<KV<K, V>> input) {
  return input
      .apply("GroupAll", GroupByKey.create())
      .apply(
          "SplitIntoBatches",
          ParDo.of(
              new DoFn<KV<K, Iterable<V>>, KV<K, Iterable<V>>>() {
                @ProcessElement
                public void process(ProcessContext c) {
                  KV<K, Iterable<V>> grouped = c.element();

                  // A plain (int) cast would silently wrap a batch size wider than int into a
                  // wrong (possibly negative) value; fail fast instead.
                  int maxBatchSize = Math.toIntExact(batchSize);

                  // Iterators.partition lazily creates the partitions as they are accessed
                  // allowing it to partition very large iterators.
                  Iterator<List<V>> batches =
                      Iterators.partition(grouped.getValue().iterator(), maxBatchSize);

                  // Note that GroupIntoBatches only outputs when the batch is non-empty.
                  while (batches.hasNext()) {
                    c.output(KV.of(grouped.getKey(), batches.next()));
                  }
                }
              }));
}
 
Example #2
Source File: CacheTest.java    From beam with Apache License 2.0 6 votes vote down vote up
/**
 * Checks {@code EvaluationContext#shouldCache} for a cache candidate with caching disabled, with
 * caching enabled, and for a {@link GroupByKey} transform.
 */
@Test
public void shouldCacheTest() {
  SparkPipelineOptions sparkOptions = createOptions();
  sparkOptions.setCacheDisabled(true);
  Pipeline testPipeline = Pipeline.create(sparkOptions);

  JavaSparkContext sparkContext = SparkContextFactory.getSparkContext(sparkOptions);
  EvaluationContext evaluationContext =
      new EvaluationContext(sparkContext, testPipeline, sparkOptions);

  // Register a mocked collection as a cache candidate.
  PCollection candidate = mock(PCollection.class);
  evaluationContext.getCacheCandidates().put(candidate, 2L);

  Values<String> createTransform = Create.of("foo", "bar");

  // Caching is disabled in the options, so the candidate must not be cached.
  assertFalse(evaluationContext.shouldCache(createTransform, candidate));

  // After re-enabling caching the candidate is cached for the Create transform ...
  sparkOptions.setCacheDisabled(false);
  assertTrue(evaluationContext.shouldCache(createTransform, candidate));

  // ... but not when the transform is a GroupByKey.
  GroupByKey<String, String> groupByKey = GroupByKey.create();
  assertFalse(evaluationContext.shouldCache(groupByKey, candidate));
}
 
Example #3
Source File: WriteFiles.java    From beam with Apache License 2.0 6 votes vote down vote up
/**
 * Applies a sharding key to every input record, groups the records by that key and passes each
 * group to {@code WriteShardsIntoTempFilesFn}.
 *
 * @param input the records to write
 * @return the output of the "WriteShardsIntoTempFiles" step, coded with {@code fileResultCoder}
 */
@Override
public PCollection<FileResult<DestinationT>> expand(PCollection<UserT> input) {
  // Use the configured sharding function, or a random one when none was configured.
  ShardingFunction<UserT, DestinationT> shardingFunction;
  if (getShardingFunction() == null) {
    shardingFunction = new RandomShardingFunction(destinationCoder);
  } else {
    shardingFunction = getShardingFunction();
  }

  // The sharding step sees the regular side inputs plus the shard-count view, if there is one.
  List<PCollectionView<?>> shardingSideInputs = Lists.newArrayList(getSideInputs());
  if (numShardsView != null) {
    shardingSideInputs.add(numShardsView);
  }

  return input
      .apply(
          "ApplyShardingKey",
          ParDo.of(new ApplyShardingFunctionFn(shardingFunction, numShardsView))
              .withSideInputs(shardingSideInputs))
      .setCoder(KvCoder.of(ShardedKeyCoder.of(VarIntCoder.of()), input.getCoder()))
      .apply("GroupIntoShards", GroupByKey.create())
      .apply(
          "WriteShardsIntoTempFiles",
          ParDo.of(new WriteShardsIntoTempFilesFn()).withSideInputs(getSideInputs()))
      .setCoder(fileResultCoder);
}
 
Example #4
Source File: CreateStreamTest.java    From beam with Apache License 2.0 6 votes vote down vote up
/**
 * Feeds two elements stamped with the global window's maximum timestamp through a six-hour fixed
 * windowing and a GroupByKey, and asserts both land in the window assigned to that timestamp.
 */
@Test
public void testElementsAtAlmostPositiveInfinity() throws IOException {
  Instant endOfGlobalWindow = GlobalWindow.INSTANCE.maxTimestamp();
  FixedWindows sixHourWindows = FixedWindows.of(Duration.standardHours(6));

  CreateStream<String> lateElements =
      CreateStream.of(StringUtf8Coder.of(), batchDuration())
          .nextBatch(
              TimestampedValue.of("foo", endOfGlobalWindow),
              TimestampedValue.of("bar", endOfGlobalWindow))
          .advanceNextBatchWatermarkToInfinity();

  // Key everything with the same key, group, then unpack the grouped values again.
  PCollection<String> regrouped =
      p.apply(lateElements)
          .apply(Window.into(sixHourWindows))
          .apply(WithKeys.of(1))
          .apply(GroupByKey.create())
          .apply(Values.create())
          .apply(Flatten.iterables());

  PAssert.that(regrouped)
      .inWindow(sixHourWindows.assignWindow(endOfGlobalWindow))
      .containsInAnyOrder("foo", "bar");
  p.run();
}
 
Example #5
Source File: NonMergingGroupByKeyTest.java    From beam with Apache License 2.0 6 votes vote down vote up
/**
 * Runs a pipeline that re-iterates a GroupByKey result and expects the run to fail with an
 * {@link IllegalStateException} whose message mentions that the result is not re-iterable.
 */
@Test
public void testDisabledReIterationThrowsAnException() {
  // If output during closing is not supported, we can not chain DoFns and results
  // are therefore materialized during output serialization.
  Assume.assumeTrue(FlinkCapabilities.supportsOutputDuringClosing());
  final Pipeline p = FlinkTestPipeline.createForBatch();
  p.apply(Create.of(Arrays.asList(KV.of("a", 1), KV.of("b", 2), KV.of("c", 3))))
      .apply(GroupByKey.create())
      .apply(ParDo.of(new ReiterateDoFn<>()));
  Pipeline.PipelineExecutionException resultException = null;
  try {
    p.run().waitUntilFinish();
  } catch (Pipeline.PipelineExecutionException exception) {
    resultException = exception;
  }
  // Report an unexpectedly successful run as a descriptive assertion failure instead of a
  // message-less NullPointerException.
  Assert.assertNotNull(
      "Expected the pipeline run to fail with a PipelineExecutionException", resultException);
  final Throwable cause = resultException.getCause();
  Assert.assertEquals(IllegalStateException.class, cause.getClass());
  Assert.assertTrue(cause.getMessage().contains("GBK result is not re-iterable."));
}
 
Example #6
Source File: UnboundedWrite.java    From components with Apache License 2.0 6 votes vote down vote up
/**
 * Keys the windowed input by {@code Instant}, groups it, and hands every group to
 * {@code UnboundedWriteToFile}.
 *
 * @param in the key/value records to write
 * @return a {@code PDone} in the pipeline of the windowed input
 */
@Override
public PDone expand(PCollection<KV<K, V>> in) {
    // Make sure that a window has been applied.
    PCollection<KV<K, V>> windowed = ofDefaultWindow(in);

    // Add an artificial GroupByKey to collect the window results together.
    PCollection<KV<Instant, KV<K, V>>> keyedByInstant =
            windowed.apply("GroupToOneShard", ParDo.of(new GroupToOneShard<KV<K, V>>()))
                    .setCoder(KvCoder.of(InstantCoder.of(), windowed.getCoder()));
    PCollection<KV<Instant, Iterable<KV<K, V>>>> groupedByInstant =
            keyedByInstant.apply(GroupByKey.<Instant, KV<K, V>> create());

    groupedByInstant.apply("UnboundedWrite", ParDo.of(new UnboundedWriteToFile<K, V>(sink)));

    return PDone.in(windowed.getPipeline());
}
 
Example #7
Source File: GatherAllPanes.java    From beam with Apache License 2.0 6 votes vote down vote up
/**
 * Reifies each element's window information, keys everything with the constant key {@code 0},
 * re-windows with an identity window function that never triggers, groups by key and drops the
 * key again. The input's original windowing strategy is restored on the result.
 *
 * @param input the collection whose elements are gathered
 * @return the grouped {@code ValueInSingleWindow} iterables
 */
@Override
public PCollection<Iterable<ValueInSingleWindow<T>>> expand(PCollection<T> input) {
  WindowFn<?, ?> inputWindowFn = input.getWindowingStrategy().getWindowFn();
  IdentityWindowFn<KV<Integer, ValueInSingleWindow<T>>> identityWindowFn =
      new IdentityWindowFn<KV<Integer, ValueInSingleWindow<T>>>(inputWindowFn.windowCoder());

  return input
      .apply(Reify.windows())
      .apply(
          WithKeys.<Integer, ValueInSingleWindow<T>>of(0)
              .withKeyType(new TypeDescriptor<Integer>() {}))
      .apply(
          Window.into(identityWindowFn)
              .triggering(Never.ever())
              .withAllowedLateness(input.getWindowingStrategy().getAllowedLateness())
              .discardingFiredPanes())
      // Every element shares key 0, so each group comes out as one output element.
      .apply(GroupByKey.create())
      .apply(Values.create())
      .setWindowingStrategyInternal(input.getWindowingStrategy());
}
 
Example #8
Source File: GroupByKeyTranslator.java    From beam with Apache License 2.0 6 votes vote down vote up
/**
 * Picks the {@code SystemReduceFn} matching the given transform: a buffering one for
 * {@link GroupByKey}, a combining one for {@code Combine.PerKey}.
 *
 * @param transform the transform being translated
 * @param pipeline supplies the coder registry used for the combining case
 * @param kvInputCoder coder of the transform's key/value input
 * @return the reduce function for {@code transform}
 * @throws RuntimeException if {@code transform} is neither of the two supported kinds
 */
@SuppressWarnings("unchecked")
private static <K, InputT, OutputT>
    SystemReduceFn<K, InputT, ?, OutputT, BoundedWindow> getSystemReduceFn(
        PTransform<PCollection<KV<K, InputT>>, PCollection<KV<K, OutputT>>> transform,
        Pipeline pipeline,
        KvCoder<K, InputT> kvInputCoder) {
  // Plain grouping: buffer the values.
  if (transform instanceof GroupByKey) {
    return (SystemReduceFn<K, InputT, ?, OutputT, BoundedWindow>)
        SystemReduceFn.buffering(kvInputCoder.getValueCoder());
  }

  // Anything that is not a per-key combine cannot be handled here.
  if (!(transform instanceof Combine.PerKey)) {
    throw new RuntimeException("Transform " + transform + " cannot be translated as GroupByKey.");
  }

  final CombineFnBase.GlobalCombineFn<? super InputT, ?, OutputT> perKeyCombineFn =
      ((Combine.PerKey) transform).getFn();
  return SystemReduceFn.combining(
      kvInputCoder.getKeyCoder(),
      AppliedCombineFn.withInputCoder(
          perKeyCombineFn, pipeline.getCoderRegistry(), kvInputCoder));
}
 
Example #9
Source File: GroupByKeyLoadTest.java    From beam with Apache License 2.0 6 votes vote down vote up
/**
 * Builds the load-test pipeline: reads the source, attaches monitoring steps, applies windowing
 * and then adds one "synthetic step, group by key, ungroup and reiterate" branch per configured
 * fanout.
 */
@Override
void loadTest() throws IOException {
  Optional<SyntheticStep> optionalStep = createStep(options.getStepOptions());

  PCollection<KV<byte[], byte[]>> monitoredInput =
      pipeline
          .apply("Read input", readFromSource(sourceOptions))
          .apply("Collect start time metrics", ParDo.of(runtimeMonitor))
          .apply(
              "Total bytes monitor",
              ParDo.of(new ByteMonitor(METRICS_NAMESPACE, "totalBytes.count")));

  PCollection<KV<byte[], byte[]>> windowedInput = applyWindowing(monitoredInput);

  int branch = 0;
  while (branch < options.getFanout()) {
    // Step names carry the branch index so that every branch gets distinct names.
    String syntheticStepName = format("Synthetic step (%s)", branch);
    String groupStepName = format("Group by key (%s)", branch);
    String ungroupStepName = format("Ungroup and reiterate (%s)", branch);
    String endMetricsStepName = format("Collect end time metrics (%s)", branch);

    applyStepIfPresent(windowedInput, syntheticStepName, optionalStep)
        .apply(groupStepName, GroupByKey.create())
        .apply(ungroupStepName, ParDo.of(new UngroupAndReiterate(options.getIterations())))
        .apply(endMetricsStepName, ParDo.of(runtimeMonitor));
    branch++;
  }
}
 
Example #10
Source File: GroupByKeyTest.java    From beam with Apache License 2.0 6 votes vote down vote up
/** Groups six key/value pairs under two keys and checks the values collected for each key. */
@Test
public void testGroupByKey() {
  List<KV<Integer, Integer>> keyedValues = new ArrayList<>();
  // Odd values under key 1.
  keyedValues.add(KV.of(1, 1));
  keyedValues.add(KV.of(1, 3));
  keyedValues.add(KV.of(1, 5));
  // Even values under key 2.
  keyedValues.add(KV.of(2, 2));
  keyedValues.add(KV.of(2, 4));
  keyedValues.add(KV.of(2, 6));

  PCollection<KV<Integer, Iterable<Integer>>> grouped =
      pipeline.apply(Create.of(keyedValues)).apply(GroupByKey.create());

  PAssert.thatMap(grouped)
      .satisfies(
          valuesByKey -> {
            assertThat(valuesByKey.get(1), containsInAnyOrder(1, 3, 5));
            assertThat(valuesByKey.get(2), containsInAnyOrder(2, 4, 6));
            return null;
          });
  pipeline.run();
}
 
Example #11
Source File: TestStreamTest.java    From beam with Apache License 2.0 6 votes vote down vote up
/**
 * Sends two elements stamped with the global window's maximum timestamp through a six-hour fixed
 * windowing and a GroupByKey, and asserts both land in the window assigned to that timestamp.
 */
@Test
@Category({NeedsRunner.class, UsesTestStream.class})
public void testElementsAtAlmostPositiveInfinity() {
  Instant endOfGlobalWindow = GlobalWindow.INSTANCE.maxTimestamp();
  FixedWindows sixHourWindows = FixedWindows.of(Duration.standardHours(6));

  TestStream<String> lateElements =
      TestStream.create(StringUtf8Coder.of())
          .addElements(
              TimestampedValue.of("foo", endOfGlobalWindow),
              TimestampedValue.of("bar", endOfGlobalWindow))
          .advanceWatermarkToInfinity();

  // Key everything with the same key, group, then unpack the grouped values again.
  PCollection<String> regrouped =
      p.apply(lateElements)
          .apply(into(sixHourWindows))
          .apply(WithKeys.of(1))
          .apply(GroupByKey.create())
          .apply(Values.create())
          .apply(Flatten.iterables());

  PAssert.that(regrouped)
      .inWindow(sixHourWindows.assignWindow(endOfGlobalWindow))
      .containsInAnyOrder("foo", "bar");
  p.run();
}
 
Example #12
Source File: SortValuesTest.java    From beam with Apache License 2.0 6 votes vote down vote up
/**
 * Groups {@code <Key, <SecondaryKey, Value>>} pairs by key, applies {@code SortValues} to each
 * group and validates the result with the dedicated assertion function.
 */
@Test
public void testSecondaryKeySorting() {
  // Unsorted <Key, <SecondaryKey, Value>> pairs spread over two primary keys.
  PCollection<KV<String, KV<String, Integer>>> unsortedPairs =
      p.apply(
          Create.of(
              Arrays.asList(
                  KV.of("key1", KV.of("secondaryKey2", 20)),
                  KV.of("key2", KV.of("secondaryKey2", 200)),
                  KV.of("key1", KV.of("secondaryKey3", 30)),
                  KV.of("key1", KV.of("secondaryKey1", 10)),
                  KV.of("key2", KV.of("secondaryKey1", 100)))));

  // Step 1: collect all <SecondaryKey, Value> pairs that share a primary key.
  PCollection<KV<String, Iterable<KV<String, Integer>>>> byPrimaryKey =
      unsortedPairs.apply(GroupByKey.create());

  // Step 2: within each primary key, sort the pairs by their secondary key.
  PCollection<KV<String, Iterable<KV<String, Integer>>>> sortedBySecondaryKey =
      byPrimaryKey.apply(SortValues.create(BufferedExternalSorter.options()));

  PAssert.that(sortedBySecondaryKey)
      .satisfies(new AssertThatHasExpectedContentsForTestSecondaryKeySorting());

  p.run();
}
 
Example #13
Source File: DataflowGroupByKeyTest.java    From beam with Apache License 2.0 6 votes vote down vote up
/**
 * Applies two consecutive GroupByKeys to a session-windowed collection and expects an
 * {@link IllegalStateException} about the missing valid window merge function.
 */
@Test
public void testInvalidWindowsService() {
  Pipeline p = createTestServiceRunner();

  // An empty input is enough; the failure is expected while the pipeline is being built.
  List<KV<String, Integer>> noPairs = Arrays.asList();

  PCollection<KV<String, Integer>> sessionWindowed =
      p.apply(
              Create.of(noPairs)
                  .withCoder(KvCoder.of(StringUtf8Coder.of(), BigEndianIntegerCoder.of())))
          .apply(Window.into(Sessions.withGapDuration(Duration.standardMinutes(1))));

  thrown.expect(IllegalStateException.class);
  thrown.expectMessage("GroupByKey must have a valid Window merge function");

  PCollection<KV<String, Iterable<Integer>>> groupedOnce =
      sessionWindowed.apply("GroupByKey", GroupByKey.create());
  groupedOnce.apply("GroupByKeyAgain", GroupByKey.create());
}
 
Example #14
Source File: IndexerPipeline.java    From dataflow-opinion-analysis with Apache License 2.0 6 votes vote down vote up
/**
 * Groups content index summaries by their soft-deduplication key and splits them into content
 * to keep and content to deduplicate.
 *
 * @param indexes the content index summaries to examine
 * @return a POJO containing 2 PCollections: Unique docs, and Duplicates
 */
private static ContentDuplicateOrNot filterSoftDuplicates(
		PCollection<ContentIndexSummary> indexes) {
	// Main output: content not to dedupe; additional output: content to dedupe.
	PCollectionTuple splitByDuplication = indexes
		.apply("Extract Text grouping key", 
			ParDo.of(new GetContentIndexSummaryKeyFn()))
		.apply("Group by Text grouping key", 
			GroupByKey.<ContentSoftDeduplicationKey, ContentIndexSummary>create())
		.apply("Eliminate Text dupes", 
			ParDo.of(new EliminateTextDupes())
				.withOutputTags(PipelineTags.indexedContentNotToDedupeTag, 
					TupleTagList.of(PipelineTags.indexedContentToDedupeTag)));

	// The duplicates are converted to table rows.
	PCollection<TableRow> duplicateRows =
		splitByDuplication.get(PipelineTags.indexedContentToDedupeTag)
			.apply(ParDo.of(new CreateWebresourceTableRowFromDupeIndexSummaryFn()));

	return new ContentDuplicateOrNot(
		splitByDuplication.get(PipelineTags.indexedContentNotToDedupeTag),
		duplicateRows);
}
 
Example #15
Source File: DataflowPTransformMatchersTest.java    From beam with Apache License 2.0 6 votes vote down vote up
/**
 * Builds a minimal pipeline in which a {@link Combine.GroupedValues} with a singleton side input
 * follows a {@link GroupByKey}.
 *
 * @return the constructed pipeline, with abandoned-node enforcement turned off
 */
private static TestPipeline createCombineGroupedValuesWithSideInputsPipeline() {
  TestPipeline testPipeline = TestPipeline.create().enableAbandonedNodeEnforcement(false);

  // Side input: a singleton view over a one-element collection.
  PCollection<String> sideInputSource = testPipeline.apply(Create.of("side input"));
  PCollectionView<String> singletonSideInput = sideInputSource.apply(View.asSingleton());

  // Main input: a single key/value pair with an explicit coder.
  PCollection<KV<String, Integer>> keyedInput =
      testPipeline
          .apply(Create.of(KV.of("key", 1)))
          .setCoder(KvCoder.of(StringUtf8Coder.of(), VarIntCoder.of()));

  keyedInput
      .apply(GroupByKey.create())
      .apply(
          Combine.<String, Integer, Integer>groupedValues(new SumCombineFnWithContext())
              .withSideInputs(singletonSideInput));

  return testPipeline;
}
 
Example #16
Source File: CloningBundleFactoryTest.java    From beam with Apache License 2.0 6 votes vote down vote up
/**
 * Adds one element to a keyed bundle and checks that the committed bundle holds an equal element
 * whose value is a different instance, and that the bundle reports the expected collection and
 * key.
 */
@Test
public void keyedBundleWorkingCoderSucceedsClonesOutput() {
  PCollection<Integer> numbers = p.apply(Create.of(1, 3).withCoder(VarIntCoder.of()));
  PCollection<KV<String, Iterable<Integer>>> groupedUnderFoo =
      numbers
          .apply(WithKeys.of("foo"))
          .setCoder(KvCoder.of(StringUtf8Coder.of(), VarIntCoder.of()))
          .apply(GroupByKey.create());

  WindowedValue<KV<String, Iterable<Integer>>> fooElement =
      WindowedValue.valueInGlobalWindow(
          KV.<String, Iterable<Integer>>of("foo", ImmutableList.of(1, 3)));

  CommittedBundle<KV<String, Iterable<Integer>>> committed =
      factory
          .createKeyedBundle(StructuralKey.of("foo", StringUtf8Coder.of()), groupedUnderFoo)
          .add(fooElement)
          .commit(Instant.now());

  // The committed element equals the added one ...
  assertThat(committed.getElements(), containsInAnyOrder(fooElement));
  // ... but its value must not be the very same instance.
  assertThat(
      Iterables.getOnlyElement(committed.getElements()).getValue(),
      not(theInstance(fooElement.getValue())));
  assertThat(committed.getPCollection(), equalTo(groupedUnderFoo));
  assertThat(committed.getKey(), equalTo(StructuralKey.of("foo", StringUtf8Coder.of())));
}
 
Example #17
Source File: BigQueryMerger.java    From DataflowTemplates with Apache License 2.0 6 votes vote down vote up
/**
 * Re-windows the input into the global window with a repeated processing-time trigger aligned to
 * {@code intervalDuration}, groups by key and emits the first value of every group.
 *
 * @param input the keyed elements to reduce
 * @return at most one key/value pair per key and fired pane
 */
@Override
public PCollection<KV<K, V>> expand(PCollection<KV<K, V>> input) {
  return input
      .apply(
          Window.<KV<K, V>>into(new GlobalWindows())
              .discardingFiredPanes()
              .triggering(
                  Repeatedly.forever(
                      AfterProcessingTime.pastFirstElementInPane()
                          .plusDelayOf(Duration.ZERO)
                          .alignedTo(intervalDuration, org.joda.time.Instant.now()))))
      .apply(GroupByKey.create())
      .apply(
          ParDo.of(
              new DoFn<KV<K, Iterable<V>>, KV<K, V>>() {
                @ProcessElement
                public void process(ProcessContext c) {
                  LOG.debug(
                      "TS: {} | Element: {} | Pane: {}", c.timestamp(), c.element(), c.pane());
                  KV<K, Iterable<V>> grouped = c.element();
                  Iterator<V> values = grouped.getValue().iterator();
                  // Nothing to emit for an empty group.
                  if (!values.hasNext()) {
                    return;
                  }
                  // Only the first value of the group is forwarded.
                  c.output(KV.of(grouped.getKey(), values.next()));
                }
              }));
}
 
Example #18
Source File: BigQueryMerger.java    From DataflowTemplates with Apache License 2.0 6 votes vote down vote up
/**
 * Places the input in the global window, fires repeatedly on processing time aligned to
 * {@code intervalDuration}, groups by key and forwards only the first value of each group.
 *
 * @param input the keyed elements to reduce
 * @return at most one key/value pair per key and fired pane
 */
@Override
public PCollection<KV<K, V>> expand(PCollection<KV<K, V>> input) {
  return input
      .apply(
          Window.<KV<K, V>>into(new GlobalWindows())
              .discardingFiredPanes()
              .triggering(
                  Repeatedly.forever(
                      AfterProcessingTime.pastFirstElementInPane()
                          .plusDelayOf(Duration.ZERO)
                          .alignedTo(intervalDuration, org.joda.time.Instant.now()))))
      .apply(GroupByKey.create())
      .apply(
          ParDo.of(
              new DoFn<KV<K, Iterable<V>>, KV<K, V>>() {
                @ProcessElement
                public void process(ProcessContext c) {
                  LOG.debug(
                      "TS: {} | Element: {} | Pane: {}", c.timestamp(), c.element(), c.pane());
                  KV<K, Iterable<V>> keyedGroup = c.element();
                  Iterator<V> groupValues = keyedGroup.getValue().iterator();
                  // An empty group produces no output.
                  if (!groupValues.hasNext()) {
                    return;
                  }
                  // Forward the first value only; the rest of the group is dropped.
                  c.output(KV.of(keyedGroup.getKey(), groupValues.next()));
                }
              }));
}
 
Example #19
Source File: GroupByKeyTest.java    From beam with Apache License 2.0 5 votes vote down vote up
/**
 * Groups timestamped pairs inside 10 ms fixed windows and asserts, per key and window, which
 * values end up together.
 */
@Test
public void testGroupByKeyPreservesWindowing() {
  pipeline
      .apply(
          Create.timestamped(
              TimestampedValue.of(KV.of(1, 1), new Instant(1)),
              TimestampedValue.of(KV.of(1, 3), new Instant(2)),
              TimestampedValue.of(KV.of(1, 5), new Instant(11)),
              TimestampedValue.of(KV.of(2, 2), new Instant(3)),
              TimestampedValue.of(KV.of(2, 4), new Instant(11)),
              TimestampedValue.of(KV.of(2, 6), new Instant(12))))
      .apply(Window.into(FixedWindows.of(Duration.millis(10))))
      .apply(GroupByKey.create())
      // Assert the windows manually because PAssert does not support multiple KVs with the
      // same key (one per window).
      .apply(
          ParDo.of(
              new DoFn<KV<Integer, Iterable<Integer>>, KV<Integer, Iterable<Integer>>>() {

                @ProcessElement
                public void processElement(ProcessContext context) {
                  KV<Integer, Iterable<Integer>> grouped = context.element();
                  Iterable<Integer> values = grouped.getValue();
                  int key = grouped.getKey();
                  // For each key, one window holds two values and the other holds one.
                  boolean hasTwoValues = Iterables.size(values) == 2;

                  if (key == 1 && hasTwoValues) {
                    assertThat(values, containsInAnyOrder(1, 3)); // window [0-10)
                  } else if (key == 1) {
                    assertThat(values, containsInAnyOrder(5)); // window [10-20)
                  } else if (hasTwoValues) { // key == 2
                    assertThat(values, containsInAnyOrder(4, 6)); // window [10-20)
                  } else { // key == 2
                    assertThat(values, containsInAnyOrder(2)); // window [0-10)
                  }
                  context.output(grouped);
                }
              }));
  pipeline.run();
}
 
Example #20
Source File: MultiStepCombine.java    From beam with Apache License 2.0 5 votes vote down vote up
/**
 * Combines inputs into accumulators, groups the accumulators by key and then merges them and
 * extracts the output.
 *
 * @param input the keyed inputs; its coder must be a {@code KvCoder}
 * @return the combined output per key
 * @throws IllegalArgumentException if the input coder is not a {@code KvCoder}
 * @throws IllegalStateException if no accumulator coder can be provided for {@code combineFn}
 */
@Override
public PCollection<KV<K, OutputT>> expand(PCollection<KV<K, InputT>> input) {
  checkArgument(
      input.getCoder() instanceof KvCoder,
      "Expected input to have a %s of type %s, got %s",
      Coder.class.getSimpleName(),
      KvCoder.class.getSimpleName(),
      input.getCoder());
  KvCoder<K, InputT> kvCoder = (KvCoder<K, InputT>) input.getCoder();
  Coder<K> keyCoder = kvCoder.getKeyCoder();
  Coder<InputT> valueCoder = kvCoder.getValueCoder();

  // The accumulator coder is derived from the value coder via the pipeline's coder registry.
  Coder<AccumT> accumulatorCoder;
  try {
    accumulatorCoder =
        combineFn.getAccumulatorCoder(input.getPipeline().getCoderRegistry(), valueCoder);
  } catch (CannotProvideCoderException e) {
    throw new IllegalStateException(
        String.format(
            "Could not construct an Accumulator Coder with the provided %s %s",
            CombineFn.class.getSimpleName(), combineFn),
        e);
  }

  return input
      .apply(
          ParDo.of(
              new CombineInputs<>(
                  combineFn, input.getWindowingStrategy().getTimestampCombiner(), keyCoder)))
      .setCoder(KvCoder.of(keyCoder, accumulatorCoder))
      .apply(GroupByKey.create())
      .apply(new MergeAndExtractAccumulatorOutput<>(combineFn, outputCoder));
}
 
Example #21
Source File: DirectGroupByKey.java    From beam with Apache License 2.0 5 votes vote down vote up
/**
 * Creates the primitive output collection of keyed work items for this transform.
 *
 * <p>The output lives in the input's pipeline, uses the globally-default windowing strategy,
 * keeps the input's boundedness, and is coded with a {@code KeyedWorkItemCoder} built from the
 * input's key coder, value coder and window coder.
 *
 * @param input the key/value collection being grouped
 * @return the primitive output collection
 */
@Override
public PCollection<KeyedWorkItem<K, V>> expand(PCollection<KV<K, V>> input) {
  return PCollection.createPrimitiveOutputInternal(
      input.getPipeline(),
      WindowingStrategy.globalDefault(),
      input.isBounded(),
      KeyedWorkItemCoder.of(
          // Key and value coders are extracted from the input's coder.
          GroupByKey.getKeyCoder(input.getCoder()),
          GroupByKey.getInputValueCoder(input.getCoder()),
          // The window coder still comes from the input's own window function.
          input.getWindowingStrategy().getWindowFn().windowCoder()));
}
 
Example #22
Source File: ViewOverrideFactory.java    From beam with Apache License 2.0 5 votes vote down vote up
/**
 * Gathers the input under a single {@code null} key and passes the grouped values to
 * {@code WriteView} for {@code view}.
 *
 * @param input the elements backing the view
 * @return {@code input} itself; the result of the {@code WriteView} branch is not returned
 */
@Override
public PCollection<ElemT> expand(final PCollection<ElemT> input) {
  input
      // Every element gets the same (null) Void key, so all elements share one key.
      .apply(WithKeys.of((Void) null))
      .setCoder(KvCoder.of(VoidCoder.of(), input.getCoder()))
      .apply(GroupByKey.create())
      // Drop the key again, keeping only the grouped values.
      .apply(Values.create())
      .apply(new WriteView<>(view));
  return input;
}
 
Example #23
Source File: DataflowPTransformMatchersTest.java    From beam with Apache License 2.0 5 votes vote down vote up
/**
 * Builds a minimal pipeline in which a {@link Combine.GroupedValues} follows a
 * {@link GroupByKey}.
 *
 * @return the constructed pipeline, with abandoned-node enforcement turned off
 */
private static TestPipeline createCombineGroupedValuesPipeline() {
  TestPipeline testPipeline = TestPipeline.create().enableAbandonedNodeEnforcement(false);

  // A single key/value pair with an explicit coder.
  PCollection<KV<String, Integer>> keyedInput =
      testPipeline
          .apply(Create.of(KV.of("key", 1)))
          .setCoder(KvCoder.of(StringUtf8Coder.of(), VarIntCoder.of()));

  PCollection<KV<String, Iterable<Integer>>> groupedInput =
      keyedInput.apply(GroupByKey.create());
  groupedInput.apply(Combine.groupedValues(new SumCombineFn()));

  return testPipeline;
}
 
Example #24
Source File: ViewEvaluatorFactoryTest.java    From beam with Apache License 2.0 5 votes vote down vote up
/**
 * Drives a {@code ViewEvaluatorFactory} evaluator by hand and checks that the view writer only
 * receives the processed elements once the bundle is finished.
 */
@Test
public void testInMemoryEvaluator() throws Exception {
  PCollection<String> words = p.apply(Create.of("foo", "bar"));
  PCollectionView<Iterable<String>> iterableView = words.apply(View.asIterable());

  // Gather all words under one null key and keep only the grouped values.
  PCollection<Iterable<String>> gathered =
      words
          .apply(WithKeys.of((Void) null))
          .setCoder(KvCoder.of(VoidCoder.of(), StringUtf8Coder.of()))
          .apply(GroupByKey.create())
          .apply(Values.create());
  PCollection<Iterable<String>> written =
      gathered.apply(new ViewOverrideFactory.WriteView<>(iterableView));

  // The mocked context hands out a recording view writer.
  EvaluationContext mockContext = mock(EvaluationContext.class);
  TestViewWriter<String, Iterable<String>> recordingWriter = new TestViewWriter<>();
  when(mockContext.createPCollectionViewWriter(gathered, iterableView))
      .thenReturn(recordingWriter);

  CommittedBundle<String> emptyInputBundle =
      bundleFactory.createBundle(words).commit(Instant.now());
  AppliedPTransform<?, ?, ?> writeViewProducer = DirectGraphs.getProducer(written);
  TransformEvaluator<Iterable<String>> viewEvaluator =
      new ViewEvaluatorFactory(mockContext).forApplication(writeViewProducer, emptyInputBundle);

  // Processing an element alone must not publish anything yet.
  viewEvaluator.processElement(
      WindowedValue.valueInGlobalWindow(ImmutableList.of("foo", "bar")));
  assertThat(recordingWriter.latest, nullValue());

  // Finishing the bundle publishes the individual elements.
  viewEvaluator.finishBundle();
  assertThat(
      recordingWriter.latest,
      containsInAnyOrder(
          WindowedValue.valueInGlobalWindow("foo"), WindowedValue.valueInGlobalWindow("bar")));
}
 
Example #25
Source File: KafkaExactlyOnceSink.java    From beam with Apache License 2.0 5 votes vote down vote up
/**
 * Reshards the records, assigns sequential ids and writes them with {@code ExactlyOnceWriter},
 * with a GroupByKey after the resharding and after the id assignment.
 *
 * <p>When the spec does not give a positive shard count, the number of partitions of the target
 * topic is used instead.
 *
 * @param input the producer records to write
 * @return the output of the final write step
 * @throws IllegalStateException if no positive shard count could be determined
 */
@Override
public PCollection<Void> expand(PCollection<ProducerRecord<K, V>> input) {

  int shardCount = spec.getNumShards();
  if (shardCount <= 0) {
    // No explicit shard count: use the topic's partition count.
    try (Consumer<?, ?> consumer = openConsumer(spec)) {
      shardCount = consumer.partitionsFor(spec.getTopic()).size();
      LOG.info(
          "Using {} shards for exactly-once writer, matching number of partitions "
              + "for topic '{}'",
          shardCount,
          spec.getTopic());
    }
  }
  checkState(shardCount > 0, "Could not set number of shards");

  final String reshardStepName = String.format("Shuffle across %d shards", shardCount);
  final String writeStepName = String.format("Write to Kafka topic '%s'", spec.getTopic());

  return input
      .apply(
          Window.<ProducerRecord<K, V>>into(new GlobalWindows()) // Everything into global window.
              .triggering(Repeatedly.forever(AfterPane.elementCountAtLeast(1)))
              .discardingFiredPanes())
      .apply(reshardStepName, ParDo.of(new Reshard<>(shardCount)))
      .apply("Persist sharding", GroupByKey.create())
      .apply("Assign sequential ids", ParDo.of(new Sequencer<>()))
      .apply("Persist ids", GroupByKey.create())
      .apply(writeStepName, ParDo.of(new ExactlyOnceWriter<>(spec, input.getCoder())));
}
 
Example #26
Source File: StreamingTransformTranslator.java    From beam with Apache License 2.0 5 votes vote down vote up
/**
 * Returns the evaluator that translates a {@link GroupByKey} over an unbounded dataset.
 *
 * <p>The evaluator borrows the transform's input dataset, derives the coders from the input
 * collection, delegates the grouping to
 * {@code SparkGroupAlsoByWindowViaWindowSet.groupByKeyAndWindow} and registers the resulting
 * stream as the transform's output dataset.
 */
private static <K, V, W extends BoundedWindow> TransformEvaluator<GroupByKey<K, V>> groupByKey() {
  return new TransformEvaluator<GroupByKey<K, V>>() {
    @Override
    public void evaluate(GroupByKey<K, V> transform, EvaluationContext context) {
      // The dataset registered for this transform's input is cast to an unbounded dataset.
      @SuppressWarnings("unchecked")
      UnboundedDataset<KV<K, V>> inputDataset =
          (UnboundedDataset<KV<K, V>>) context.borrowDataset(transform);
      List<Integer> streamSources = inputDataset.getStreamSources();
      JavaDStream<WindowedValue<KV<K, V>>> dStream = inputDataset.getDStream();
      // The input collection's coder is cast to a KvCoder to reach the key and value coders.
      final KvCoder<K, V> coder = (KvCoder<K, V>) context.getInput(transform).getCoder();
      @SuppressWarnings("unchecked")
      final WindowingStrategy<?, W> windowingStrategy =
          (WindowingStrategy<?, W>) context.getInput(transform).getWindowingStrategy();
      @SuppressWarnings("unchecked")
      final WindowFn<Object, W> windowFn = (WindowFn<Object, W>) windowingStrategy.getWindowFn();

      // --- coders.
      // Values are coded together with their window information.
      final WindowedValue.WindowedValueCoder<V> wvCoder =
          WindowedValue.FullWindowedValueCoder.of(coder.getValueCoder(), windowFn.windowCoder());

      JavaDStream<WindowedValue<KV<K, Iterable<V>>>> outStream =
          SparkGroupAlsoByWindowViaWindowSet.groupByKeyAndWindow(
              dStream,
              coder.getKeyCoder(),
              wvCoder,
              windowingStrategy,
              context.getSerializableOptions(),
              streamSources,
              context.getCurrentTransform().getFullName());

      // The output dataset carries the same stream sources as the input.
      context.putDataset(transform, new UnboundedDataset<>(outStream, streamSources));
    }

    @Override
    public String toNativeString() {
      return "groupByKey()";
    }
  };
}
 
Example #27
Source File: GroupByKeyTranslationTest.java    From beam with Apache License 2.0 5 votes vote down vote up
/**
 * Verifies that a translator is registered for {@link GroupByKey} by looking up its URN, which
 * is the only meaningful thing to do with a {@link GroupByKey}.
 */
@Test
public void testUrnRetrievable() throws Exception {
  String actualUrn = PTransformTranslation.urnForTransform(GroupByKey.create());

  assertThat(actualUrn, equalTo(GROUP_BY_KEY_TRANSFORM_URN));
}
 
Example #28
Source File: PipelineTranslationTest.java    From beam with Apache License 2.0 5 votes vote down vote up
/**
 * Supplies the pipelines used as test parameters.
 *
 * @return three pipelines: a trivial one, one whose ParDo declares a side input, and one with a
 *     custom coder, a custom windowing/triggering configuration and a GroupByKey
 */
@Parameters(name = "{index}")
public static Iterable<Pipeline> testPipelines() {
  Pipeline trivialPipeline = Pipeline.create();
  trivialPipeline.apply(Create.of(1, 2, 3));

  Pipeline sideInputPipeline = Pipeline.create();
  final PCollectionView<String> singletonView =
      sideInputPipeline.apply(Create.of("foo")).apply(View.asSingleton());
  sideInputPipeline
      .apply(Create.of("main input"))
      .apply(
          ParDo.of(
                  new DoFn<String, String>() {
                    @ProcessElement
                    public void process(ProcessContext c) {
                      // actually never executed and no effect on translation
                      c.sideInput(singletonView);
                    }
                  })
              .withSideInputs(singletonView));

  Pipeline complexPipeline = Pipeline.create();
  BigEndianLongCoder customCoder = BigEndianLongCoder.of();
  PCollection<Long> elems = complexPipeline.apply(GenerateSequence.from(0L).to(207L));
  PCollection<Long> counted = elems.apply(Count.globally()).setCoder(customCoder);
  PCollection<Long> windowed =
      counted.apply(
          Window.<Long>into(FixedWindows.of(Duration.standardMinutes(7)))
              .triggering(
                  AfterWatermark.pastEndOfWindow()
                      .withLateFirings(AfterPane.elementCountAtLeast(19)))
              .accumulatingFiredPanes()
              .withAllowedLateness(Duration.standardMinutes(3L)));
  PCollection<KV<String, Long>> keyed = windowed.apply(WithKeys.of("foo"));
  // The grouped output is not consumed further; the transform is applied only so that it
  // becomes part of the pipeline.
  keyed.apply(GroupByKey.create());

  return ImmutableList.of(trivialPipeline, sideInputPipeline, complexPipeline);
}
 
Example #29
Source File: GroupByKeyTranslatorBatch.java    From beam with Apache License 2.0 5 votes vote down vote up
/**
 * Translates a {@link GroupByKey} in two stages: the encoded input is first gathered by key,
 * then the gathered values are passed through {@code GroupByWindowFunction}. The result is
 * registered as the transform's output data set.
 *
 * @param transform the GroupByKey being translated
 * @param context supplies the input/output collections, their data sets and the options
 */
@Override
public void translateNode(GroupByKey<K, V> transform, Twister2BatchTranslationContext context) {
  PCollection<KV<K, V>> input = context.getInput(transform);
  BatchTSetImpl<WindowedValue<KV<K, V>>> inputTTSet = context.getInputDataSet(input);
  // The input coder is cast to a KvCoder to reach the key and value coders.
  final KvCoder<K, V> coder = (KvCoder<K, V>) input.getCoder();
  Coder<K> inputKeyCoder = coder.getKeyCoder();
  WindowingStrategy windowingStrategy = input.getWindowingStrategy();
  WindowFn<KV<K, V>, BoundedWindow> windowFn =
      (WindowFn<KV<K, V>, BoundedWindow>) windowingStrategy.getWindowFn();
  // Values are coded together with their window information.
  final WindowedValue.WindowedValueCoder<V> wvCoder =
      WindowedValue.FullWindowedValueCoder.of(coder.getValueCoder(), windowFn.windowCoder());
  // Keys and windowed values become byte-array tuples.
  KeyedTSet<byte[], byte[]> keyedTSet =
      inputTTSet.mapToTuple(new MapToTupleFunction<K, V>(inputKeyCoder, wvCoder));

  // todo add support for a partition function to be specified, this would use
  // todo keyedPartition function instead of KeyedGather
  ComputeTSet<KV<K, Iterable<WindowedValue<V>>>, Iterator<Tuple<byte[], Iterator<byte[]>>>>
      groupedbyKeyTset =
          keyedTSet.keyedGather().map(new ByteToWindowFunction(inputKeyCoder, wvCoder));

  // --- now group also by window.
  SystemReduceFnBuffering reduceFnBuffering = new SystemReduceFnBuffering(coder.getValueCoder());
  ComputeTSet<WindowedValue<KV<K, Iterable<V>>>, Iterable<KV<K, Iterator<WindowedValue<V>>>>>
      outputTset =
          groupedbyKeyTset
              .direct()
              .<WindowedValue<KV<K, Iterable<V>>>>flatmap(
                  new GroupByWindowFunction(
                      windowingStrategy, reduceFnBuffering, context.getOptions()));
  // Register the grouped data set under the transform's output collection.
  PCollection output = context.getOutput(transform);
  context.setOutputDataSet(output, outputTset);
}
 
Example #30
Source File: BreakFusionTransform.java    From dataflow-java with Apache License 2.0 5 votes vote down vote up
/**
 * Sends the input through {@code DummyMapFn}, a {@link GroupByKey} keyed by the element type
 * with {@code Integer} values, and a key extraction, yielding a collection of the same element
 * type as the input.
 *
 * @param input the collection to pass through the grouping
 * @return the keys of the grouped collection
 */
@Override
public PCollection<T> expand(PCollection<T> input) {
  return input
      // NOTE(review): presumably DummyMapFn emits each element as the key of a KV<T, Integer>,
      // as the GroupByKey type arguments below suggest; confirm against DummyMapFn.
      .apply("Break fusion mapper", ParDo.of(new DummyMapFn<T>()))
      .apply(GroupByKey.<T, Integer>create())
      // Only the keys of the grouped output are kept.
      .apply(Keys.<T>create());
}