org.apache.beam.sdk.transforms.WithKeys Java Examples

The following examples show how to use org.apache.beam.sdk.transforms.WithKeys. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: DataflowRunner.java    From beam with Apache License 2.0 6 votes vote down vote up
/**
 * Removes the record ids from the input after routing every element through a reshuffle.
 *
 * <p>Each element is keyed by a hash of its record id reduced into the range
 * {@code [0, NUM_RESHARD_KEYS)}, reshuffled, and finally unwrapped to the plain value.
 *
 * @param input values paired with their record ids
 * @return the unwrapped values, without keys or record ids
 */
@Override
public PCollection<T> expand(PCollection<ValueWithRecordId<T>> input) {
  return input
      .apply(
          WithKeys.of(
                  // Arrays.hashCode may be negative; floorMod keeps the shard key non-negative
                  // so that exactly NUM_RESHARD_KEYS distinct keys are produced (plain % would
                  // also emit negative keys).
                  (ValueWithRecordId<T> value) ->
                      Math.floorMod(Arrays.hashCode(value.getId()), NUM_RESHARD_KEYS))
              .withKeyType(TypeDescriptors.integers()))
      // Reshuffle will dedup based on ids in ValueWithRecordId by passing the data through
      // WindmillSink.
      .apply(Reshuffle.of())
      .apply(
          "StripIds",
          ParDo.of(
              new DoFn<KV<Integer, ValueWithRecordId<T>>, T>() {
                @ProcessElement
                public void processElement(ProcessContext c) {
                  // Drop both the shard key and the record id wrapper.
                  c.output(c.element().getValue().getValue());
                }
              }));
}
 
Example #2
Source File: TestStreamTest.java    From beam with Apache License 2.0 6 votes vote down vote up
/**
 * Feeds two elements stamped with the global window's maximum timestamp through fixed
 * windowing and a group-by-key, then asserts both come out in the fixed window that contains
 * that timestamp.
 */
@Test
@Category({NeedsRunner.class, UsesTestStream.class})
public void testElementsAtAlmostPositiveInfinity() {
  FixedWindows sixHourWindows = FixedWindows.of(Duration.standardHours(6));
  Instant lastInstant = GlobalWindow.INSTANCE.maxTimestamp();

  TestStream<String> lateStream =
      TestStream.create(StringUtf8Coder.of())
          .addElements(
              TimestampedValue.of("foo", lastInstant), TimestampedValue.of("bar", lastInstant))
          .advanceWatermarkToInfinity();

  // Put every element under one key, group, then flatten back to individual elements.
  PCollection<String> regrouped =
      p.apply(lateStream)
          .apply(into(sixHourWindows))
          .apply(WithKeys.of(1))
          .apply(GroupByKey.create())
          .apply(Values.create())
          .apply(Flatten.iterables());

  PAssert.that(regrouped)
      .inWindow(sixHourWindows.assignWindow(lastInstant))
      .containsInAnyOrder("foo", "bar");
  p.run();
}
 
Example #3
Source File: CloningBundleFactoryTest.java    From beam with Apache License 2.0 6 votes vote down vote up
/**
 * Adds two elements to a bundle created by the factory and checks that the committed bundle
 * holds elements that are equal to, but not the same instances as, the ones added.
 */
@Test
public void bundleWorkingCoderSucceedsClonesOutput() {
  PCollection<KV<String, Integer>> kvs =
      p.apply(Create.of(1, 3).withCoder(VarIntCoder.of()))
          .apply(WithKeys.of("foo"))
          .setCoder(KvCoder.of(StringUtf8Coder.of(), VarIntCoder.of()));

  WindowedValue<KV<String, Integer>> first = WindowedValue.valueInGlobalWindow(KV.of("foo", 1));
  WindowedValue<KV<String, Integer>> second =
      WindowedValue.valueInGlobalWindow(KV.of("foo", 3));

  CommittedBundle<KV<String, Integer>> committed =
      factory.createBundle(kvs).add(first).add(second).commit(Instant.now());

  // Equal by value ...
  assertThat(committed.getElements(), containsInAnyOrder(first, second));
  // ... but neither the windowed wrappers nor the wrapped KVs are the original instances.
  assertThat(
      committed.getElements(), not(containsInAnyOrder(theInstance(first), theInstance(second))));
  for (WindowedValue<KV<String, Integer>> element : committed.getElements()) {
    assertThat(
        element.getValue(),
        not(anyOf(theInstance(first.getValue()), theInstance(second.getValue()))));
  }
  assertThat(committed.getPCollection(), equalTo(kvs));
}
 
Example #4
Source File: CloningBundleFactoryTest.java    From beam with Apache License 2.0 6 votes vote down vote up
/**
 * Adds one grouped element to a keyed bundle created by the factory and checks that the
 * committed bundle holds an equal element that is not the same instance, and that the bundle
 * reports the expected PCollection and key.
 */
@Test
public void keyedBundleWorkingCoderSucceedsClonesOutput() {
  PCollection<KV<String, Iterable<Integer>>> keyed =
      p.apply(Create.of(1, 3).withCoder(VarIntCoder.of()))
          .apply(WithKeys.of("foo"))
          .setCoder(KvCoder.of(StringUtf8Coder.of(), VarIntCoder.of()))
          .apply(GroupByKey.create());

  WindowedValue<KV<String, Iterable<Integer>>> grouped =
      WindowedValue.valueInGlobalWindow(
          KV.<String, Iterable<Integer>>of("foo", ImmutableList.of(1, 3)));

  CommittedBundle<KV<String, Iterable<Integer>>> committed =
      factory
          .createKeyedBundle(StructuralKey.of("foo", StringUtf8Coder.of()), keyed)
          .add(grouped)
          .commit(Instant.now());

  assertThat(committed.getElements(), containsInAnyOrder(grouped));
  // The single stored value must be a different instance from the one that was added.
  assertThat(
      Iterables.getOnlyElement(committed.getElements()).getValue(),
      not(theInstance(grouped.getValue())));
  assertThat(committed.getPCollection(), equalTo(keyed));
  assertThat(committed.getKey(), equalTo(StructuralKey.of("foo", StringUtf8Coder.of())));
}
 
Example #5
Source File: WriteFiles.java    From beam with Apache License 2.0 6 votes vote down vote up
/**
 * Collects the write results into lists ready for finalization.
 *
 * <p>For non-windowed writes the results are delivered through a list side input; for windowed
 * writes they are reshuffled under a single void key and gathered per window.
 *
 * @param input the individual write results
 * @return lists of results to finalize
 */
@Override
public PCollection<List<ResultT>> expand(PCollection<ResultT> input) {
  if (!getWindowedWrites()) {
    // Pass results via a side input rather than reshuffle, because we need to get an empty
    // iterable to finalize if there are no results.
    return input
        .getPipeline()
        .apply(Reify.viewInGlobalWindow(input.apply(View.asList()), ListCoder.of(resultCoder)));
  }

  // Windowed writes: reshuffle the results to make them stable against retries.
  // A single void key is used to maximize the size of bundles for finalization.
  return input
      .apply("Add void key", WithKeys.of((Void) null))
      .apply("Reshuffle", Reshuffle.of())
      .apply("Drop key", Values.create())
      .apply("Gather bundles", ParDo.of(new GatherBundlesPerWindowFn<>()))
      .setCoder(ListCoder.of(resultCoder))
      // Reshuffle one more time to stabilize the contents of the bundle lists to finalize.
      .apply(Reshuffle.viaRandomKey());
}
 
Example #6
Source File: BigQueryToTableIT.java    From beam with Apache License 2.0 6 votes vote down vote up
/**
 * Builds and runs a pipeline that reads the rows returned by a BigQuery query and writes them
 * to the configured output table, blocking until the pipeline finishes.
 *
 * @param options supplies the query, the standard-SQL and reshuffle flags, and the output
 *     table and schema
 */
private void runBigQueryToTablePipeline(BigQueryToTableOptions options) {
  Pipeline p = Pipeline.create(options);

  BigQueryIO.Read queryRead = BigQueryIO.read().fromQuery(options.getQuery());
  BigQueryIO.Read bigQueryRead =
      options.getUsingStandardSql() ? queryRead.usingStandardSql() : queryRead;

  PCollection<TableRow> rows = p.apply(bigQueryRead);
  if (options.getReshuffle()) {
    // Optionally route the rows through a reshuffle under a single void key.
    rows =
        rows.apply(WithKeys.<Void, TableRow>of((Void) null))
            .setCoder(KvCoder.of(VoidCoder.of(), TableRowJsonCoder.of()))
            .apply(Reshuffle.<Void, TableRow>of())
            .apply(Values.<TableRow>create());
  }

  rows.apply(
      BigQueryIO.writeTableRows()
          .to(options.getOutput())
          .withSchema(options.getOutputSchema())
          .withCreateDisposition(BigQueryIO.Write.CreateDisposition.CREATE_IF_NEEDED));

  p.run().waitUntilFinish();
}
 
Example #7
Source File: GatherAllPanes.java    From beam with Apache License 2.0 6 votes vote down vote up
/**
 * Gathers every element of the input, reified with its window information, into iterables
 * produced by a single-key group-by-key.
 *
 * <p>The elements are re-windowed with an {@code IdentityWindowFn} built from the original
 * window coder, using a {@code Never.ever()} trigger, the input's allowed lateness, and
 * discarding fired panes. The output is given the input's windowing strategy again.
 *
 * @param input the collection whose elements should be gathered
 * @return iterables of the reified input elements
 */
@Override
public PCollection<Iterable<ValueInSingleWindow<T>>> expand(PCollection<T> input) {
  // Captured up front so its window coder can be reused by the IdentityWindowFn below.
  WindowFn<?, ?> originalWindowFn = input.getWindowingStrategy().getWindowFn();

  return input
      // Attach window information to each value.
      .apply(Reify.windows())
      // Put every element under the same constant key (0).
      .apply(
          WithKeys.<Integer, ValueInSingleWindow<T>>of(0)
              .withKeyType(new TypeDescriptor<Integer>() {}))
      .apply(
          Window.into(
                  new IdentityWindowFn<KV<Integer, ValueInSingleWindow<T>>>(
                      originalWindowFn.windowCoder()))
              .triggering(Never.ever())
              .withAllowedLateness(input.getWindowingStrategy().getAllowedLateness())
              .discardingFiredPanes())
      // all values have the same key so they all appear as a single output element
      .apply(GroupByKey.create())
      .apply(Values.create())
      // Restore the windowing strategy of the original input on the result.
      .setWindowingStrategyInternal(input.getWindowingStrategy());
}
 
Example #8
Source File: CreateStreamTest.java    From beam with Apache License 2.0 6 votes vote down vote up
/**
 * Emits one batch of two elements stamped with the global window's maximum timestamp, runs
 * them through fixed windowing and a group-by-key, and asserts both come out in the fixed
 * window that contains that timestamp.
 */
@Test
public void testElementsAtAlmostPositiveInfinity() throws IOException {
  FixedWindows sixHourWindows = FixedWindows.of(Duration.standardHours(6));
  Instant lastInstant = GlobalWindow.INSTANCE.maxTimestamp();

  CreateStream<String> lateBatch =
      CreateStream.of(StringUtf8Coder.of(), batchDuration())
          .nextBatch(
              TimestampedValue.of("foo", lastInstant), TimestampedValue.of("bar", lastInstant))
          .advanceNextBatchWatermarkToInfinity();

  // Put every element under one key, group, then flatten back to individual elements.
  PCollection<String> regrouped =
      p.apply(lateBatch)
          .apply(Window.into(sixHourWindows))
          .apply(WithKeys.of(1))
          .apply(GroupByKey.create())
          .apply(Values.create())
          .apply(Flatten.iterables());

  PAssert.that(regrouped)
      .inWindow(sixHourWindows.assignWindow(lastInstant))
      .containsInAnyOrder("foo", "bar");
  p.run();
}
 
Example #9
Source File: AnnotateImages.java    From beam with Apache License 2.0 6 votes vote down vote up
/**
 * Applies all necessary transforms to call the Vision API. In order to group requests into
 * batches, we assign keys to the requests, as {@link GroupIntoBatches} works only on {@link KV}s.
 *
 * <p>Each request receives a random key in {@code [0, desiredRequestParallelism)}; requests
 * are then grouped into batches of {@code batchSize} and each batch is annotated.
 *
 * @param input the elements to convert into annotation requests
 * @return one list of responses per processed batch
 */
@Override
public PCollection<List<AnnotateImageResponse>> expand(PCollection<T> input) {
  ParDo.SingleOutput<T, AnnotateImageRequest> inputToRequestMapper;
  if (contextSideInput != null) {
    inputToRequestMapper =
        ParDo.of(new MapInputToRequest(contextSideInput)).withSideInputs(contextSideInput);
  } else {
    inputToRequestMapper = ParDo.of(new MapInputToRequest(null));
  }
  return input
      .apply(inputToRequestMapper)
      .apply(
          WithKeys.of(
                  // ThreadLocalRandom avoids allocating and seeding a new Random for every
                  // single element; it is looked up at call time, so nothing random-related
                  // is captured by the serialized function.
                  (SerializableFunction<AnnotateImageRequest, Integer>)
                      ignored ->
                          java.util.concurrent.ThreadLocalRandom.current()
                              .nextInt(desiredRequestParallelism))
              .withKeyType(TypeDescriptors.integers()))
      .apply(GroupIntoBatches.ofSize(batchSize))
      .apply(ParDo.of(new PerformImageAnnotation()));
}
 
Example #10
Source File: ApproximateDistinctTest.java    From beam with Apache License 2.0 6 votes vote down vote up
/**
 * Runs the per-key approximate distinct count over a shuffled stream in which each of the
 * values {@code 1..cardinality} occurs twice, and verifies the estimate with
 * {@code VerifyAccuracy}.
 */
@Test
public void perKey() {
  final int cardinality = 1000;
  final int precision = 15;
  // NOTE(review): the tolerance is derived from the precision value itself — confirm this is
  // the intended bound.
  final double expectedErr = 1.04 / Math.sqrt(precision);

  // Every distinct value is inserted twice before shuffling.
  List<Integer> stream = new ArrayList<>();
  for (int value = 1; value <= cardinality; value++) {
    stream.add(value);
    stream.add(value);
  }
  Collections.shuffle(stream);

  PCollection<Long> estimates =
      tp.apply("per key stream", Create.of(stream))
          .apply("create keys", WithKeys.of(1))
          .apply(
              "per key cardinality",
              ApproximateDistinct.<Integer, Integer>perKey().withPrecision(precision))
          .apply("extract values", Values.create());

  PAssert.that("Verify Accuracy for cardinality per key", estimates)
      .satisfies(new VerifyAccuracy(cardinality, expectedErr));

  tp.run();
}
 
Example #11
Source File: DirectTransformExecutorTest.java    From beam with Apache License 2.0 6 votes vote down vote up
/**
 * Initializes the mocks, the executor collaborators, and a small two-step pipeline
 * (a create followed by a constant-key transform) whose producers are looked up in the direct
 * graph.
 */
@Before
public void setup() {
  MockitoAnnotations.initMocks(this);
  when(evaluationContext.getMetrics()).thenReturn(metrics);

  // Executor collaborators.
  bundleFactory = ImmutableListBundleFactory.create();
  transformEvaluationState =
      TransformExecutorServices.parallel(MoreExecutors.newDirectExecutorService());
  evaluatorCompleted = new CountDownLatch(1);
  completionCallback = new RegisteringCompletionCallback(evaluatorCompleted);

  // Pipeline under test: created -> keyed.
  created = p.apply(Create.of("foo", "spam", "third"));
  PCollection<KV<Integer, String>> keyedDownstream = created.apply(WithKeys.of(3));

  DirectGraphs.performDirectOverrides(p);
  DirectGraph directGraph = DirectGraphs.getGraph(p);
  createdProducer = directGraph.getProducer(created);
  downstreamProducer = directGraph.getProducer(keyedDownstream);
}
 
Example #12
Source File: QueryablePipelineTest.java    From beam with Apache License 2.0 6 votes vote down vote up
/**
 * Translates a read-then-group pipeline to its proto form and checks that the read transform
 * reports the Java SDK harness environment while the group-by-key transform reports none.
 */
@Test
public void getEnvironmentWithEnvironment() {
  Pipeline p = Pipeline.create();
  PCollection<Long> numbers = p.apply("BoundedRead", Read.from(CountingSource.upTo(100L)));
  numbers.apply(WithKeys.of("a")).apply("groupByKey", GroupByKey.create());

  Components components = PipelineTranslation.toProto(p).getComponents();
  QueryablePipeline queryable = QueryablePipeline.forPrimitivesIn(components);

  PTransformNode groupNode =
      PipelineNode.pTransform("groupByKey", components.getTransformsOrThrow("groupByKey"));
  PTransformNode readNode =
      PipelineNode.pTransform("BoundedRead", components.getTransformsOrThrow("BoundedRead"));

  // The read has an environment, and it matches the Java SDK harness one.
  assertThat(queryable.getEnvironment(readNode).isPresent(), is(true));
  assertThat(
      queryable.getEnvironment(readNode).get().getUrn(),
      equalTo(Environments.JAVA_SDK_HARNESS_ENVIRONMENT.getUrn()));
  assertThat(
      queryable.getEnvironment(readNode).get().getPayload(),
      equalTo(Environments.JAVA_SDK_HARNESS_ENVIRONMENT.getPayload()));
  // The group-by-key has none.
  assertThat(queryable.getEnvironment(groupNode).isPresent(), is(false));
}
 
Example #13
Source File: BigQueryMergerTest.java    From DataflowTemplates with Apache License 2.0 6 votes vote down vote up
/**
 * Keys a single {@code MergeInfo} by its replica table, passes it through
 * {@code TriggerPerKeyOnFixedIntervals}, and asserts the keyed value is emitted.
 */
@Test
public void testAutoValueMergeInfoClass() throws Exception {
  MergeInfo info =
      MergeInfo.create(
          TIMESTAMP_META_FIELD,
          DELETED_META_FIELD,
          TABLE_1,
          TABLE_2,
          FULL_COLUMN_LIST,
          PRIMARY_KEY_COLUMNS);

  PCollection<KV<String, MergeInfo>> triggered =
      pipeline
          .apply(Create.of(info))
          .apply(
              WithKeys.<String, MergeInfo>of(MergeInfo::getReplicaTable)
                  .withKeyType(TypeDescriptors.strings()))
          .apply(
              new TriggerPerKeyOnFixedIntervals<>(Duration.standardMinutes(WINDOW_SIZE_MINUTES)));

  PAssert.that(triggered).containsInAnyOrder(KV.of(info.getReplicaTable(), info));
  pipeline.run().waitUntilFinish();
}
 
Example #14
Source File: PipelineTranslationTest.java    From beam with Apache License 2.0 5 votes vote down vote up
/**
 * Builds the pipelines used as test parameters.
 *
 * <p>Three pipelines are returned: a trivial one with a single create, one whose ParDo reads a
 * singleton side input, and a more complex one combining a custom coder, non-default windowing
 * and triggering, keying, and a group-by-key.
 *
 * @return the pipelines to run the parameterized tests against
 */
@Parameters(name = "{index}")
public static Iterable<Pipeline> testPipelines() {
  Pipeline trivialPipeline = Pipeline.create();
  trivialPipeline.apply(Create.of(1, 2, 3));

  Pipeline sideInputPipeline = Pipeline.create();
  final PCollectionView<String> singletonView =
      sideInputPipeline.apply(Create.of("foo")).apply(View.asSingleton());
  sideInputPipeline
      .apply(Create.of("main input"))
      .apply(
          ParDo.of(
                  new DoFn<String, String>() {
                    @ProcessElement
                    public void process(ProcessContext c) {
                      // actually never executed and no effect on translation
                      c.sideInput(singletonView);
                    }
                  })
              .withSideInputs(singletonView));

  Pipeline complexPipeline = Pipeline.create();
  BigEndianLongCoder customCoder = BigEndianLongCoder.of();
  PCollection<Long> elems = complexPipeline.apply(GenerateSequence.from(0L).to(207L));
  PCollection<Long> counted = elems.apply(Count.globally()).setCoder(customCoder);
  PCollection<Long> windowed =
      counted.apply(
          Window.<Long>into(FixedWindows.of(Duration.standardMinutes(7)))
              .triggering(
                  AfterWatermark.pastEndOfWindow()
                      .withLateFirings(AfterPane.elementCountAtLeast(19)))
              .accumulatingFiredPanes()
              .withAllowedLateness(Duration.standardMinutes(3L)));
  // The keyed and grouped collections only need to exist in the pipeline graph; nothing reads
  // them afterwards, so they are not bound to local variables.
  windowed.apply(WithKeys.of("foo")).apply(GroupByKey.create());

  return ImmutableList.of(trivialPipeline, sideInputPipeline, complexPipeline);
}
 
Example #15
Source File: SplittableParDo.java    From beam with Apache License 2.0 5 votes vote down vote up
/**
 * Expands the splittable ParDo into restriction pairing, restriction splitting, window
 * explosion, unique keying, and finally a {@code ProcessKeyedElements} transform.
 *
 * @param input the elements to process with the splittable {@code doFn}
 * @return the outputs produced by the "ProcessKeyedElements" step
 */
@Override
public PCollectionTuple expand(PCollection<InputT> input) {
  // Ask the DoFn (through its invoker) for the coders of its restriction and of its
  // watermark estimator state, resolving them against the pipeline's coder registry.
  Coder<RestrictionT> restrictionCoder =
      DoFnInvokers.invokerFor(doFn)
          .invokeGetRestrictionCoder(input.getPipeline().getCoderRegistry());
  Coder<WatermarkEstimatorStateT> watermarkEstimatorStateCoder =
      DoFnInvokers.invokerFor(doFn)
          .invokeGetWatermarkEstimatorStateCoder(input.getPipeline().getCoderRegistry());
  // Coder for (element, restriction) pairs, used by both the pairing and the splitting step.
  Coder<KV<InputT, RestrictionT>> splitCoder = KvCoder.of(input.getCoder(), restrictionCoder);

  PCollection<KV<byte[], KV<InputT, RestrictionT>>> keyedRestrictions =
      input
          .apply(
              "Pair with initial restriction",
              ParDo.of(new PairWithRestrictionFn<InputT, OutputT, RestrictionT>(doFn)))
          .setCoder(splitCoder)
          .apply("Split restriction", ParDo.of(new SplitRestrictionFn<>(doFn)))
          .setCoder(splitCoder)
          // ProcessFn requires all input elements to be in a single window and have a single
          // element per work item. This must precede the unique keying so each key has a single
          // associated element.
          .apply("Explode windows", ParDo.of(new ExplodeWindowsFn<>()))
          .apply("Assign unique key", WithKeys.of(new RandomUniqueKeyFn<>()));

  return keyedRestrictions.apply(
      "ProcessKeyedElements",
      new ProcessKeyedElements<>(
          doFn,
          input.getCoder(),
          restrictionCoder,
          watermarkEstimatorStateCoder,
          // NOTE(review): unchecked narrowing of the windowing strategy's element type to
          // InputT — confirm this is safe for every caller.
          (WindowingStrategy<InputT, ?>) input.getWindowingStrategy(),
          sideInputs,
          mainOutputTag,
          additionalOutputTags,
          outputTagsToCoders));
}
 
Example #16
Source File: ViewEvaluatorFactoryTest.java    From beam with Apache License 2.0 5 votes vote down vote up
/**
 * Checks that the view evaluator only hands elements to the view writer when the bundle is
 * finished: nothing is written after {@code processElement}, and both elements are written
 * after {@code finishBundle}.
 */
@Test
public void testInMemoryEvaluator() throws Exception {
  PCollection<String> input = p.apply(Create.of("foo", "bar"));
  PCollectionView<Iterable<String>> iterableView = input.apply(View.asIterable());

  // Group the whole input under a single void key and keep only the grouped values.
  PCollection<Iterable<String>> grouped =
      input
          .apply(WithKeys.of((Void) null))
          .setCoder(KvCoder.of(VoidCoder.of(), StringUtf8Coder.of()))
          .apply(GroupByKey.create())
          .apply(Values.create());
  PCollection<Iterable<String>> written =
      grouped.apply(new ViewOverrideFactory.WriteView<>(iterableView));

  TestViewWriter<String, Iterable<String>> writer = new TestViewWriter<>();
  EvaluationContext context = mock(EvaluationContext.class);
  when(context.createPCollectionViewWriter(grouped, iterableView)).thenReturn(writer);

  CommittedBundle<String> inputBundle = bundleFactory.createBundle(input).commit(Instant.now());
  AppliedPTransform<?, ?, ?> producer = DirectGraphs.getProducer(written);
  TransformEvaluator<Iterable<String>> evaluator =
      new ViewEvaluatorFactory(context).forApplication(producer, inputBundle);

  // Processing alone must not publish anything to the view writer.
  evaluator.processElement(WindowedValue.valueInGlobalWindow(ImmutableList.of("foo", "bar")));
  assertThat(writer.latest, nullValue());

  // Finishing the bundle publishes both elements.
  evaluator.finishBundle();
  assertThat(
      writer.latest,
      containsInAnyOrder(
          WindowedValue.valueInGlobalWindow("foo"), WindowedValue.valueInGlobalWindow("bar")));
}
 
Example #17
Source File: SideInputContainerTest.java    From beam with Apache License 2.0 5 votes vote down vote up
/**
 * Initializes the mocks, derives a map view, a singleton view, and an iterable view from one
 * base collection, and creates the side input container over those three views.
 */
@Before
public void setup() {
  MockitoAnnotations.initMocks(this);

  PCollection<Integer> baseCollection =
      pipeline.apply("forBaseCollection", Create.of(1, 2, 3, 4));

  // Map view: every element keyed by the same constant string.
  mapView =
      baseCollection
          .apply("forKeyTypes", WithKeys.of("foo"))
          .apply("asMapView", View.asMap());
  // Singleton view: the mean of the elements.
  singletonView =
      baseCollection.apply("forCombinedTypes", Mean.<Integer>globally().asSingletonView());
  // Iterable view over the raw elements.
  iterableView = baseCollection.apply("asIterableView", View.asIterable());

  container =
      SideInputContainer.create(context, ImmutableList.of(iterableView, mapView, singletonView));
}
 
Example #18
Source File: TestPubsubSignal.java    From beam with Apache License 2.0 5 votes vote down vote up
/**
 * Places all input elements in the global window under one constant key, runs them through a
 * {@code StatefulPredicateCheck}, and writes the resulting strings to the result topic.
 *
 * @param input the observed events to check
 * @return the output of the Pub/Sub write step
 */
@Override
public POutput expand(PCollection<? extends T> input) {
  return input
      // assign a dummy key and global window,
      // this is needed to accumulate all observed events in the same state cell
      .apply(Window.into(new GlobalWindows()))
      .apply(WithKeys.of("dummyKey"))
      .apply(
          "checkAllEventsForSuccess",
          ParDo.of(new StatefulPredicateCheck<>(coder, formatter, successPredicate)))
      // signal the success/failure to the result topic
      .apply("publishSuccess", PubsubIO.writeStrings().to(resultTopicPath.getPath()));
}
 
Example #19
Source File: WriteTables.java    From beam with Apache License 2.0 5 votes vote down vote up
/**
 * Runs {@code WriteTablesDoFn} over the input and schedules clean-up of the temporary files
 * it reports.
 *
 * @param input sharded destinations, each paired with a list of strings handed to the DoFn
 * @return the main output of the write step
 */
@Override
public PCollection<KV<TableDestination, String>> expand(
    PCollection<KV<ShardedKey<DestinationT>, List<String>>> input) {
  PCollectionTuple writeTablesOutputs =
      input.apply(
          ParDo.of(new WriteTablesDoFn())
              .withSideInputs(sideInputs)
              .withOutputTags(mainOutputTag, TupleTagList.of(temporaryFilesTag)));

  PCollection<String> temporaryFiles =
      writeTablesOutputs.get(temporaryFilesTag).setCoder(StringUtf8Coder.of());

  // Garbage collect temporary files.
  // Clean-up must not begin before WriteTablesDoFn is known to have loaded the files and will
  // not be retried. Otherwise a partially completed deletion followed by a retry of
  // WriteTablesDoFn would hit missing files, and the workflow would either fail or get stuck
  // (depending on how the runner handles persistent failures).
  temporaryFiles
      .apply(WithKeys.of((Void) null))
      .setCoder(KvCoder.of(VoidCoder.of(), StringUtf8Coder.of()))
      .apply(
          Window.<KV<Void, String>>into(new GlobalWindows())
              .triggering(Repeatedly.forever(AfterPane.elementCountAtLeast(1)))
              .discardingFiredPanes())
      .apply(GroupByKey.create())
      .apply(Values.create())
      .apply(ParDo.of(new GarbageCollectTemporaryFiles()));

  return writeTablesOutputs.get(mainOutputTag);
}
 
Example #20
Source File: ViewOverrideFactory.java    From beam with Apache License 2.0 5 votes vote down vote up
/**
 * Groups the whole input under a single void key and hands the grouped values to
 * {@code WriteView}; the input itself is returned unchanged.
 *
 * @param input the elements backing the view
 * @return the same {@code input} collection
 */
@Override
public PCollection<ElemT> expand(final PCollection<ElemT> input) {
  KvCoder<Void, ElemT> voidKeyedCoder = KvCoder.of(VoidCoder.of(), input.getCoder());

  // Side branch: key by null, group, drop the key, and write the view.
  input
      .apply(WithKeys.of((Void) null))
      .setCoder(voidKeyedCoder)
      .apply(GroupByKey.create())
      .apply(Values.create())
      .apply(new WriteView<>(view));

  return input;
}
 
Example #21
Source File: GatherAllPanesTest.java    From beam with Apache License 2.0 5 votes vote down vote up
/**
 * With a trigger that fires only once the watermark passes the end of the window, every
 * window gathered by {@code GatherAllPanes} is expected to contain exactly one pane.
 */
@Test
@Category(NeedsRunner.class)
public void singlePaneSingleReifiedPane() {
  PCollection<Iterable<ValueInSingleWindow<Iterable<Long>>>> gatheredPanes =
      p.apply(GenerateSequence.from(0).to(20000))
          .apply(WithTimestamps.of(value -> new Instant(value * 10)))
          .apply(
              Window.<Long>into(FixedWindows.of(Duration.standardMinutes(1)))
                  .triggering(AfterWatermark.pastEndOfWindow())
                  .withAllowedLateness(Duration.ZERO)
                  .discardingFiredPanes())
          .apply(WithKeys.<Void, Long>of((Void) null).withKeyType(new TypeDescriptor<Void>() {}))
          .apply(GroupByKey.create())
          .apply(Values.create())
          .apply(GatherAllPanes.globally());

  PAssert.that(gatheredPanes)
      .satisfies(
          allWindows -> {
            for (Iterable<ValueInSingleWindow<Iterable<Long>>> panes : allWindows) {
              // fail() throws, so the first offending window ends the check.
              if (Iterables.size(panes) > 1) {
                fail("Expected all windows to have exactly one pane, got " + panes);
              }
            }
            return null;
          });

  p.run();
}
 
Example #22
Source File: WatermarkManagerTest.java    From beam with Apache License 2.0 5 votes vote down vote up
/**
 * Builds the pipeline shared by the tests (a create, a filter, a doubling ParDo, a keyed
 * branch, and a flatten), then creates the mock clock, the direct graph, the watermark
 * manager, and the bundle factory.
 */
@Before
public void setup() {
  createdInts = p.apply("createdInts", Create.of(1, 2, 3));

  filtered = createdInts.apply("filtered", Filter.greaterThan(1));

  // Emits each filtered element multiplied by two.
  DoFn<Integer, Integer> doubler =
      new DoFn<Integer, Integer>() {
        @ProcessElement
        public void processElement(ProcessContext c) throws Exception {
          c.output(c.element() * 2);
        }
      };
  filteredTimesTwo = filtered.apply("timesTwo", ParDo.of(doubler));

  keyed = createdInts.apply("keyed", WithKeys.of("MyKey"));

  intsToFlatten = p.apply("intsToFlatten", Create.of(-1, 256, 65535));
  PCollectionList<Integer> flattenInputs = PCollectionList.of(createdInts).and(intsToFlatten);
  flattened = flattenInputs.apply("flattened", Flatten.pCollections());

  clock = MockClock.fromInstant(new Instant(1000));
  DirectGraphs.performDirectOverrides(p);
  graph = DirectGraphs.getGraph(p);

  manager = WatermarkManager.create(clock, graph, AppliedPTransform::getFullName);
  bundleFactory = ImmutableListBundleFactory.create();
}
 
Example #23
Source File: PAssertTest.java    From beam with Apache License 2.0 5 votes vote down vote up
/**
 * Basic test for {@code isEqualTo}: two timestamped elements fall into adjacent 500 ms fixed
 * windows, and each window's only pane is asserted to hold its single element.
 */
@Test
@Category({
  ValidatesRunner.class,
  UsesStatefulParDo.class // This test fails if State is unsupported despite no direct usage.
})
public void testWindowedIsEqualTo() throws Exception {
  IntervalWindow positiveWindow = new IntervalWindow(new Instant(0L), new Instant(500L));
  IntervalWindow negativeWindow = new IntervalWindow(new Instant(-500L), new Instant(0L));

  PCollection<Integer> regrouped =
      pipeline
          .apply(
              Create.timestamped(
                  TimestampedValue.of(43, new Instant(250L)),
                  TimestampedValue.of(22, new Instant(-250L))))
          .apply(Window.into(FixedWindows.of(Duration.millis(500L))))
          // Materialize final panes to be able to check for single element ON_TIME panes,
          // elements might be in EARLY panes otherwise.
          .apply(WithKeys.of(0))
          .apply(GroupByKey.create())
          .apply(
              ParDo.of(
                  new DoFn<KV<Integer, Iterable<Integer>>, Integer>() {
                    @ProcessElement
                    public void processElement(ProcessContext ctxt) {
                      // Re-emit every grouped value on its own.
                      for (Integer value : ctxt.element().getValue()) {
                        ctxt.output(value);
                      }
                    }
                  }));

  PAssert.thatSingleton(regrouped).inOnlyPane(positiveWindow).isEqualTo(43);
  PAssert.thatSingleton(regrouped).inOnlyPane(negativeWindow).isEqualTo(22);
  pipeline.run();
}
 
Example #24
Source File: GatherAllPanesTest.java    From beam with Apache License 2.0 5 votes vote down vote up
/**
 * With early firings after every element, at least one window gathered by
 * {@code GatherAllPanes} is expected to contain more than one pane.
 */
@Test
@Category(NeedsRunner.class)
public void multiplePanesMultipleReifiedPane() {
  PCollection<Long> firstLongs = p.apply("someLongs", GenerateSequence.from(0).to(20000));
  PCollection<Long> secondLongs = p.apply("otherLongs", GenerateSequence.from(0).to(20000));

  PCollection<Iterable<ValueInSingleWindow<Iterable<Long>>>> gatheredPanes =
      PCollectionList.of(firstLongs)
          .and(secondLongs)
          .apply(Flatten.pCollections())
          .apply(WithTimestamps.of(value -> new Instant(value * 10)))
          .apply(
              Window.<Long>into(FixedWindows.of(Duration.standardMinutes(1)))
                  .triggering(
                      AfterWatermark.pastEndOfWindow()
                          .withEarlyFirings(AfterPane.elementCountAtLeast(1)))
                  .withAllowedLateness(Duration.ZERO)
                  .discardingFiredPanes())
          .apply(WithKeys.<Void, Long>of((Void) null).withKeyType(new TypeDescriptor<Void>() {}))
          .apply(GroupByKey.create())
          .apply(Values.create())
          .apply(GatherAllPanes.globally());

  PAssert.that(gatheredPanes)
      .satisfies(
          allWindows -> {
            // Stop scanning at the first window that holds more than one pane.
            boolean sawMultiplePanes = false;
            for (Iterable<ValueInSingleWindow<Iterable<Long>>> panes : allWindows) {
              if (Iterables.size(panes) > 1) {
                sawMultiplePanes = true;
                break;
              }
            }
            if (!sawMultiplePanes) {
              fail("Expected at least one window to have multiple panes");
            }
            return null;
          });

  p.run();
}
 
Example #25
Source File: Group.java    From beam with Apache License 2.0 5 votes vote down vote up
/**
 * Converts the input to rows, keys each row by the fields chosen through the field access
 * descriptor, and groups the rows by that key.
 *
 * @param input a collection with a schema
 * @return the rows grouped by their selected key row
 */
@Override
public PCollection<KV<Row, Iterable<Row>>> expand(PCollection<InputT> input) {
  Schema inputSchema = input.getSchema();
  FieldAccessDescriptor resolvedFields = getFieldAccessDescriptor().resolve(inputSchema);
  rowSelector = new RowSelectorContainer(inputSchema, resolvedFields, true);
  Schema keySchema = getKeySchema(inputSchema);

  // Keys are encoded with the key schema, values with the full input schema.
  KvCoder<Row, Row> keyedRowCoder =
      KvCoder.of(SchemaCoder.of(keySchema), SchemaCoder.of(inputSchema));

  return input
      .apply("toRow", Convert.toRows())
      .apply(
          "selectKeys",
          WithKeys.of((Row row) -> rowSelector.select(row)).withKeyType(TypeDescriptors.rows()))
      .setCoder(keyedRowCoder)
      .apply("GroupByKey", GroupByKey.create());
}
 
Example #26
Source File: Group.java    From beam with Apache License 2.0 5 votes vote down vote up
/**
 * Groups the whole input by assigning every element the same null key, grouping by that key,
 * and keeping only the grouped values.
 *
 * @param input the elements to group
 * @return the grouped elements as iterables
 */
@Override
public PCollection<Iterable<InputT>> expand(PCollection<InputT> input) {
  return input
      // Every element shares one (null, Void-typed) key.
      .apply("addNullKey", WithKeys.of((Void) null))
      .apply("group", GroupByKey.create())
      // Drop the key, leaving only the iterable of grouped elements.
      .apply("extractValues", Values.create());
}
 
Example #27
Source File: BeamSortRel.java    From beam with Apache License 2.0 5 votes vote down vote up
/**
 * Places every element under one constant key and applies {@code LimitFn} to the keyed
 * collection.
 *
 * @param input the rows to limit
 * @return the output of the {@code LimitFn} ParDo
 */
@Override
public PCollection<T> expand(PCollection<T> input) {
  KvCoder<String, T> keyedCoder = KvCoder.of(StringUtf8Coder.of(), input.getCoder());

  // All elements share the same key, "DummyKey".
  PCollection<KV<String, T>> keyedRow =
      input.apply(WithKeys.of("DummyKey")).setCoder(keyedCoder);

  LimitFn<T> limitFn = new LimitFn<T>(getCount(), startIndex);
  return keyedRow.apply(ParDo.of(limitFn));
}
 
Example #28
Source File: SketchFrequenciesTest.java    From beam with Apache License 2.0 5 votes vote down vote up
/**
 * Builds a per-key frequency sketch over the small stream using the default settings and
 * verifies it with {@code VerifyStreamFrequencies}.
 */
@Test
public void perKeyDefault() {
  PCollection<Long> elements = tp.apply(Create.of(smallStream));
  Coder<Long> elementCoder = elements.getCoder();

  // One constant key, so a single sketch summarizes the whole stream.
  PCollection<Sketch<Long>> sketches =
      elements
          .apply(WithKeys.of(1))
          .apply(SketchFrequencies.perKey())
          .apply(Values.create());

  PAssert.thatSingleton("Verify number of hits", sketches)
      .satisfies(new VerifyStreamFrequencies<>(elementCoder, distinctElems, frequencies));

  tp.run();
}
 
Example #29
Source File: TDigestQuantilesTest.java    From beam with Apache License 2.0 5 votes vote down vote up
/**
 * Builds a per-key t-digest over the stream under one constant key, extracts the configured
 * quantiles, and verifies them with {@code VerifyAccuracy}.
 */
@Test
public void perKey() {
  PCollection<KV<Double, Double>> quantileResults =
      tp.apply(Create.of(stream))
          // One constant key, so a single digest summarizes the whole stream.
          .apply(WithKeys.of(1))
          .apply(TDigestQuantiles.<Integer>perKey().withCompression(compression))
          .apply(Values.create())
          .apply(ParDo.of(new RetrieveQuantiles(quantiles)));

  PAssert.that("Verify Accuracy", quantileResults).satisfies(new VerifyAccuracy());

  tp.run();
}
 
Example #30
Source File: TfIdf.java    From beam with Apache License 2.0 5 votes vote down vote up
/**
 * Reads every document in {@code uris} line by line and pairs each line with the URI of the
 * document it came from.
 *
 * @param input the pipeline start
 * @return all (URI, line) pairs, flattened into one collection
 */
@Override
public PCollection<KV<URI, String>> expand(PBegin input) {
  Pipeline pipeline = input.getPipeline();

  // One TextIO.Read transform is created per document; its keyed output is appended to this
  // list and everything is flattened at the end.
  PCollectionList<KV<URI, String>> urisToLines = PCollectionList.empty(pipeline);

  for (final URI uri : uris) {
    // TextIO.Read supports:
    //  - file: URIs and paths locally
    //  - gs: URIs on the service
    // so a file: URI is converted to its local path and anything else is passed as-is.
    String uriString =
        "file".equals(uri.getScheme()) ? new File(uri).getPath() : uri.toString();

    PCollection<KV<URI, String>> linesOfOneUri =
        pipeline
            .apply("TextIO.Read(" + uriString + ")", TextIO.read().from(uriString))
            .apply("WithKeys(" + uriString + ")", WithKeys.of(uri))
            .setCoder(KvCoder.of(StringDelegateCoder.of(URI.class), StringUtf8Coder.of()));

    urisToLines = urisToLines.and(linesOfOneUri);
  }

  return urisToLines.apply(Flatten.pCollections());
}