Java Code Examples for org.apache.beam.sdk.values.PCollectionList#apply()

The following examples show how to use org.apache.beam.sdk.values.PCollectionList#apply(). Each example comes from an open-source project; its source file and license are noted above the snippet.
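Most of the examples below follow one pattern: build a PCollectionList, then call apply() with a PTransform that consumes the whole list, most commonly Flatten.pCollections(). Here is a minimal, self-contained sketch of that pattern (not taken from any of the projects below; the class name is illustrative):

import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.transforms.Create;
import org.apache.beam.sdk.transforms.Flatten;
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.values.PCollectionList;

public class PCollectionListApplyExample {
  public static void main(String[] args) {
    Pipeline pipeline = Pipeline.create();

    PCollection<String> first = pipeline.apply("First", Create.of("a", "b"));
    PCollection<String> second = pipeline.apply("Second", Create.of("c", "d"));

    // PCollectionList#apply hands the entire list to a
    // PTransform<PCollectionList<String>, PCollection<String>>.
    PCollection<String> merged =
        PCollectionList.of(first).and(second).apply(Flatten.pCollections());

    pipeline.run().waitUntilFinish();
  }
}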
Example 1
Source File: EmptyFlattenAsCreateFactoryTest.java    From beam with Apache License 2.0
@Test
@Category(NeedsRunner.class)
public void testOverride() {
  PCollectionList<Long> empty = PCollectionList.empty(pipeline);
  PCollection<Long> emptyFlattened =
      empty.apply(
          factory
              .getReplacementTransform(
                  AppliedPTransform.of(
                      "nonEmptyInput",
                      Collections.emptyMap(),
                      Collections.emptyMap(),
                      Flatten.pCollections(),
                      pipeline))
              .getTransform());
  PAssert.that(emptyFlattened).empty();
  pipeline.run();
}
 
Example 2
Source File: OperatorTransform.java    From beam with Apache License 2.0
public static <InputT, OutputT, OperatorT extends Operator<OutputT>> PCollection<OutputT> apply(
    OperatorT operator, PCollectionList<InputT> inputs) {

  final Optional<OperatorTranslator<InputT, OutputT, OperatorT>> maybeTranslator =
      TranslatorProvider.of(inputs.getPipeline()).findTranslator(operator);

  if (maybeTranslator.isPresent()) {
    final PCollection<OutputT> output =
        inputs.apply(
            operator.getName().orElseGet(() -> operator.getClass().getName()),
            new OperatorTransform<>(operator, maybeTranslator.orElse(null)));
    Preconditions.checkState(
        output.getTypeDescriptor() != null,
        "Translator should always return a typed PCollection.");
    return output;
  }

  throw new IllegalStateException(
      "Unable to find translator for basic operator ["
          + operator.getClass()
          + "] with name ["
          + operator.getName().orElse(null)
          + "]");
}
 
Example 3
Source File: PartitionTest.java    From beam with Apache License 2.0
@Test
@Category(NeedsRunner.class)
public void testDroppedPartition() {

  // Compute the set of integers either 1 or 2 mod 3, the hard way.
  PCollectionList<Integer> outputs =
      pipeline
          .apply(Create.of(2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12))
          .apply(Partition.of(3, new ModFn()));

  List<PCollection<Integer>> outputsList = new ArrayList<>(outputs.getAll());
  outputsList.remove(0);
  outputs = PCollectionList.of(outputsList);
  assertEquals(2, outputs.size());

  PCollection<Integer> output = outputs.apply(Flatten.pCollections());
  PAssert.that(output).containsInAnyOrder(2, 4, 5, 7, 8, 10, 11);
  pipeline.run();
}
 
Example 4
Source File: DeduplicatedFlattenFactoryTest.java    From beam with Apache License 2.0
@Test
public void duplicatesInsertsMultipliers() {
  PTransform<PCollectionList<String>, PCollection<String>> replacement =
      new DeduplicatedFlattenFactory.FlattenWithoutDuplicateInputs<>();
  final PCollectionList<String> inputList =
      PCollectionList.of(first).and(second).and(first).and(first);
  inputList.apply(replacement);
  pipeline.traverseTopologically(
      new Defaults() {
        @Override
        public void visitPrimitiveTransform(TransformHierarchy.Node node) {
          if (node.getTransform() instanceof Flatten.PCollections) {
            assertThat(node.getInputs(), not(equalTo(inputList.expand())));
          }
        }
      });
}
 
Example 5
Source File: DeduplicatedFlattenFactory.java    From beam with Apache License 2.0
@Override
public PCollection<T> expand(PCollectionList<T> input) {
  Map<PCollection<T>, Integer> instances = new HashMap<>();
  for (PCollection<T> pCollection : input.getAll()) {
    int existing = instances.getOrDefault(pCollection, 0);
    instances.put(pCollection, existing + 1);
  }
  PCollectionList<T> output = PCollectionList.empty(input.getPipeline());
  for (Map.Entry<PCollection<T>, Integer> instanceEntry : instances.entrySet()) {
    if (instanceEntry.getValue().equals(1)) {
      output = output.and(instanceEntry.getKey());
    } else {
      String duplicationName = String.format("Multiply %s", instanceEntry.getKey().getName());
      PCollection<T> duplicated =
          instanceEntry
              .getKey()
              .apply(duplicationName, ParDo.of(new DuplicateFn<>(instanceEntry.getValue())));
      output = output.and(duplicated);
    }
  }
  return output.apply(Flatten.pCollections());
}
 
Example 6
Source File: FileIndexerPipeline.java    From dataflow-opinion-analysis with Apache License 2.0
/**
 * Splits the index summaries into two branches by the given ratio, enriches
 * branch B with CNLP entities, and merges the branches back together.
 *
 * @param indexes the content index summaries to process
 * @param ratio the fraction used to route elements between the two branches
 * @return the merged collection of enriched and pass-through summaries
 */
private static PCollection<ContentIndexSummary> enrichWithCNLP(
		PCollection<ContentIndexSummary> indexes, Float ratio) {
	
	PCollectionTuple splitAB = indexes
		.apply(ParDo.of(new SplitAB(ratio))
			.withOutputTags(PipelineTags.BranchA,  
				TupleTagList.of(PipelineTags.BranchB))); 
	
	PCollection<ContentIndexSummary> branchACol = splitAB.get(PipelineTags.BranchA);
	PCollection<ContentIndexSummary> branchBCol = splitAB.get(PipelineTags.BranchB);
	
	PCollection<ContentIndexSummary> enrichedBCol = branchBCol.apply(
		ParDo.of(new EnrichWithCNLPEntities()));
	
	// Merge the enriched and pass-through branches
	PCollectionList<ContentIndexSummary> contentIndexSummariesList = 
		PCollectionList.of(branchACol).and(enrichedBCol);
	PCollection<ContentIndexSummary> allIndexSummaries =
		contentIndexSummariesList.apply(Flatten.<ContentIndexSummary>pCollections());

	return allIndexSummaries;
}
 
Example 7
Source File: CountingSourceTest.java    From beam with Apache License 2.0
@Test
@Category(NeedsRunner.class)
public void testUnboundedSourceRateSplits() throws Exception {
  int elementsPerPeriod = 10;
  Duration period = Duration.millis(5);

  long numElements = 1000;
  int numSplits = 10;

  UnboundedCountingSource initial =
      CountingSource.createUnboundedFrom(0).withRate(elementsPerPeriod, period);
  List<? extends UnboundedSource<Long, ?>> splits = initial.split(numSplits, p.getOptions());
  assertEquals("Expected exact splitting", numSplits, splits.size());

  long elementsPerSplit = numElements / numSplits;
  assertEquals("Expected even splits", numElements, elementsPerSplit * numSplits);
  PCollectionList<Long> pcollections = PCollectionList.empty(p);
  for (int i = 0; i < splits.size(); ++i) {
    pcollections =
        pcollections.and(
            p.apply("split" + i, Read.from(splits.get(i)).withMaxNumRecords(elementsPerSplit)));
  }
  PCollection<Long> input = pcollections.apply(Flatten.pCollections());

  addCountingAsserts(input, numElements);
  Instant startTime = Instant.now();
  p.run();
  Instant endTime = Instant.now();
  // 500 ms if the readers are all initialized in parallel; 5000 ms if they are evaluated serially
  long expectedMinimumMillis = (numElements * period.getMillis()) / elementsPerPeriod;
  assertThat(expectedMinimumMillis, lessThan(endTime.getMillis() - startTime.getMillis()));
}
 
Example 8
Source File: WatermarkManagerTest.java    From beam with Apache License 2.0
@Before
public void setup() {

  createdInts = p.apply("createdInts", Create.of(1, 2, 3));

  filtered = createdInts.apply("filtered", Filter.greaterThan(1));
  filteredTimesTwo =
      filtered.apply(
          "timesTwo",
          ParDo.of(
              new DoFn<Integer, Integer>() {
                @ProcessElement
                public void processElement(ProcessContext c) throws Exception {
                  c.output(c.element() * 2);
                }
              }));

  keyed = createdInts.apply("keyed", WithKeys.of("MyKey"));

  intsToFlatten = p.apply("intsToFlatten", Create.of(-1, 256, 65535));
  PCollectionList<Integer> preFlatten = PCollectionList.of(createdInts).and(intsToFlatten);
  flattened = preFlatten.apply("flattened", Flatten.pCollections());

  clock = MockClock.fromInstant(new Instant(1000));
  DirectGraphs.performDirectOverrides(p);
  graph = DirectGraphs.getGraph(p);

  manager = WatermarkManager.create(clock, graph, AppliedPTransform::getFullName);
  bundleFactory = ImmutableListBundleFactory.create();
}
 
Example 9
Source File: FlattenTest.java    From beam with Apache License 2.0
@Test
public void testFlatten() {
  PCollection<Integer> input1 = pipeline.apply(Create.of(1, 2, 3, 4, 5, 6, 7, 8, 9, 10));
  PCollection<Integer> input2 = pipeline.apply(Create.of(11, 12, 13, 14, 15, 16, 17, 18, 19, 20));
  PCollectionList<Integer> pcs = PCollectionList.of(input1).and(input2);
  PCollection<Integer> input = pcs.apply(Flatten.pCollections());
  PAssert.that(input)
      .containsInAnyOrder(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20);
  pipeline.run();
}
 
Example 10
Source File: DeduplicatedFlattenFactoryTest.java    From beam with Apache License 2.0
@Test
public void outputMapping() {
  final PCollectionList<String> inputList =
      PCollectionList.of(first).and(second).and(first).and(first);
  PCollection<String> original = inputList.apply(Flatten.pCollections());
  PCollection<String> replacement = inputList.apply(new FlattenWithoutDuplicateInputs<>());

  assertThat(
      factory.mapOutputs(original.expand(), replacement),
      Matchers.hasEntry(
          replacement,
          ReplacementOutput.of(
              TaggedPValue.ofExpandedValue(original),
              TaggedPValue.ofExpandedValue(replacement))));
}
 
Example 11
Source File: DeduplicatedFlattenFactoryTest.java    From beam with Apache License 2.0
@Test
@Category(NeedsRunner.class)
public void testOverride() {
  final PCollectionList<String> inputList =
      PCollectionList.of(first).and(second).and(first).and(first);
  PTransform<PCollectionList<String>, PCollection<String>> replacement =
      new FlattenWithoutDuplicateInputs<>();
  PCollection<String> flattened = inputList.apply(replacement);

  PAssert.that(flattened).containsInAnyOrder("one", "two", "one", "one");
  pipeline.run();
}
 
Example 12
Source File: KafkaIOTest.java    From beam with Apache License 2.0
@Test
public void testUnboundedSourceSplits() throws Exception {

  int numElements = 1000;
  int numSplits = 10;

  // Coders must be specified explicitly here due to the way the transform
  // is used in the test.
  UnboundedSource<KafkaRecord<Integer, Long>, ?> initial =
      mkKafkaReadTransform(numElements, null)
          .withKeyDeserializerAndCoder(IntegerDeserializer.class, BigEndianIntegerCoder.of())
          .withValueDeserializerAndCoder(LongDeserializer.class, BigEndianLongCoder.of())
          .makeSource();

  List<? extends UnboundedSource<KafkaRecord<Integer, Long>, ?>> splits =
      initial.split(numSplits, p.getOptions());
  assertEquals("Expected exact splitting", numSplits, splits.size());

  long elementsPerSplit = numElements / numSplits;
  assertEquals("Expected even splits", numElements, elementsPerSplit * numSplits);
  PCollectionList<Long> pcollections = PCollectionList.empty(p);
  for (int i = 0; i < splits.size(); ++i) {
    pcollections =
        pcollections.and(
            p.apply("split" + i, Read.from(splits.get(i)).withMaxNumRecords(elementsPerSplit))
                .apply("Remove Metadata " + i, ParDo.of(new RemoveKafkaMetadata<>()))
                .apply("collection " + i, Values.create()));
  }
  PCollection<Long> input = pcollections.apply(Flatten.pCollections());

  addCountingAsserts(input, numElements);
  p.run();
}
 
Example 13
Source File: FixedInputRuntime.java    From components with Apache License 2.0
@Override
public PCollection<IndexedRecord> expand(PBegin begin) {
    FixedDatasetRuntime runtime = new FixedDatasetRuntime();
    runtime.initialize(null, properties.getDatasetProperties());

    // The values to include in the PCollection
    List<IndexedRecord> values = new LinkedList<>();

    if (properties.overrideValuesAction.getValue() == FixedInputProperties.OverrideValuesAction.NONE
            || properties.overrideValuesAction.getValue() == FixedInputProperties.OverrideValuesAction.APPEND) {
        if (!properties.getDatasetProperties().values.getValue().trim().isEmpty()) {
            values.addAll(runtime.getValues(Integer.MAX_VALUE));
        }
    }

    if (properties.overrideValuesAction.getValue() == FixedInputProperties.OverrideValuesAction.APPEND
            || properties.overrideValuesAction.getValue() == FixedInputProperties.OverrideValuesAction.REPLACE) {
        properties.getDatasetProperties().values.setValue(properties.overrideValues.getValue());
        if (!properties.getDatasetProperties().values.getValue().trim().isEmpty()) {
            values.addAll(runtime.getValues(Integer.MAX_VALUE));
        }
    }

    if (values.size() != 0) {
        PCollection<IndexedRecord> out = (PCollection<IndexedRecord>) begin
                .apply(Create.of(values).withCoder((AvroCoder) AvroCoder.of(runtime.getSchema())));
        if (properties.repeat.getValue() > 1) {
            PCollectionList<IndexedRecord> merged = PCollectionList.of(out);
            // Add repeat-1 extra copies so the flattened output contains the values `repeat` times.
            for (int i = 2; i <= properties.repeat.getValue(); i++) {
                merged = merged.and(out);
            }
            out = merged.apply(Flatten.<IndexedRecord> pCollections());
        }
        return out;
    } else {
        return begin.apply(RowGeneratorIO.read().withSchema(runtime.getSchema()) //
                .withSeed(0L) //
                .withPartitions(1) //
                .withRows(properties.repeat.getValue()));
    }
}
 
Example 14
Source File: AutoComplete.java    From beam with Apache License 2.0
@Override
public PCollectionList<KV<String, List<CompletionCandidate>>> expand(
    PCollection<CompletionCandidate> input) {
  if (minPrefix > 10) {
    // Base case, partitioning to return the output in the expected format.
    return input
        .apply(new ComputeTopFlat(candidatesPerPrefix, minPrefix))
        .apply(Partition.of(2, new KeySizePartitionFn()));
  } else {
    // If a candidate is in the top N for prefix a...b, it must also be in the top
    // N for a...bX for every X, which is typically a much smaller set to consider.
    // First, compute the top candidate for prefixes of size at least minPrefix + 1.
    PCollectionList<KV<String, List<CompletionCandidate>>> larger =
        input.apply(new ComputeTopRecursive(candidatesPerPrefix, minPrefix + 1));
    // Consider the top candidates for each prefix of length minPrefix + 1...
    PCollection<KV<String, List<CompletionCandidate>>> small =
        PCollectionList.of(larger.get(1).apply(ParDo.of(new FlattenTops())))
            // ...together with those (previously excluded) candidates of length
            // exactly minPrefix...
            .and(input.apply(Filter.by(c -> c.getValue().length() == minPrefix)))
            .apply("FlattenSmall", Flatten.pCollections())
            // ...set the key to be the minPrefix-length prefix...
            .apply(ParDo.of(new AllPrefixes(minPrefix, minPrefix)))
            // ...and (re)apply the Top operator to all of them together.
            .apply(Top.largestPerKey(candidatesPerPrefix));

    PCollection<KV<String, List<CompletionCandidate>>> flattenLarger =
        larger.apply("FlattenLarge", Flatten.pCollections());

    return PCollectionList.of(flattenLarger).and(small);
  }
}
 
Example 15
Source File: TfIdf.java    From beam with Apache License 2.0
@Override
public PCollection<KV<URI, String>> expand(PBegin input) {
  Pipeline pipeline = input.getPipeline();

  // Create one TextIO.Read transform for each document
  // and add its output to a PCollectionList
  PCollectionList<KV<URI, String>> urisToLines = PCollectionList.empty(pipeline);

  // TextIO.Read supports:
  //  - file: URIs and paths locally
  //  - gs: URIs on the service
  for (final URI uri : uris) {
    String uriString;
    if ("file".equals(uri.getScheme())) {
      uriString = new File(uri).getPath();
    } else {
      uriString = uri.toString();
    }

    PCollection<KV<URI, String>> oneUriToLines =
        pipeline
            .apply("TextIO.Read(" + uriString + ")", TextIO.read().from(uriString))
            .apply("WithKeys(" + uriString + ")", WithKeys.of(uri))
            .setCoder(KvCoder.of(StringDelegateCoder.of(URI.class), StringUtf8Coder.of()));

    urisToLines = urisToLines.and(oneUriToLines);
  }

  return urisToLines.apply(Flatten.pCollections());
}
 
Example 16
Source File: IndexerPipeline.java    From dataflow-opinion-analysis with Apache License 2.0
/**
 * Filters out documents that were already processed, using a side input of
 * document hashes (read from BigQuery, or empty when writing with truncate).
 *
 * @param contentToIndexNotSkipped content slated for indexing
 * @param contentNotToIndexSkipped content already excluded from indexing
 * @param pipeline the pipeline to attach the side-input read to
 * @param options options that select the truncate or append write mode
 * @return the content split into items to index and items not to index
 */
private static ContentToIndexOrNot filterAlreadyProcessedDocuments(
		PCollection<InputContent> contentToIndexNotSkipped, PCollection<InputContent> contentNotToIndexSkipped,
		Pipeline pipeline, IndexerPipelineOptions options) {
	PCollection<KV<String,Long>> alreadyProcessedDocs = null;
	
	if (!options.getWriteTruncate()) {
		String query = IndexerPipelineUtils.buildBigQueryProcessedDocsQuery(options);
		alreadyProcessedDocs = pipeline
			.apply("Get already processed Documents",BigQueryIO.read().fromQuery(query))
			.apply(ParDo.of(new GetDocumentHashFn()));

	} else {
		Map<String, Long> map = new HashMap<String,Long>();
		alreadyProcessedDocs = pipeline
			.apply("Create empty side input of Docs",
				Create.of(map).withCoder(KvCoder.of(StringUtf8Coder.of(),VarLongCoder.of())));
	}			
	
	final PCollectionView<Map<String,Long>> alreadyProcessedDocsSideInput =  
		alreadyProcessedDocs.apply(View.<String,Long>asMap());
	
	PCollectionTuple indexOrNotBasedOnExactDupes = contentToIndexNotSkipped
		.apply("Extract DocumentHash key", ParDo.of(new GetInputContentDocumentHashFn()))
		.apply("Group by DocumentHash key", GroupByKey.<String, InputContent>create())
		.apply("Eliminate InputContent Dupes", ParDo.of(new EliminateInputContentDupes(alreadyProcessedDocsSideInput))
			.withSideInputs(alreadyProcessedDocsSideInput)
			.withOutputTags(PipelineTags.contentToIndexNotExactDupesTag, // main output collection
				TupleTagList.of(PipelineTags.contentNotToIndexExactDupesTag))); // side output collection	
	
	PCollection<InputContent> contentToIndexNotExactDupes = indexOrNotBasedOnExactDupes.get(PipelineTags.contentToIndexNotExactDupesTag);
	PCollection<InputContent> contentNotToIndexExactDupes = indexOrNotBasedOnExactDupes.get(PipelineTags.contentNotToIndexExactDupesTag);
	
	// Merge the sets of items that are dupes or skipped
	PCollectionList<InputContent> contentNotToIndexList = PCollectionList.of(contentNotToIndexExactDupes).and(contentNotToIndexSkipped);
	
	ContentToIndexOrNot content = new ContentToIndexOrNot(contentToIndexNotExactDupes, contentNotToIndexList.apply(Flatten.<InputContent>pCollections()));
	return content;
}
 
Example 17
Source File: PipelineTest.java    From beam with Apache License 2.0
@Override
public PCollection<T> expand(PBegin input) {
  PCollectionList<T> empty = PCollectionList.empty(input.getPipeline());
  return empty.apply(Flatten.pCollections());
}
 
Example 18
Source File: IndexerPipeline.java    From dataflow-opinion-analysis with Apache License 2.0
/**
 * Writes the webresource, document, and sentiment rows to their respective
 * BigQuery tables.
 *
 * @param bqrows tuple containing the webresource, document, and sentiment rows
 * @param webresourceRowsUnindexed webresource rows that were not indexed
 * @param webresourceDeduped deduplicated webresource rows, or null when deduplication was skipped
 * @param options options that control write disposition and streaming mode
 */
private static void writeAllTablesToBigQuery(PCollectionTuple bqrows,
		PCollection<TableRow> webresourceRowsUnindexed, PCollection<TableRow> webresourceDeduped,
		IndexerPipelineOptions options) {
	PCollection<TableRow> webresourceRows = bqrows.get(PipelineTags.webresourceTag);
	PCollection<TableRow> documentRows = bqrows.get(PipelineTags.documentTag);
	PCollection<TableRow> sentimentRows = bqrows.get(PipelineTags.sentimentTag);

	// Now write to BigQuery
	WriteDisposition dispo = options.getWriteTruncate() ? 
		WriteDisposition.WRITE_TRUNCATE: WriteDisposition.WRITE_APPEND; 
	
	//Merge all collections with WebResource table records
	PCollectionList<TableRow> webresourceRowsList = (webresourceDeduped == null) ?
		PCollectionList.of(webresourceRows).and(webresourceRowsUnindexed) :
		PCollectionList.of(webresourceRows).and(webresourceRowsUnindexed).and(webresourceDeduped);
			
	PCollection<TableRow> allWebresourceRows = 
		webresourceRowsList.apply(Flatten.<TableRow>pCollections());
			
	allWebresourceRows = !options.isStreaming() ? 
		allWebresourceRows.apply("Reshuffle Webresources", new Reshuffle<TableRow>()) : 
		allWebresourceRows;
	
	allWebresourceRows
		.apply("Write to webresource", 
			BigQueryIO.writeTableRows()
				.to(getWebResourcePartitionedTableRef(options)) 
				.withSchema(getWebResourceSchema())
				.withCreateDisposition(CreateDisposition.CREATE_NEVER)
				.withWriteDisposition(dispo)); 
	
	documentRows = !options.isStreaming() ?
		documentRows.apply("Reshuffle Documents", new Reshuffle<TableRow>()):
		documentRows;
			
	documentRows
		.apply("Write to document", 
			BigQueryIO.writeTableRows()
				.to(getDocumentPartitionedTableRef(options))
				.withSchema(getDocumentTableSchema())
				.withCreateDisposition(CreateDisposition.CREATE_NEVER)
				.withWriteDisposition(dispo)); 
	
	sentimentRows = !options.isStreaming() ?
		sentimentRows.apply("Reshuffle Sentiments", new Reshuffle<TableRow>()):
		sentimentRows;
			
	sentimentRows
		.apply("Write to sentiment", 
			BigQueryIO.writeTableRows()
				.to(getSentimentPartitionedTableRef(options)) 
				.withSchema(getSentimentSchema())
				.withCreateDisposition(CreateDisposition.CREATE_NEVER)
				.withWriteDisposition(dispo));
}
 
Example 19
Source File: CreateStreamTest.java    From beam with Apache License 2.0
@Test
public void testFlattenedWithWatermarkHold() throws IOException {
  Instant instant = new Instant(0);
  CreateStream<Integer> source1 =
      CreateStream.of(VarIntCoder.of(), batchDuration())
          .emptyBatch()
          .advanceWatermarkForNextBatch(instant.plus(Duration.standardMinutes(5)))
          .nextBatch(
              TimestampedValue.of(1, instant),
              TimestampedValue.of(2, instant),
              TimestampedValue.of(3, instant))
          .advanceWatermarkForNextBatch(instant.plus(Duration.standardMinutes(10)));
  CreateStream<Integer> source2 =
      CreateStream.of(VarIntCoder.of(), batchDuration())
          .emptyBatch()
          .advanceWatermarkForNextBatch(instant.plus(Duration.standardMinutes(1)))
          .nextBatch(TimestampedValue.of(4, instant))
          .advanceWatermarkForNextBatch(instant.plus(Duration.standardMinutes(2)))
          .nextBatch(TimestampedValue.of(5, instant))
          .advanceWatermarkForNextBatch(instant.plus(Duration.standardMinutes(5)))
          .emptyBatch()
          .advanceNextBatchWatermarkToInfinity();

  PCollection<Integer> windowed1 =
      p.apply("CreateStream1", source1)
          .apply(
              "Window1",
              Window.<Integer>into(FixedWindows.of(Duration.standardMinutes(5)))
                  .triggering(AfterWatermark.pastEndOfWindow())
                  .accumulatingFiredPanes()
                  .withAllowedLateness(Duration.ZERO));
  PCollection<Integer> windowed2 =
      p.apply("CreateStream2", source2)
          .apply(
              "Window2",
              Window.<Integer>into(FixedWindows.of(Duration.standardMinutes(5)))
                  .triggering(AfterWatermark.pastEndOfWindow())
                  .accumulatingFiredPanes()
                  .withAllowedLateness(Duration.ZERO));

  PCollectionList<Integer> pCollectionList = PCollectionList.of(windowed1).and(windowed2);
  PCollection<Integer> flattened = pCollectionList.apply(Flatten.pCollections());
  PCollection<Integer> triggered =
      flattened
          .apply(WithKeys.of(1))
          .apply(GroupByKey.create())
          .apply(Values.create())
          .apply(Flatten.iterables());

  IntervalWindow window = new IntervalWindow(instant, instant.plus(Duration.standardMinutes(5L)));
  PAssert.that(triggered).inOnTimePane(window).containsInAnyOrder(1, 2, 3, 4, 5);

  p.run();
}
 
Example 20
Source File: FlattenEvaluatorFactoryTest.java    From beam with Apache License 2.0
@Test
public void testFlattenInMemoryEvaluator() throws Exception {
  PCollection<Integer> left = p.apply("left", Create.of(1, 2, 4));
  PCollection<Integer> right = p.apply("right", Create.of(-1, 2, -4));
  PCollectionList<Integer> list = PCollectionList.of(left).and(right);

  PCollection<Integer> flattened = list.apply(Flatten.pCollections());

  CommittedBundle<Integer> leftBundle = bundleFactory.createBundle(left).commit(Instant.now());
  CommittedBundle<Integer> rightBundle = bundleFactory.createBundle(right).commit(Instant.now());

  EvaluationContext context = mock(EvaluationContext.class);

  UncommittedBundle<Integer> flattenedLeftBundle = bundleFactory.createBundle(flattened);
  UncommittedBundle<Integer> flattenedRightBundle = bundleFactory.createBundle(flattened);

  when(context.createBundle(flattened)).thenReturn(flattenedLeftBundle, flattenedRightBundle);

  FlattenEvaluatorFactory factory = new FlattenEvaluatorFactory(context);
  AppliedPTransform<?, ?, ?> flattenedProducer = DirectGraphs.getProducer(flattened);
  TransformEvaluator<Integer> leftSideEvaluator =
      factory.forApplication(flattenedProducer, leftBundle);
  TransformEvaluator<Integer> rightSideEvaluator =
      factory.forApplication(flattenedProducer, rightBundle);

  leftSideEvaluator.processElement(WindowedValue.valueInGlobalWindow(1));
  rightSideEvaluator.processElement(WindowedValue.valueInGlobalWindow(-1));
  leftSideEvaluator.processElement(
      WindowedValue.timestampedValueInGlobalWindow(2, new Instant(1024)));
  leftSideEvaluator.processElement(WindowedValue.valueInGlobalWindow(4, PaneInfo.NO_FIRING));
  rightSideEvaluator.processElement(
      WindowedValue.valueInGlobalWindow(2, PaneInfo.ON_TIME_AND_ONLY_FIRING));
  rightSideEvaluator.processElement(
      WindowedValue.timestampedValueInGlobalWindow(-4, new Instant(-4096)));

  TransformResult<Integer> rightSideResult = rightSideEvaluator.finishBundle();
  TransformResult<Integer> leftSideResult = leftSideEvaluator.finishBundle();

  assertThat(rightSideResult.getOutputBundles(), Matchers.contains(flattenedRightBundle));
  assertThat(
      rightSideResult.getTransform(),
      Matchers.<AppliedPTransform<?, ?, ?>>equalTo(flattenedProducer));
  assertThat(leftSideResult.getOutputBundles(), Matchers.contains(flattenedLeftBundle));
  assertThat(
      leftSideResult.getTransform(),
      Matchers.<AppliedPTransform<?, ?, ?>>equalTo(flattenedProducer));

  assertThat(
      flattenedLeftBundle.commit(Instant.now()).getElements(),
      containsInAnyOrder(
          WindowedValue.timestampedValueInGlobalWindow(2, new Instant(1024)),
          WindowedValue.valueInGlobalWindow(4, PaneInfo.NO_FIRING),
          WindowedValue.valueInGlobalWindow(1)));
  assertThat(
      flattenedRightBundle.commit(Instant.now()).getElements(),
      containsInAnyOrder(
          WindowedValue.valueInGlobalWindow(2, PaneInfo.ON_TIME_AND_ONLY_FIRING),
          WindowedValue.timestampedValueInGlobalWindow(-4, new Instant(-4096)),
          WindowedValue.valueInGlobalWindow(-1)));
}