org.apache.beam.sdk.transforms.Flatten Java Examples

The following examples show how to use org.apache.beam.sdk.transforms.Flatten. The examples are drawn from open-source projects; the source file and originating project are listed above each example.
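Before the project examples, here is a minimal sketch of the two entry points used throughout this page: Flatten.pCollections(), which merges the PCollections in a PCollectionList into a single PCollection, and Flatten.iterables(), which flattens a PCollection of Iterables into a PCollection of their elements. This sketch is not taken from any project below; the pipeline variable and element values are placeholders.

// Minimal sketch; `pipeline` is an existing Pipeline and the values are placeholders.
PCollection<String> first = pipeline.apply("First", Create.of("a", "b"));
PCollection<String> second = pipeline.apply("Second", Create.of("c", "d"));

// Flatten.pCollections(): merge several PCollections of the same element type into one.
PCollection<String> merged =
    PCollectionList.of(first).and(second).apply(Flatten.pCollections());

// Flatten.iterables(): turn a PCollection<Iterable<String>> into its elements.
// The explicit coder mirrors the pattern in Example #25 below.
PCollection<String> elements =
    pipeline
        .apply(
            Create.<Iterable<String>>of(Arrays.asList("x", "y"))
                .withCoder(IterableCoder.of(StringUtf8Coder.of())))
        .apply(Flatten.iterables());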
Example #1
Source File: DirectGraphVisitorTest.java    From beam with Apache License 2.0
@Test
public void getValueToConsumersWithDuplicateInputSucceeds() {
  PCollection<String> created = p.apply(Create.of("1", "2", "3"));

  PCollection<String> flattened =
      PCollectionList.of(created).and(created).apply(Flatten.pCollections());

  p.traverseTopologically(visitor);

  DirectGraph graph = visitor.getGraph();
  AppliedPTransform<?, ?, ?> flattenedProducer = graph.getProducer(flattened);

  assertThat(
      graph.getPerElementConsumers(created),
      Matchers.containsInAnyOrder(new Object[] {flattenedProducer, flattenedProducer}));
  assertThat(graph.getPerElementConsumers(flattened), emptyIterable());
}
 
Example #2
Source File: FlattenTranslatorBatch.java    From twister2 with Apache License 2.0
@Override
public void translateNode(
    Flatten.PCollections<T> transform, Twister2BatchTranslationContext context) {
  Collection<PValue> pcs = context.getInputs().values();
  List<BatchTSetImpl<WindowedValue<T>>> tSets = new ArrayList<>();
  BatchTSetImpl<WindowedValue<T>> unionTSet = null;
  if (pcs.isEmpty()) {
    // TODO: create empty TSet
    throw new UnsupportedOperationException("Operation not implemented yet");
  } else {
    for (PValue pc : pcs) {
      BatchTSetImpl<WindowedValue<T>> curr = context.getInputDataSet(pc);
      tSets.add(curr);
    }

    BatchTSetImpl<WindowedValue<T>> first = tSets.remove(0);
    Collection<TSet<WindowedValue<T>>> others = new ArrayList<>(tSets);
    if (!others.isEmpty()) {
      unionTSet = first.union(others);
    } else {
      unionTSet = first;
    }
  }
  context.setOutputDataSet(context.getOutput(transform), unionTSet);
}
 
Example #3
Source File: FileIndexerPipeline.java    From dataflow-opinion-analysis with Apache License 2.0
/**
 * Splits the indexes into two branches, enriches branch B with CNLP entities,
 * and flattens both branches back into a single collection.
 *
 * @param indexes content index summaries to enrich
 * @param ratio split ratio passed to SplitAB
 * @return the merged collection of summaries
 */
private static PCollection<ContentIndexSummary> enrichWithCNLP(
		PCollection<ContentIndexSummary> indexes, Float ratio) {
	
	PCollectionTuple splitAB = indexes
		.apply(ParDo.of(new SplitAB(ratio))
			.withOutputTags(PipelineTags.BranchA,  
				TupleTagList.of(PipelineTags.BranchB))); 
	
	PCollection<ContentIndexSummary> branchACol = splitAB.get(PipelineTags.BranchA);
	PCollection<ContentIndexSummary> branchBCol = splitAB.get(PipelineTags.BranchB);
	
	PCollection<ContentIndexSummary> enrichedBCol = branchBCol.apply(
		ParDo.of(new EnrichWithCNLPEntities()));
	
	//Merge all collections with WebResource table records
	PCollectionList<ContentIndexSummary> contentIndexSummariesList = 
		PCollectionList.of(branchACol).and(enrichedBCol);
	PCollection<ContentIndexSummary> allIndexSummaries = 
		contentIndexSummariesList.apply(Flatten.<ContentIndexSummary>pCollections());

	return allIndexSummaries;
}
 
Example #4
Source File: IndexerPipeline.java    From dataflow-opinion-analysis with Apache License 2.0
/**
 * Splits the filtered indexes into two branches, enriches branch B with CNLP
 * entities, and flattens both branches back into a single collection.
 *
 * @param filteredIndexes content index summaries to enrich
 * @param ratio split ratio passed to SplitAB
 * @return the merged collection of summaries
 */
private static PCollection<ContentIndexSummary> enrichWithCNLP(
		PCollection<ContentIndexSummary> filteredIndexes, Float ratio) {
	
	PCollectionTuple splitAB = filteredIndexes
		.apply(ParDo.of(new SplitAB(ratio))
			.withOutputTags(PipelineTags.BranchA,  
				TupleTagList.of(PipelineTags.BranchB))); 
	
	PCollection<ContentIndexSummary> branchACol = splitAB.get(PipelineTags.BranchA);
	PCollection<ContentIndexSummary> branchBCol = splitAB.get(PipelineTags.BranchB);
	
	PCollection<ContentIndexSummary> enrichedBCol = branchBCol.apply(
		ParDo.of(new EnrichWithCNLPEntities()));
	
	//Merge all collections with WebResource table records
	PCollectionList<ContentIndexSummary> contentIndexSummariesList = 
		PCollectionList.of(branchACol).and(enrichedBCol);
	PCollection<ContentIndexSummary> allIndexSummaries = 
		contentIndexSummariesList.apply(Flatten.<ContentIndexSummary>pCollections());

	return allIndexSummaries;
}
 
Example #5
Source File: SideInputLoadTest.java    From beam with Apache License 2.0
private void performTestWithMap(
    PCollection<KV<byte[], byte[]>> input, Optional<SyntheticStep> syntheticStep) {
  applyStepIfPresent(input, "Synthetic step", syntheticStep);
  PCollectionView<Map<byte[], byte[]>> sideInput =
      applyWindowingIfPresent(input).apply(View.asMap());
  PCollectionView<List<byte[]>> randomKeys =
      pipeline
          .apply(Create.of(0))
          .apply(
              ParDo.of(new GetRandomKeyList(sideInput, options.getAccessPercentage()))
                  .withSideInputs(sideInput))
          .apply(Flatten.iterables())
          .apply(View.asList());

  input
      .apply(
          ParDo.of(new SideInputTestWithMap(sideInput, randomKeys))
              .withSideInputs(sideInput, randomKeys))
      .apply("Collect end time metrics", ParDo.of(runtimeMonitor));
}
 
Example #6
Source File: Window.java    From beam with Apache License 2.0
@Override
public PCollection<T> expand(PCollection<T> input) {
  applicableTo(input);

  WindowingStrategy<?, ?> outputStrategy =
      getOutputStrategyInternal(input.getWindowingStrategy());

  if (getWindowFn() == null) {
    // A new PCollection must be created in case input is reused in a different location as the
    // two PCollections will, in general, have a different windowing strategy.
    return PCollectionList.of(input)
        .apply(Flatten.pCollections())
        .setWindowingStrategyInternal(outputStrategy);
  } else {
    // This is the AssignWindows primitive
    return input.apply(new Assign<>(this, outputStrategy));
  }
}
 
Example #7
Source File: PAssert.java    From beam with Apache License 2.0
@Override
public PCollectionView<ActualT> expand(PBegin input) {
  final Coder<T> coder = actual.getCoder();
  return actual
      .apply("FilterActuals", rewindowActuals.prepareActuals())
      .apply("GatherPanes", GatherAllPanes.globally())
      .apply("ExtractPane", MapElements.via(extractPane))
      .setCoder(IterableCoder.of(coder))
      .apply(Flatten.iterables())
      .apply("RewindowActuals", rewindowActuals.windowActuals())
      .apply(
          ParDo.of(
              new DoFn<T, T>() {
                @ProcessElement
                public void processElement(ProcessContext context) throws CoderException {
                  context.output(CoderUtils.clone(coder, context.element()));
                }
              }))
      .apply(actualView);
}
 
Example #8
Source File: TestStreamTest.java    From beam with Apache License 2.0
@Test
@Category({NeedsRunner.class, UsesTestStream.class})
public void testElementsAtAlmostPositiveInfinity() {
  Instant endOfGlobalWindow = GlobalWindow.INSTANCE.maxTimestamp();
  TestStream<String> stream =
      TestStream.create(StringUtf8Coder.of())
          .addElements(
              TimestampedValue.of("foo", endOfGlobalWindow),
              TimestampedValue.of("bar", endOfGlobalWindow))
          .advanceWatermarkToInfinity();

  FixedWindows windows = FixedWindows.of(Duration.standardHours(6));
  PCollection<String> windowedValues =
      p.apply(stream)
          .apply(into(windows))
          .apply(WithKeys.of(1))
          .apply(GroupByKey.create())
          .apply(Values.create())
          .apply(Flatten.iterables());

  PAssert.that(windowedValues)
      .inWindow(windows.assignWindow(endOfGlobalWindow))
      .containsInAnyOrder("foo", "bar");
  p.run();
}
 
Example #9
Source File: PTransformMatchers.java    From beam with Apache License 2.0
/**
 * A {@link PTransformMatcher} which matches a {@link Flatten.PCollections} which consumes a
 * single input {@link PCollection} multiple times.
 */
public static PTransformMatcher flattenWithDuplicateInputs() {
  return new PTransformMatcher() {
    @Override
    public boolean matches(AppliedPTransform<?, ?, ?> application) {
      if (application.getTransform() instanceof Flatten.PCollections) {
        Set<PValue> observed = new HashSet<>();
        for (PValue pvalue : application.getInputs().values()) {
          boolean firstInstance = observed.add(pvalue);
          if (!firstInstance) {
            return true;
          }
        }
      }
      return false;
    }

    @Override
    public String toString() {
      return MoreObjects.toStringHelper("FlattenWithDuplicateInputsMatcher").toString();
    }
  };
}
 
Example #10
Source File: DeduplicatedFlattenFactoryTest.java    From beam with Apache License 2.0
@Test
public void duplicatesInsertsMultipliers() {
  PTransform<PCollectionList<String>, PCollection<String>> replacement =
      new DeduplicatedFlattenFactory.FlattenWithoutDuplicateInputs<>();
  final PCollectionList<String> inputList =
      PCollectionList.of(first).and(second).and(first).and(first);
  inputList.apply(replacement);
  pipeline.traverseTopologically(
      new Defaults() {
        @Override
        public void visitPrimitiveTransform(TransformHierarchy.Node node) {
          if (node.getTransform() instanceof Flatten.PCollections) {
            assertThat(node.getInputs(), not(equalTo(inputList.expand())));
          }
        }
      });
}
 
Example #11
Source File: PTransformMatchersTest.java    From beam with Apache License 2.0
@Test
public void emptyFlattenWithNonEmptyFlatten() {
  AppliedPTransform application =
      AppliedPTransform.of(
          "Flatten",
          Collections.singletonMap(
              new TupleTag<Integer>(),
              PCollection.createPrimitiveOutputInternal(
                  p, WindowingStrategy.globalDefault(), IsBounded.BOUNDED, VarIntCoder.of())),
          Collections.singletonMap(
              new TupleTag<Integer>(),
              PCollection.createPrimitiveOutputInternal(
                  p, WindowingStrategy.globalDefault(), IsBounded.BOUNDED, VarIntCoder.of())),
          Flatten.pCollections(),
          p);

  assertThat(PTransformMatchers.emptyFlatten().matches(application), is(false));
}
 
Example #12
Source File: PTransformMatchersTest.java    From beam with Apache License 2.0
@Test
public void flattenWithDuplicateInputsWithoutDuplicates() {
  AppliedPTransform application =
      AppliedPTransform.of(
          "Flatten",
          Collections.singletonMap(
              new TupleTag<Integer>(),
              PCollection.createPrimitiveOutputInternal(
                  p, WindowingStrategy.globalDefault(), IsBounded.BOUNDED, VarIntCoder.of())),
          Collections.singletonMap(
              new TupleTag<Integer>(),
              PCollection.createPrimitiveOutputInternal(
                  p, WindowingStrategy.globalDefault(), IsBounded.BOUNDED, VarIntCoder.of())),
          Flatten.pCollections(),
          p);

  assertThat(PTransformMatchers.flattenWithDuplicateInputs().matches(application), is(false));
}
 
Example #13
Source File: PTransformMatchersTest.java    From beam with Apache License 2.0
@Test
public void flattenWithDuplicateInputsNonFlatten() {
  AppliedPTransform application =
      AppliedPTransform
          .<PCollection<Iterable<Integer>>, PCollection<Integer>, Flatten.Iterables<Integer>>of(
              "EmptyFlatten",
              Collections.emptyMap(),
              Collections.singletonMap(
                  new TupleTag<Integer>(),
                  PCollection.createPrimitiveOutputInternal(
                      p, WindowingStrategy.globalDefault(), IsBounded.BOUNDED, VarIntCoder.of())),
              /* This isn't actually possible to construct, but for the sake of example */
              Flatten.iterables(),
              p);

  assertThat(PTransformMatchers.flattenWithDuplicateInputs().matches(application), is(false));
}
 
Example #14
Source File: UnconsumedReadsTest.java    From beam with Apache License 2.0
@Test
public void doesNotConsumeAlreadyConsumedRead() {
  Unbounded<Long> transform = Read.from(CountingSource.unbounded());
  final PCollection<Long> output = pipeline.apply(transform);
  final Flatten.PCollections<Long> consumer = Flatten.pCollections();
  PCollectionList.of(output).apply(consumer);
  UnconsumedReads.ensureAllReadsConsumed(pipeline);
  pipeline.traverseTopologically(
      new PipelineVisitor.Defaults() {
        @Override
        public void visitPrimitiveTransform(Node node) {
          // The output should only be consumed by a single consumer
          if (node.getInputs().values().contains(output)) {
            assertThat(node.getTransform(), Matchers.is(consumer));
          }
        }
      });
}
 
Example #15
Source File: EmptyFlattenAsCreateFactoryTest.java    From beam with Apache License 2.0
@Test
public void getInputNonEmptyThrows() {
  PCollectionList<Long> nonEmpty =
      PCollectionList.of(pipeline.apply("unbounded", GenerateSequence.from(0)))
          .and(pipeline.apply("bounded", GenerateSequence.from(0).to(100)));
  thrown.expect(IllegalArgumentException.class);
  thrown.expectMessage(nonEmpty.expand().toString());
  thrown.expectMessage(EmptyFlattenAsCreateFactory.class.getSimpleName());
  factory.getReplacementTransform(
      AppliedPTransform.of(
          "nonEmptyInput",
          nonEmpty.expand(),
          Collections.emptyMap(),
          Flatten.pCollections(),
          pipeline));
}
 
Example #16
Source File: EmptyFlattenAsCreateFactoryTest.java    From beam with Apache License 2.0
@Test
@Category(NeedsRunner.class)
public void testOverride() {
  PCollectionList<Long> empty = PCollectionList.empty(pipeline);
  PCollection<Long> emptyFlattened =
      empty.apply(
          factory
              .getReplacementTransform(
                  AppliedPTransform.of(
                      "nonEmptyInput",
                      Collections.emptyMap(),
                      Collections.emptyMap(),
                      Flatten.pCollections(),
                      pipeline))
              .getTransform());
  PAssert.that(emptyFlattened).empty();
  pipeline.run();
}
 
Example #17
Source File: QueryablePipelineTest.java    From beam with Apache License 2.0
/**
 * Tests that {@link QueryablePipeline#getPerElementConsumers(PCollectionNode)} returns a
 * transform that consumes the node more than once.
 */
@Test
public void perElementConsumersWithConsumingMultipleTimes() {
  Pipeline p = Pipeline.create();
  PCollection<Long> longs = p.apply("BoundedRead", Read.from(CountingSource.upTo(100L)));
  PCollectionList.of(longs).and(longs).and(longs).apply("flatten", Flatten.pCollections());

  Components components = PipelineTranslation.toProto(p).getComponents();
  // This breaks if the way that IDs are assigned to PTransforms changes in PipelineTranslation
  String readOutput =
      getOnlyElement(components.getTransformsOrThrow("BoundedRead").getOutputsMap().values());
  QueryablePipeline qp = QueryablePipeline.forPrimitivesIn(components);
  Set<PTransformNode> consumers =
      qp.getPerElementConsumers(
          PipelineNode.pCollection(readOutput, components.getPcollectionsOrThrow(readOutput)));

  assertThat(consumers.size(), equalTo(1));
  assertThat(
      getOnlyElement(consumers).getTransform().getSpec().getUrn(),
      equalTo(PTransformTranslation.FLATTEN_TRANSFORM_URN));
}
 
Example #18
Source File: PTransformMatchersTest.java    From beam with Apache License 2.0
@Test
public void emptyFlattenWithNonFlatten() {
  AppliedPTransform application =
      AppliedPTransform
          .<PCollection<Iterable<Integer>>, PCollection<Integer>, Flatten.Iterables<Integer>>of(
              "EmptyFlatten",
              Collections.emptyMap(),
              Collections.singletonMap(
                  new TupleTag<Integer>(),
                  PCollection.createPrimitiveOutputInternal(
                      p, WindowingStrategy.globalDefault(), IsBounded.BOUNDED, VarIntCoder.of())),
              /* This isn't actually possible to construct, but for the sake of example */
              Flatten.iterables(),
              p);

  assertThat(PTransformMatchers.emptyFlatten().matches(application), is(false));
}
 
Example #19
Source File: CreateStreamTest.java    From beam with Apache License 2.0
@Test
public void testElementsAtAlmostPositiveInfinity() throws IOException {
  Instant endOfGlobalWindow = GlobalWindow.INSTANCE.maxTimestamp();
  CreateStream<String> source =
      CreateStream.of(StringUtf8Coder.of(), batchDuration())
          .nextBatch(
              TimestampedValue.of("foo", endOfGlobalWindow),
              TimestampedValue.of("bar", endOfGlobalWindow))
          .advanceNextBatchWatermarkToInfinity();

  FixedWindows windows = FixedWindows.of(Duration.standardHours(6));
  PCollection<String> windowedValues =
      p.apply(source)
          .apply(Window.into(windows))
          .apply(WithKeys.of(1))
          .apply(GroupByKey.create())
          .apply(Values.create())
          .apply(Flatten.iterables());

  PAssert.that(windowedValues)
      .inWindow(windows.assignWindow(endOfGlobalWindow))
      .containsInAnyOrder("foo", "bar");
  p.run();
}
 
Example #20
Source File: TrackStreamingSourcesTest.java    From beam with Apache License 2.0
@Test
public void testTrackFlattened() {
  options.setRunner(SparkRunner.class);
  JavaSparkContext jsc = SparkContextFactory.getSparkContext(options);
  JavaStreamingContext jssc =
      new JavaStreamingContext(
          jsc, new org.apache.spark.streaming.Duration(options.getBatchIntervalMillis()));

  Pipeline p = Pipeline.create(options);

  CreateStream<Integer> queueStream1 =
      CreateStream.of(VarIntCoder.of(), Duration.millis(options.getBatchIntervalMillis()))
          .emptyBatch();
  CreateStream<Integer> queueStream2 =
      CreateStream.of(VarIntCoder.of(), Duration.millis(options.getBatchIntervalMillis()))
          .emptyBatch();

  PCollection<Integer> pcol1 = p.apply(queueStream1);
  PCollection<Integer> pcol2 = p.apply(queueStream2);
  PCollection<Integer> flattened =
      PCollectionList.of(pcol1).and(pcol2).apply(Flatten.pCollections());
  flattened.apply(ParDo.of(new PassthroughFn<>()));

  p.traverseTopologically(new StreamingSourceTracker(jssc, p, ParDo.MultiOutput.class, 0, 1));
  assertThat(StreamingSourceTracker.numAssertions, equalTo(1));
}
 
Example #21
Source File: Sink.java    From gcp-ingestion with Mozilla Public License 2.0
/**
 * Execute an Apache Beam pipeline and return the {@code PipelineResult}.
 */
public static PipelineResult run(SinkOptions.Parsed options) {
  final Pipeline pipeline = Pipeline.create(options);
  final List<PCollection<PubsubMessage>> failureCollections = new ArrayList<>();

  pipeline //
      .apply(options.getInputType().read(options)) //
      .apply(DecompressPayload.enabled(options.getDecompressInputPayloads())) //
      .apply(options.getOutputType().write(options)).failuresTo(failureCollections);

  PCollectionList.of(failureCollections) //
      .apply("FlattenFailureCollections", Flatten.pCollections()) //
      .apply("WriteErrorOutput", options.getErrorOutputType().write(options)) //
      .output();

  return pipeline.run();
}
 
Example #22
Source File: WatermarkManagerTest.java    From beam with Apache License 2.0
@Before
public void setup() {

  createdInts = p.apply("createdInts", Create.of(1, 2, 3));

  filtered = createdInts.apply("filtered", Filter.greaterThan(1));
  filteredTimesTwo =
      filtered.apply(
          "timesTwo",
          ParDo.of(
              new DoFn<Integer, Integer>() {
                @ProcessElement
                public void processElement(ProcessContext c) throws Exception {
                  c.output(c.element() * 2);
                }
              }));

  keyed = createdInts.apply("keyed", WithKeys.of("MyKey"));

  intsToFlatten = p.apply("intsToFlatten", Create.of(-1, 256, 65535));
  PCollectionList<Integer> preFlatten = PCollectionList.of(createdInts).and(intsToFlatten);
  flattened = preFlatten.apply("flattened", Flatten.pCollections());

  clock = MockClock.fromInstant(new Instant(1000));
  DirectGraphs.performDirectOverrides(p);
  graph = DirectGraphs.getGraph(p);

  manager = WatermarkManager.create(clock, graph, AppliedPTransform::getFullName);
  bundleFactory = ImmutableListBundleFactory.create();
}
 
Example #23
Source File: FlattenTranslatorBatch.java    From beam with Apache License 2.0
@Override
public void translateNode(
    Flatten.PCollections<T> transform, Twister2BatchTranslationContext context) {
  Collection<PValue> pcs = context.getInputs().values();
  List<BatchTSetImpl<WindowedValue<T>>> tSets = new ArrayList<>();
  BatchTSetImpl<WindowedValue<T>> unionTSet;

  if (pcs.isEmpty()) {
    final TSetEnvironment tsetEnv = context.getEnvironment();
    unionTSet =
        ((BatchTSetEnvironment) tsetEnv)
            .createSource(new Twister2EmptySource(), context.getOptions().getParallelism());
  } else {
    for (PValue pc : pcs) {
      BatchTSetImpl<WindowedValue<T>> curr = context.getInputDataSet(pc);
      tSets.add(curr);
    }

    BatchTSetImpl<WindowedValue<T>> first = tSets.remove(0);
    Collection<TSet<WindowedValue<T>>> others = new ArrayList<>(tSets);
    if (!others.isEmpty()) {
      unionTSet = first.union(others);
    } else {
      unionTSet = first;
    }
  }
  context.setOutputDataSet(context.getOutput(transform), unionTSet);
}
 
Example #24
Source File: PTransformMatchersTest.java    From beam with Apache License 2.0
@Test
public void emptyFlattenWithEmptyFlatten() {
  AppliedPTransform application =
      AppliedPTransform.of(
          "EmptyFlatten",
          Collections.emptyMap(),
          Collections.singletonMap(
              new TupleTag<Integer>(),
              PCollection.createPrimitiveOutputInternal(
                  p, WindowingStrategy.globalDefault(), IsBounded.BOUNDED, VarIntCoder.of())),
          Flatten.pCollections(),
          p);

  assertThat(PTransformMatchers.emptyFlatten().matches(application), is(true));
}
 
Example #25
Source File: AtomicInsertTest.java    From beam with Apache License 2.0
@Override
public PCollection<Row> expand(PBegin input) {
  Schema schema = Schema.of(Schema.Field.of("f0", Schema.FieldType.INT64));
  Iterable<Row> bundle =
      IntStream.range(0, size)
          .mapToObj(x -> Row.withSchema(schema).addValue((long) x).build())
          .collect(Collectors.toList());

  // make sure we get one big bundle
  return input
      .getPipeline()
      .apply(Create.<Iterable<Row>>of(bundle).withCoder(IterableCoder.of(RowCoder.of(schema))))
      .apply(Flatten.iterables())
      .setRowSchema(schema);
}
 
Example #26
Source File: BatchLoads.java    From beam with Apache License 2.0
PCollection<WriteBundlesToFiles.Result<DestinationT>> writeDynamicallyShardedFiles(
    PCollection<KV<DestinationT, ElementT>> input, PCollectionView<String> tempFilePrefix) {
  TupleTag<WriteBundlesToFiles.Result<DestinationT>> writtenFilesTag =
      new TupleTag<WriteBundlesToFiles.Result<DestinationT>>("writtenFiles") {};
  TupleTag<KV<ShardedKey<DestinationT>, ElementT>> unwrittenRecordsTag =
      new TupleTag<KV<ShardedKey<DestinationT>, ElementT>>("unwrittenRecords") {};
  PCollectionTuple writeBundlesTuple =
      input.apply(
          "WriteBundlesToFiles",
          ParDo.of(
                  new WriteBundlesToFiles<>(
                      tempFilePrefix,
                      unwrittenRecordsTag,
                      maxNumWritersPerBundle,
                      maxFileSize,
                      rowWriterFactory))
              .withSideInputs(tempFilePrefix)
              .withOutputTags(writtenFilesTag, TupleTagList.of(unwrittenRecordsTag)));
  PCollection<WriteBundlesToFiles.Result<DestinationT>> writtenFiles =
      writeBundlesTuple
          .get(writtenFilesTag)
          .setCoder(WriteBundlesToFiles.ResultCoder.of(destinationCoder));
  PCollection<KV<ShardedKey<DestinationT>, ElementT>> unwrittenRecords =
      writeBundlesTuple
          .get(unwrittenRecordsTag)
          .setCoder(KvCoder.of(ShardedKeyCoder.of(destinationCoder), elementCoder));

  // If the bundles contain too many output tables to be written inline to files (due to memory
  // limits), any unwritten records will be spilled to the unwrittenRecordsTag PCollection.
  // Group these records by key, and write the files after grouping. Since the record is grouped
  // by key, we can ensure that only one file is open at a time in each bundle.
  PCollection<WriteBundlesToFiles.Result<DestinationT>> writtenFilesGrouped =
      writeShardedRecords(unwrittenRecords, tempFilePrefix);

  // PCollection of filename, file byte size, and table destination.
  return PCollectionList.of(writtenFiles)
      .and(writtenFilesGrouped)
      .apply("FlattenFiles", Flatten.pCollections())
      .setCoder(WriteBundlesToFiles.ResultCoder.of(destinationCoder));
}
 
Example #27
Source File: KafkaIOTest.java    From beam with Apache License 2.0
@Test
public void testUnboundedSourceSplits() throws Exception {

  int numElements = 1000;
  int numSplits = 10;

  // Coders must be specified explicitly here due to the way the transform
  // is used in the test.
  UnboundedSource<KafkaRecord<Integer, Long>, ?> initial =
      mkKafkaReadTransform(numElements, null)
          .withKeyDeserializerAndCoder(IntegerDeserializer.class, BigEndianIntegerCoder.of())
          .withValueDeserializerAndCoder(LongDeserializer.class, BigEndianLongCoder.of())
          .makeSource();

  List<? extends UnboundedSource<KafkaRecord<Integer, Long>, ?>> splits =
      initial.split(numSplits, p.getOptions());
  assertEquals("Expected exact splitting", numSplits, splits.size());

  long elementsPerSplit = numElements / numSplits;
  assertEquals("Expected even splits", numElements, elementsPerSplit * numSplits);
  PCollectionList<Long> pcollections = PCollectionList.empty(p);
  for (int i = 0; i < splits.size(); ++i) {
    pcollections =
        pcollections.and(
            p.apply("split" + i, Read.from(splits.get(i)).withMaxNumRecords(elementsPerSplit))
                .apply("Remove Metadata " + i, ParDo.of(new RemoveKafkaMetadata<>()))
                .apply("collection " + i, Values.create()));
  }
  PCollection<Long> input = pcollections.apply(Flatten.pCollections());

  addCountingAsserts(input, numElements);
  p.run();
}
 
Example #28
Source File: UnionTranslator.java    From beam with Apache License 2.0
@Override
public PCollection<InputT> translate(Union<InputT> operator, PCollectionList<InputT> inputs) {
  final TypeDescriptor<InputT> outputType = operator.getOutputType().orElse(null);
  return operator
      .getName()
      .map(name -> inputs.apply(name, Flatten.pCollections()).setTypeDescriptor(outputType))
      .orElseGet(() -> inputs.apply(Flatten.pCollections()).setTypeDescriptor(outputType));
}
 
Example #29
Source File: FlattenTest.java    From beam with Apache License 2.0
@Test
public void testFlatten() {
  PCollection<Integer> input1 = pipeline.apply(Create.of(1, 2, 3, 4, 5, 6, 7, 8, 9, 10));
  PCollection<Integer> input2 = pipeline.apply(Create.of(11, 12, 13, 14, 15, 16, 17, 18, 19, 20));
  PCollectionList<Integer> pcs = PCollectionList.of(input1).and(input2);
  PCollection<Integer> input = pcs.apply(Flatten.pCollections());
  PAssert.that(input)
      .containsInAnyOrder(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20);
  pipeline.run();
}
 
Example #30
Source File: PTransformMatchers.java    From beam with Apache License 2.0
/**
 * A {@link PTransformMatcher} which matches a {@link Flatten.PCollections} which consumes no
 * input {@link PCollection PCollections}.
 */
public static PTransformMatcher emptyFlatten() {
  return new PTransformMatcher() {
    @Override
    public boolean matches(AppliedPTransform<?, ?, ?> application) {
      return (application.getTransform() instanceof Flatten.PCollections)
          && application.getInputs().isEmpty();
    }

    @Override
    public String toString() {
      return MoreObjects.toStringHelper("EmptyFlattenMatcher").toString();
    }
  };
}