org.apache.beam.sdk.values.PCollectionList Java Examples

The following examples show how to use org.apache.beam.sdk.values.PCollectionList. Each example is drawn from an open source project; the source file and license are noted in the header above it.
Example #1
Source File: SetsTest.java    From beam with Apache License 2.0
@Test
@Category(NeedsRunner.class)
public void testIntersectionCollectionList() {

  PCollection<String> third = p.apply("third", Create.of(Arrays.asList("b", "b", "c", "f")));
  PCollection<Row> thirdRows = p.apply("thirdRows", Create.of(toRows("b", "b", "c", "f")));

  PAssert.that(
          PCollectionList.of(first)
              .and(second)
              .and(third)
              .apply("stringsCols", Sets.intersectDistinct()))
      .containsInAnyOrder("b", "c");

  PCollection<Row> results =
      PCollectionList.of(firstRows)
          .and(secondRows)
          .and(thirdRows)
          .apply("rowCols", Sets.intersectDistinct());

  PAssert.that(results).containsInAnyOrder(toRows("b", "c"));

  assertEquals(schema, results.getSchema());

  p.run();
}
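Examples #1, #18, and #20 below all rely on fixtures declared elsewhere in SetsTest.java: first, second, firstRows, secondRows, schema, and toRows. The sketch below reconstructs them from the three tests' assertions; the schema field name is an assumption, and imports are elided as in the surrounding examples.

// Reconstructed fixtures, consistent with the assertions in Examples #1, #18, and #20.
// The field name "alpha" is an assumption about the actual test code.
Schema schema = Schema.builder().addStringField("alpha").build();

List<Row> toRows(String... values) {
  return Arrays.stream(values)
      .map(v -> Row.withSchema(schema).addValues(v).build())
      .collect(Collectors.toList());
}

PCollection<String> first =
    p.apply("first", Create.of("a", "a", "a", "b", "b", "c", "d", "d", "g", "g", "h", "h"));
PCollection<String> second =
    p.apply("second", Create.of("a", "a", "b", "b", "b", "c", "d", "d", "e", "e", "f", "f"));
PCollection<Row> firstRows =
    p.apply("firstRows", Create.of(toRows("a", "a", "a", "b", "b", "c", "d", "d", "g", "g", "h", "h")));
PCollection<Row> secondRows =
    p.apply("secondRows", Create.of(toRows("a", "a", "b", "b", "b", "c", "d", "d", "e", "e", "f", "f")));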
 
Example #2
Source File: RepublishPerNamespace.java    From gcp-ingestion with Mozilla Public License 2.0
@Override
public PDone expand(PCollection<PubsubMessage> input) {
  List<Destination> destinations = baseOptions.getPerNamespaceDestinations().entrySet().stream()
      .map(entry -> new Destination(entry.getKey(), entry.getValue()))
      .collect(Collectors.toList());
  int numDestinations = destinations.size();
  int numPartitions = numDestinations + 1;
  PCollectionList<PubsubMessage> partitioned = input.apply("PartitionByNamespace",
      Partition.of(numPartitions, new PartitionFn(destinations)));

  for (int i = 0; i < numDestinations; i++) {
    Destination destination = destinations.get(i);
    RepublisherOptions.Parsed opts = baseOptions.as(RepublisherOptions.Parsed.class);
    opts.setOutput(StaticValueProvider.of(destination.dest));
    String name = String.join("_", "republish", destination.namespace);
    partitioned.get(i).apply(name, opts.getOutputType().write(opts));
  }

  return PDone.in(input.getPipeline());
}
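The PartitionFn referenced above is defined elsewhere in RepublishPerNamespace.java. A plausible sketch, assuming each message carries its namespace in an attribute (the attribute name "document_namespace" and the matching logic are assumptions, not the actual gcp-ingestion implementation):

private static class PartitionFn implements Partition.PartitionFn<PubsubMessage> {

  private final List<Destination> destinations;

  PartitionFn(List<Destination> destinations) {
    this.destinations = destinations;
  }

  @Override
  public int partitionFor(PubsubMessage message, int numPartitions) {
    // Assumed attribute name; route the message to the first matching destination.
    String namespace = message.getAttribute("document_namespace");
    for (int i = 0; i < destinations.size(); i++) {
      if (destinations.get(i).namespace.equals(namespace)) {
        return i;
      }
    }
    // Unmatched messages land in the extra, final partition (hence numDestinations + 1 above).
    return numPartitions - 1;
  }
}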
 
Example #3
Source File: BeamSqlRelUtils.java    From beam with Apache License 2.0
/**
 * A {@link BeamRelNode} is a recursive structure; the {@code BeamQueryPlanner} visits it with a
 * depth-first search (DFS) algorithm.
 */
static PCollection<Row> toPCollection(
    Pipeline pipeline, BeamRelNode node, Map<Integer, PCollection<Row>> cache) {
  PCollection<Row> output = cache.get(node.getId());
  if (output != null) {
    return output;
  }

  String name = node.getClass().getSimpleName() + "_" + node.getId();
  PCollectionList<Row> input = buildPCollectionList(node.getPCollectionInputs(), pipeline, cache);
  PTransform<PCollectionList<Row>, PCollection<Row>> transform = node.buildPTransform();
  output = Pipeline.applyTransform(name, input, transform);

  cache.put(node.getId(), output);
  return output;
}
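The buildPCollectionList helper is not shown above. A minimal sketch, assuming it simply recurses into toPCollection for each input rel and collects the results (a reconstruction, not necessarily the exact Beam implementation):

static PCollectionList<Row> buildPCollectionList(
    List<RelNode> inputRels, Pipeline pipeline, Map<Integer, PCollection<Row>> cache) {
  if (inputRels.isEmpty()) {
    return PCollectionList.empty(pipeline);
  }
  List<PCollection<Row>> inputs = new ArrayList<>();
  for (RelNode input : inputRels) {
    // Reuses the memoizing conversion above, so shared subtrees are translated once.
    inputs.add(toPCollection(pipeline, BeamSqlRelUtils.getBeamRelInput(input), cache));
  }
  return PCollectionList.of(inputs);
}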
 
Example #4
Source File: FlattenTest.java    From beam with Apache License 2.0
@Test
@Category({ValidatesRunner.class, UsesSideInputs.class})
public void testEmptyFlattenAsSideInput() {
  final PCollectionView<Iterable<String>> view =
      PCollectionList.<String>empty(p)
          .apply(Flatten.pCollections())
          .setCoder(StringUtf8Coder.of())
          .apply(View.asIterable());

  PCollection<String> output =
      p.apply(Create.of((Void) null).withCoder(VoidCoder.of()))
          .apply(
              ParDo.of(
                      new DoFn<Void, String>() {
                        @ProcessElement
                        public void processElement(ProcessContext c) {
                          for (String side : c.sideInput(view)) {
                            c.output(side);
                          }
                        }
                      })
                  .withSideInputs(view));

  PAssert.that(output).empty();
  p.run();
}
 
Example #5
Source File: BeamTableFunctionScanRel.java    From beam with Apache License 2.0
@Override
public PCollection<Row> expand(PCollectionList<Row> input) {
  checkArgument(
      input.size() == 1,
      "Wrong number of inputs for %s, expected 1 input but received: %s",
      BeamTableFunctionScanRel.class.getSimpleName(),
      input);
  String operatorName = ((RexCall) getCall()).getOperator().getName();
  checkArgument(
      tvfToPTransformMap.containsKey(operatorName),
      "Only %s table-valued functions are supported. Current operator: %s",
      tvfToPTransformMap.keySet(),
      operatorName);

  return tvfToPTransformMap.get(operatorName).toPTransform(((RexCall) getCall()), input.get(0));
}
 
Example #6
Source File: BeamUncollectRel.java    From beam with Apache License 2.0
@Override
public PCollection<Row> expand(PCollectionList<Row> pinput) {
  checkArgument(
      pinput.size() == 1,
      "Wrong number of inputs for %s: %s",
      BeamUncollectRel.class.getSimpleName(),
      pinput);
  PCollection<Row> upstream = pinput.get(0);

  // Each row of the input contains a single array of things to be emitted; Calcite knows
  // what the row looks like
  Schema outputSchema = CalciteUtils.toSchema(getRowType());

  PCollection<Row> uncollected =
      upstream.apply(ParDo.of(new UncollectDoFn(outputSchema))).setRowSchema(outputSchema);

  return uncollected;
}
 
Example #7
Source File: Sink.java    From gcp-ingestion with Mozilla Public License 2.0
/**
 * Execute an Apache Beam pipeline and return the {@code PipelineResult}.
 */
public static PipelineResult run(SinkOptions.Parsed options) {
  final Pipeline pipeline = Pipeline.create(options);
  final List<PCollection<PubsubMessage>> failureCollections = new ArrayList<>();

  pipeline //
      .apply(options.getInputType().read(options)) //
      .apply(DecompressPayload.enabled(options.getDecompressInputPayloads())) //
      .apply(options.getOutputType().write(options)).failuresTo(failureCollections);

  PCollectionList.of(failureCollections) //
      .apply("FlattenFailureCollections", Flatten.pCollections()) //
      .apply("WriteErrorOutput", options.getErrorOutputType().write(options)) //
      .output();

  return pipeline.run();
}
 
Example #8
Source File: Partition.java    From beam with Apache License 2.0
@Override
public PCollectionList<T> expand(PCollection<T> in) {
  final TupleTagList outputTags = partitionDoFn.getOutputTags();

  PCollectionTuple outputs =
      in.apply(
          ParDo.of(partitionDoFn)
              .withOutputTags(new TupleTag<Void>() {}, outputTags)
              .withSideInputs(partitionDoFn.getSideInputs()));

  PCollectionList<T> pcs = PCollectionList.empty(in.getPipeline());
  Coder<T> coder = in.getCoder();

  for (TupleTag<?> outputTag : outputTags.getAll()) {
    // All the tuple tags are actually TupleTag<T>
    // And all the collections are actually PCollection<T>
    @SuppressWarnings("unchecked")
    TupleTag<T> typedOutputTag = (TupleTag<T>) outputTag;
    pcs = pcs.and(outputs.get(typedOutputTag).setCoder(coder));
  }
  return pcs;
}
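For context, a minimal usage sketch of the transform expanded above (names are hypothetical): partitioning integers by residue modulo the partition count yields one PCollection per residue class.

// Illustrative usage of Partition; names are hypothetical.
PCollection<Integer> numbers = p.apply(Create.of(1, 2, 3, 4, 5, 6));
PCollectionList<Integer> byResidue =
    numbers.apply(Partition.of(3, (Integer n, int numPartitions) -> n % numPartitions));
PCollection<Integer> divisibleByThree = byResidue.get(0); // contains 3 and 6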
 
Example #9
Source File: Window.java    From beam with Apache License 2.0
@Override
public PCollection<T> expand(PCollection<T> input) {
  applicableTo(input);

  WindowingStrategy<?, ?> outputStrategy =
      getOutputStrategyInternal(input.getWindowingStrategy());

  if (getWindowFn() == null) {
    // A new PCollection must be created in case input is reused in a different location as the
    // two PCollections will, in general, have a different windowing strategy.
    return PCollectionList.of(input)
        .apply(Flatten.pCollections())
        .setWindowingStrategyInternal(outputStrategy);
  } else {
    // This is the AssignWindows primitive
    return input.apply(new Assign<>(this, outputStrategy));
  }
}
 
Example #10
Source File: EmptyFlattenAsCreateFactoryTest.java    From beam with Apache License 2.0
@Test
@Category(NeedsRunner.class)
public void testOverride() {
  PCollectionList<Long> empty = PCollectionList.empty(pipeline);
  PCollection<Long> emptyFlattened =
      empty.apply(
          factory
              .getReplacementTransform(
                  AppliedPTransform.of(
                      "nonEmptyInput",
                      Collections.emptyMap(),
                      Collections.emptyMap(),
                      Flatten.pCollections(),
                      pipeline))
              .getTransform());
  PAssert.that(emptyFlattened).empty();
  pipeline.run();
}
 
Example #11
Source File: FlattenTest.java    From beam with Apache License 2.0
@Test
@Category(ValidatesRunner.class)
public void testFlattenWithDifferentInputAndOutputCoders2() {
  // This test exists to prevent a regression in Dataflow. It tests a
  // GroupByKey followed by a Flatten with an SDK-specific output coder.
  PCollection<KV<String, Iterable<String>>> flattenInput =
      p.apply(Create.of(LINES))
          .apply(WithKeys.of("a"))
          .setCoder(KvCoder.of(StringUtf8Coder.of(), StringUtf8Coder.of()))
          .apply(GroupByKey.create());
  PCollection<String> output =
      PCollectionList.of(flattenInput)
          .apply(Flatten.pCollections())
          .setCoder(SerializableCoder.of(new TypeDescriptor<KV<String, Iterable<String>>>() {}))
          .apply(Values.create())
          .setCoder(IterableCoder.of(StringUtf8Coder.of()))
          .apply(
              FlatMapElements.into(TypeDescriptors.strings())
                  .via((Iterable<String> values) -> values));
  PAssert.that(output).containsInAnyOrder(LINES);
  p.run();
}
 
Example #12
Source File: FlattenTranslatorBatch.java    From beam with Apache License 2.0
@Override
public void translateTransform(
    PTransform<PCollectionList<T>, PCollection<T>> transform, TranslationContext context) {
  Collection<PValue> pcollectionList = context.getInputs().values();
  Dataset<WindowedValue<T>> result = null;
  if (pcollectionList.isEmpty()) {
    result = context.emptyDataset();
  } else {
    for (PValue pValue : pcollectionList) {
      checkArgument(
          pValue instanceof PCollection,
          "Got non-PCollection input to flatten: %s of type %s",
          pValue,
          pValue.getClass().getSimpleName());
      @SuppressWarnings("unchecked")
      PCollection<T> pCollection = (PCollection<T>) pValue;
      Dataset<WindowedValue<T>> current = context.getDataset(pCollection);
      if (result == null) {
        result = current;
      } else {
        result = result.union(current);
      }
    }
  }
  context.putDataset(context.getOutput(), result);
}
 
Example #13
Source File: TaskTest.java    From beam with Apache License 2.0
@Test
public void groupByKey() {
  PCollection<Integer> numbers =
      testPipeline.apply(
          Create.of(1, 2, 3, 4, 5, 100, 110, 150, 250)
      );

  PCollectionList<Integer> results = Task.applyTransform(numbers);

  PAssert.that(results.get(0))
      .containsInAnyOrder(110, 150, 250);

  PAssert.that(results.get(1))
      .containsInAnyOrder(1, 2, 3, 4, 5, 100);

  testPipeline.run().waitUntilFinish();
}
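Task.applyTransform is not shown in this snippet. Given the assertions (values above 100 in partition 0, the rest in partition 1), a plausible reconstruction is a two-way Partition:

// Hypothetical reconstruction of Task.applyTransform, consistent with the assertions above.
static PCollectionList<Integer> applyTransform(PCollection<Integer> input) {
  return input.apply(
      Partition.of(2, (Integer number, int numPartitions) -> number > 100 ? 0 : 1));
}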
 
Example #14
Source File: ExpansionService.java    From beam with Apache License 2.0
default Map<String, PCollection<?>> extractOutputs(OutputT output) {
  if (output instanceof PDone) {
    return Collections.emptyMap();
  } else if (output instanceof PCollection) {
    return ImmutableMap.of("output", (PCollection<?>) output);
  } else if (output instanceof PCollectionTuple) {
    return ((PCollectionTuple) output)
        .getAll().entrySet().stream()
            .collect(Collectors.toMap(entry -> entry.getKey().getId(), Map.Entry::getValue));
  } else if (output instanceof PCollectionList<?>) {
    PCollectionList<?> listOutput = (PCollectionList<?>) output;
    return IntStream.range(0, listOutput.size())
        .boxed()
        .collect(Collectors.toMap(Object::toString, listOutput::get));
  } else {
    throw new UnsupportedOperationException("Unknown output type: " + output.getClass());
  }
}
 
Example #15
Source File: Combine.java    From beam with Apache License 2.0
private PCollection<OutputT> insertDefaultValueIfEmpty(PCollection<OutputT> maybeEmpty) {
  final PCollectionView<Iterable<OutputT>> maybeEmptyView = maybeEmpty.apply(View.asIterable());

  final OutputT defaultValue = fn.defaultValue();
  PCollection<OutputT> defaultIfEmpty =
      maybeEmpty
          .getPipeline()
          .apply("CreateVoid", Create.of((Void) null).withCoder(VoidCoder.of()))
          .apply(
              "ProduceDefault",
              ParDo.of(
                      new DoFn<Void, OutputT>() {
                        @ProcessElement
                        public void processElement(ProcessContext c) {
                          Iterator<OutputT> combined = c.sideInput(maybeEmptyView).iterator();
                          if (!combined.hasNext()) {
                            c.output(defaultValue);
                          }
                        }
                      })
                  .withSideInputs(maybeEmptyView))
          .setCoder(maybeEmpty.getCoder())
          .setWindowingStrategyInternal(maybeEmpty.getWindowingStrategy());

  return PCollectionList.of(maybeEmpty).and(defaultIfEmpty).apply(Flatten.pCollections());
}
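This is the machinery behind defaults for global combines: flattening the possibly-empty result with a side-input-driven default producer guarantees exactly one output. A brief usage sketch (illustrative, assuming default global windowing):

// Illustrative: Combine.globally emits the combine fn's default value on empty input.
PCollection<Integer> empty = p.apply(Create.empty(VarIntCoder.of()));
PCollection<Integer> sum = empty.apply(Combine.globally(Sum.ofIntegers()));
PAssert.that(sum).containsInAnyOrder(0); // the default for an empty sum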
 
Example #16
Source File: EmptyFlattenAsCreateFactoryTest.java    From beam with Apache License 2.0
@Test
public void getInputNonEmptyThrows() {
  PCollectionList<Long> nonEmpty =
      PCollectionList.of(pipeline.apply("unbounded", GenerateSequence.from(0)))
          .and(pipeline.apply("bounded", GenerateSequence.from(0).to(100)));
  thrown.expect(IllegalArgumentException.class);
  thrown.expectMessage(nonEmpty.expand().toString());
  thrown.expectMessage(EmptyFlattenAsCreateFactory.class.getSimpleName());
  factory.getReplacementTransform(
      AppliedPTransform.of(
          "nonEmptyInput",
          nonEmpty.expand(),
          Collections.emptyMap(),
          Flatten.pCollections(),
          pipeline));
}
 
Example #17
Source File: BeamPushDownIOSourceRel.java    From beam with Apache License 2.0
@Override
public PCollection<Row> expand(PCollectionList<Row> input) {
  checkArgument(
      input.size() == 0,
      "Should not have received input for %s: %s",
      BeamIOSourceRel.class.getSimpleName(),
      input);

  final PBegin begin = input.getPipeline().begin();
  final BeamSqlTable beamSqlTable = BeamPushDownIOSourceRel.this.getBeamSqlTable();

  if (usedFields.isEmpty() && tableFilters instanceof DefaultTableFilter) {
    return beamSqlTable.buildIOReader(begin);
  }

  final Schema newBeamSchema = CalciteUtils.toSchema(getRowType());
  return beamSqlTable
      .buildIOReader(begin, tableFilters, usedFields)
      .setRowSchema(newBeamSchema);
}
 
Example #18
Source File: SetsTest.java    From beam with Apache License 2.0
@Test
@Category(NeedsRunner.class)
public void testExceptCollectionList() {
  PCollection<String> third = p.apply("third", Create.of(Arrays.asList("a", "b", "b", "g", "g")));
  PCollection<Row> thirdRows = p.apply("thirdRows", Create.of(toRows("a", "b", "b", "g", "g")));

  PAssert.that(
          PCollectionList.of(first)
              .and(second)
              .and(third)
              .apply("stringsCols", Sets.exceptDistinct()))
      .containsInAnyOrder("h");

  PCollection<Row> results =
      PCollectionList.of(firstRows)
          .and(secondRows)
          .and(thirdRows)
          .apply("rowCols", Sets.exceptDistinct());

  PAssert.that(results).containsInAnyOrder(toRows("h"));

  assertEquals(schema, results.getSchema());

  p.run();
}
 
Example #19
Source File: UnconsumedReadsTest.java    From beam with Apache License 2.0
@Test
public void doesNotConsumeAlreadyConsumedRead() {
  Unbounded<Long> transform = Read.from(CountingSource.unbounded());
  final PCollection<Long> output = pipeline.apply(transform);
  final Flatten.PCollections<Long> consumer = Flatten.pCollections();
  PCollectionList.of(output).apply(consumer);
  UnconsumedReads.ensureAllReadsConsumed(pipeline);
  pipeline.traverseTopologically(
      new PipelineVisitor.Defaults() {
        @Override
        public void visitPrimitiveTransform(Node node) {
          // The output should only be consumed by a single consumer
          if (node.getInputs().values().contains(output)) {
            assertThat(node.getTransform(), Matchers.is(consumer));
          }
        }
      });
}
 
Example #20
Source File: SetsTest.java    From beam with Apache License 2.0
@Test
@Category(NeedsRunner.class)
public void testUnionAllCollections() {

  PCollection<String> third = p.apply("third", Create.of(Arrays.asList("a", "b", "b", "k", "k")));
  PCollection<Row> thirdRows = p.apply("thirdRows", Create.of(toRows("a", "b", "b", "k", "k")));

  PAssert.that(
          PCollectionList.of(first).and(second).and(third).apply("stringsCols", Sets.unionAll()))
      .containsInAnyOrder(
          "a", "a", "a", "a", "a", "a", "b", "b", "b", "b", "b", "b", "b", "c", "c", "d", "d",
          "d", "d", "e", "e", "f", "f", "g", "g", "h", "h", "k", "k");

  PCollection<Row> results =
      PCollectionList.of(firstRows)
          .and(secondRows)
          .and(thirdRows)
          .apply("rowCols", Sets.unionAll());

  PAssert.that(results)
      .containsInAnyOrder(
          toRows(
              "a", "a", "a", "a", "a", "a", "b", "b", "b", "b", "b", "b", "b", "c", "c", "d", "d",
              "d", "d", "e", "e", "f", "f", "g", "g", "h", "h", "k", "k"));

  assertEquals(schema, results.getSchema());

  p.run();
}
 
Example #21
Source File: BeamSideInputLookupJoinRel.java    From beam with Apache License 2.0
@Override
public PCollection<Row> expand(PCollectionList<Row> pinput) {
  Schema schema = CalciteUtils.toSchema(getRowType());

  BeamRelNode seekableRel =
      BeamSqlRelUtils.getBeamRelInput(getInput(seekableInputIndex().get()));
  BeamRelNode nonSeekableRel =
      BeamSqlRelUtils.getBeamRelInput(getInput(nonSeekableInputIndex().get()));

  // Offset field references according to which table is on the left
  int factColOffset =
      nonSeekableInputIndex().get() == 0
          ? 0
          : CalciteUtils.toSchema(seekableRel.getRowType()).getFieldCount();
  int lkpColOffset =
      seekableInputIndex().get() == 0
          ? 0
          : CalciteUtils.toSchema(nonSeekableRel.getRowType()).getFieldCount();

  // HACK: if the input is an immediate instance of a seekable IO, we can do lookups
  // so we ignore the PCollection
  BeamIOSourceRel seekableInput = (BeamIOSourceRel) seekableRel;
  BeamSqlSeekableTable seekableTable = (BeamSqlSeekableTable) seekableInput.getBeamSqlTable();

  // getPCollectionInputs() ensures that there is only one and it is the non-seekable input
  PCollection<Row> nonSeekableInput = pinput.get(0);

  return nonSeekableInput
      .apply(
          "join_as_lookup",
          new BeamJoinTransforms.JoinAsLookup(
              condition,
              seekableTable,
              CalciteUtils.toSchema(seekableInput.getRowType()),
              schema,
              factColOffset,
              lkpColOffset))
      .setRowSchema(schema);
}
 
Example #22
Source File: BeamValuesRel.java    From beam with Apache License 2.0
@Override
public PCollection<Row> expand(PCollectionList<Row> pinput) {
  checkArgument(
      pinput.size() == 0,
      "Should not have received input for %s: %s",
      BeamValuesRel.class.getSimpleName(),
      pinput);

  Schema schema = CalciteUtils.toSchema(getRowType());
  List<Row> rows = tuples.stream().map(tuple -> tupleToRow(schema, tuple)).collect(toList());
  return pinput.getPipeline().begin().apply(Create.of(rows).withRowSchema(schema));
}
 
Example #23
Source File: BatchLoads.java    From beam with Apache License 2.0
PCollection<WriteBundlesToFiles.Result<DestinationT>> writeDynamicallyShardedFiles(
    PCollection<KV<DestinationT, ElementT>> input, PCollectionView<String> tempFilePrefix) {
  TupleTag<WriteBundlesToFiles.Result<DestinationT>> writtenFilesTag =
      new TupleTag<WriteBundlesToFiles.Result<DestinationT>>("writtenFiles") {};
  TupleTag<KV<ShardedKey<DestinationT>, ElementT>> unwrittenRecordsTag =
      new TupleTag<KV<ShardedKey<DestinationT>, ElementT>>("unwrittenRecords") {};
  PCollectionTuple writeBundlesTuple =
      input.apply(
          "WriteBundlesToFiles",
          ParDo.of(
                  new WriteBundlesToFiles<>(
                      tempFilePrefix,
                      unwrittenRecordsTag,
                      maxNumWritersPerBundle,
                      maxFileSize,
                      rowWriterFactory))
              .withSideInputs(tempFilePrefix)
              .withOutputTags(writtenFilesTag, TupleTagList.of(unwrittenRecordsTag)));
  PCollection<WriteBundlesToFiles.Result<DestinationT>> writtenFiles =
      writeBundlesTuple
          .get(writtenFilesTag)
          .setCoder(WriteBundlesToFiles.ResultCoder.of(destinationCoder));
  PCollection<KV<ShardedKey<DestinationT>, ElementT>> unwrittenRecords =
      writeBundlesTuple
          .get(unwrittenRecordsTag)
          .setCoder(KvCoder.of(ShardedKeyCoder.of(destinationCoder), elementCoder));

  // If the bundles contain too many output tables to be written inline to files (due to memory
  // limits), any unwritten records will be spilled to the unwrittenRecordsTag PCollection.
  // Group these records by key, and write the files after grouping. Since the record is grouped
  // by key, we can ensure that only one file is open at a time in each bundle.
  PCollection<WriteBundlesToFiles.Result<DestinationT>> writtenFilesGrouped =
      writeShardedRecords(unwrittenRecords, tempFilePrefix);

  // PCollection of filename, file byte size, and table destination.
  return PCollectionList.of(writtenFiles)
      .and(writtenFilesGrouped)
      .apply("FlattenFiles", Flatten.pCollections())
      .setCoder(WriteBundlesToFiles.ResultCoder.of(destinationCoder));
}
 
Example #24
Source File: AssignEventTime.java    From beam with Apache License 2.0
@Override
public PCollection<InputT> expand(PCollectionList<InputT> inputs) {
  final PCollection<InputT> input = PCollectionLists.getOnlyElement(inputs);
  return FlatMap.named(getName().orElse(null))
      .of(input)
      .using(
          (InputT element, Collector<InputT> coll) -> coll.collect(element),
          input.getTypeDescriptor())
      .eventTimeBy(getEventTimeExtractor(), allowedTimestampSkew)
      .output();
}
 
Example #25
Source File: EmptyFlattenAsCreateFactoryTest.java    From beam with Apache License 2.0
@Test
public void getInputEmptySucceeds() {
  PTransformReplacement<PCollectionList<Long>, PCollection<Long>> replacement =
      factory.getReplacementTransform(
          AppliedPTransform.of(
              "nonEmptyInput",
              Collections.emptyMap(),
              Collections.emptyMap(),
              Flatten.pCollections(),
              pipeline));
  assertThat(replacement.getInput().getAll(), emptyIterable());
}
 
Example #26
Source File: MapElements.java    From beam with Apache License 2.0
@Override
public PCollection<OutputT> expand(PCollectionList<InputT> inputs) {
  return FlatMap.named(getName().orElse(null))
      .of(PCollectionLists.getOnlyElement(inputs))
      .using(
          (InputT elem, Collector<OutputT> coll) ->
              coll.collect(getMapper().apply(elem, coll.asContext())),
          getOutputType().orElse(null))
      .output();
}
 
Example #27
Source File: BeamSetOperatorRelBase.java    From beam with Apache License 2.0
@Override
public PCollection<Row> expand(PCollectionList<Row> inputs) {
  checkArgument(
      inputs.size() == 2,
      "Wrong number of arguments to %s: %s",
      beamRelNode.getClass().getSimpleName(),
      inputs);
  PCollection<Row> leftRows = inputs.get(0);
  PCollection<Row> rightRows = inputs.get(1);

  WindowFn leftWindow = leftRows.getWindowingStrategy().getWindowFn();
  WindowFn rightWindow = rightRows.getWindowingStrategy().getWindowFn();
  if (!leftWindow.isCompatible(rightWindow)) {
    throw new IllegalArgumentException(
        "Inputs of "
            + opType
            + " have different windowing strategies: "
            + leftWindow
            + " vs. "
            + rightWindow);
  }

  // TODO: We may want to preaggregate the counts first using Group instead of calling CoGroup
  // and measuring the iterable size. If on average there are duplicates in the input, this will
  // be faster.
  final String lhsTag = "lhs";
  final String rhsTag = "rhs";
  PCollection<Row> joined =
      PCollectionTuple.of(lhsTag, leftRows, rhsTag, rightRows)
          .apply("CoGroup", CoGroup.join(By.fieldNames("*")));
  return joined
      .apply(
          "FilterResults",
          ParDo.of(
              new BeamSetOperatorsTransforms.SetOperatorFilteringDoFn(
                  lhsTag, rhsTag, opType, all)))
      .setRowSchema(joined.getSchema().getField("key").getType().getRowSchema());
}
 
Example #28
Source File: SingleInputOutputOverrideFactoryTest.java    From beam with Apache License 2.0
@Test
public void testMapOutputsMultipleOriginalOutputsFails() {
  PCollection<Integer> input = pipeline.apply(Create.of(1, 2, 3));
  PCollection<Integer> output = input.apply("Map", MapElements.via(fn));
  PCollection<Integer> reappliedOutput = input.apply("ReMap", MapElements.via(fn));
  thrown.expect(IllegalArgumentException.class);
  factory.mapOutputs(
      PCollectionList.of(output).and(input).and(reappliedOutput).expand(), reappliedOutput);
}
 
Example #29
Source File: FlattenTest.java    From beam with Apache License 2.0
@Test
@Category(ValidatesRunner.class)
public void testFlattenInputMultipleCopies() {
  int count = 5;
  PCollection<Long> longs = p.apply("mkLines", GenerateSequence.from(0).to(count));
  PCollection<Long> biggerLongs =
      p.apply("mkOtherLines", GenerateSequence.from(0).to(count))
          .apply(
              MapElements.via(
                  new SimpleFunction<Long, Long>() {
                    @Override
                    public Long apply(Long input) {
                      return input + 10L;
                    }
                  }));

  PCollection<Long> flattened =
      PCollectionList.of(longs).and(longs).and(biggerLongs).apply(Flatten.pCollections());

  List<Long> expectedLongs = new ArrayList<>();
  for (int i = 0; i < count; i++) {
    // The duplicated input
    expectedLongs.add((long) i);
    expectedLongs.add((long) i);
    // The bigger longs
    expectedLongs.add(i + 10L);
  }
  PAssert.that(flattened).containsInAnyOrder(expectedLongs);

  p.run();
}
 
Example #30
Source File: Distinct.java    From beam with Apache License 2.0
@Override
public PCollection<InputT> expand(PCollectionList<InputT> inputs) {
  PCollection<InputT> tmp = PCollectionLists.getOnlyElement(inputs);
  PCollection<InputT> input =
      getWindow()
          .map(
              w -> {
                PCollection<InputT> ret = tmp.apply(w);
                ret.setTypeDescriptor(tmp.getTypeDescriptor());
                return ret;
              })
          .orElse(tmp);
  if (!projected) {
    PCollection<KV<InputT, Void>> distinct =
        ReduceByKey.named(getName().orElse(null))
            .of(input)
            .keyBy(e -> e, input.getTypeDescriptor())
            .valueBy(e -> null, TypeDescriptors.nulls())
            .combineBy(e -> null, TypeDescriptors.nulls())
            .output();
    return MapElements.named(getName().orElse("") + "::extract-keys")
        .of(distinct)
        .using(KV::getKey, input.getTypeDescriptor())
        .output();
  }
  UnaryFunction<PCollection<InputT>, PCollection<InputT>> transformFn = getTransformFn();
  return transformFn.apply(input);
}