Java Code Examples for org.apache.beam.sdk.values.PCollection#isBounded()

The following examples show how to use org.apache.beam.sdk.values.PCollection#isBounded(). Each example is taken from an open-source project; the source file and originating project are noted above each snippet.
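Before the project examples, here is a minimal standalone sketch of the pattern they all share: call isBounded() on a PCollection and branch on the returned PCollection.IsBounded value. The input path and the log messages are illustrative assumptions, not taken from any project below.

import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.io.TextIO;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.values.PCollection;

public class IsBoundedCheck {
  public static void main(String[] args) {
    Pipeline pipeline = Pipeline.create(PipelineOptionsFactory.fromArgs(args).create());

    // TextIO.read() produces a BOUNDED PCollection; the path is a placeholder.
    PCollection<String> lines = pipeline.apply(TextIO.read().from("/tmp/input.txt"));

    // isBounded() reports whether the collection is BOUNDED (batch) or UNBOUNDED (streaming).
    if (lines.isBounded() == PCollection.IsBounded.BOUNDED) {
      System.out.println("Bounded input: batch-style writes and global aggregations are safe.");
    } else {
      System.out.println("Unbounded input: window/trigger before aggregating.");
    }

    pipeline.run().waitUntilFinish();
  }
}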
Example 1
Source File: PipelineTranslator.java    From incubator-nemo with Apache License 2.0
/**
 * @param beamNode the beam node to be translated.
 * @param pipeline the pipeline containing the beam node.
 * @return true if the main input is bounded.
 */
private static boolean isMainInputBounded(final TransformHierarchy.Node beamNode, final Pipeline pipeline) {
  final AppliedPTransform pTransform = beamNode.toAppliedPTransform(pipeline);
  final PCollection<?> mainInput = (PCollection<?>)
    Iterables.getOnlyElement(TransformInputs.nonAdditionalInputs(pTransform));
  return mainInput.isBounded() == PCollection.IsBounded.BOUNDED;
}
 
Example 2
Source File: BeamAggregationRel.java    From beam with Apache License 2.0
/**
 * Performs the same check as {@link GroupByKey}, but provides more context in the exception.
 *
 * <p>Verifies that the input PCollection is bounded, or that there is windowing/triggering
 * being used. Without this, the watermark (at end of global window) will never be reached.
 *
 * <p>Throws {@link UnsupportedOperationException} if validation fails.
 */
private void validateWindowIsSupported(PCollection<Row> upstream) {
  WindowingStrategy<?, ?> windowingStrategy = upstream.getWindowingStrategy();
  if (windowingStrategy.getWindowFn() instanceof GlobalWindows
      && windowingStrategy.getTrigger() instanceof DefaultTrigger
      && upstream.isBounded() != BOUNDED) {

    throw new UnsupportedOperationException(
        "Please explicitly specify windowing in SQL query using HOP/TUMBLE/SESSION functions "
            + "(default trigger will be used in this case). "
            + "Unbounded input with global windowing and default trigger is not supported "
            + "in Beam SQL aggregations. "
            + "See GroupByKey section in Beam Programming Guide");
  }
}
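The exception message above already names the fix: declare the windowing in the SQL itself. A hedged sketch of such a query via SqlTransform; the field names (user_id, event_ts) and the one-minute TUMBLE interval are assumptions about the input schema, not part of the Beam source.

import org.apache.beam.sdk.extensions.sql.SqlTransform;
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.values.Row;

public class WindowedSqlAggregation {
  // Grouping by TUMBLE(...) gives the aggregation a non-global window, so the
  // validateWindowIsSupported() check above passes even for UNBOUNDED input.
  static PCollection<Row> countPerUserPerMinute(PCollection<Row> events) {
    return events.apply(
        SqlTransform.query(
            "SELECT user_id, COUNT(*) AS event_count "
                + "FROM PCOLLECTION "
                + "GROUP BY user_id, TUMBLE(event_ts, INTERVAL '1' MINUTE)"));
  }
}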
 
Example 3
Source File: GroupByKey.java    From beam with Apache License 2.0
public static void applicableTo(PCollection<?> input) {
  WindowingStrategy<?, ?> windowingStrategy = input.getWindowingStrategy();
  // Verify that the input PCollection is bounded, or that there is windowing/triggering being
  // used. Without this, the watermark (at end of global window) will never be reached.
  if (windowingStrategy.getWindowFn() instanceof GlobalWindows
      && windowingStrategy.getTrigger() instanceof DefaultTrigger
      && input.isBounded() != IsBounded.BOUNDED) {
    throw new IllegalStateException(
        "GroupByKey cannot be applied to non-bounded PCollection in the GlobalWindow without a"
            + " trigger. Use a Window.into or Window.triggering transform prior to GroupByKey.");
  }

  // Validate the window merge function.
  if (windowingStrategy.getWindowFn() instanceof InvalidWindows) {
    String cause = ((InvalidWindows<?>) windowingStrategy.getWindowFn()).getCause();
    throw new IllegalStateException(
        "GroupByKey must have a valid Window merge function.  " + "Invalid because: " + cause);
  }

  // Validate that the trigger does not finish before garbage collection time
  if (!triggerIsSafe(windowingStrategy)) {
    throw new IllegalArgumentException(
        String.format(
            "Unsafe trigger '%s' may lose data, did you mean to wrap it in"
                + "`Repeatedly.forever(...)`?%nSee "
                + "https://s.apache.org/finishing-triggers-drop-data "
                + "for details.",
            windowingStrategy.getTrigger()));
  }
}
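The IllegalStateException above tells the caller exactly how to make applicableTo() pass for unbounded input: window (or trigger) before grouping. A minimal sketch assuming a String-keyed, Long-valued unbounded input; the one-minute fixed window is an arbitrary choice.

import org.apache.beam.sdk.transforms.GroupByKey;
import org.apache.beam.sdk.transforms.windowing.FixedWindows;
import org.apache.beam.sdk.transforms.windowing.Window;
import org.apache.beam.sdk.values.KV;
import org.apache.beam.sdk.values.PCollection;
import org.joda.time.Duration;

public class WindowedGroupByKey {
  // A non-global window satisfies the first check in applicableTo(), so GroupByKey
  // can be applied even when the input is UNBOUNDED.
  static PCollection<KV<String, Iterable<Long>>> groupPerMinute(
      PCollection<KV<String, Long>> unboundedInput) {
    return unboundedInput
        .apply("WindowIntoFixed",
            Window.<KV<String, Long>>into(FixedWindows.of(Duration.standardMinutes(1))))
        .apply(GroupByKey.<String, Long>create());
  }
}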
 
Example 4
Source File: BigQueryIO.java    From beam with Apache License 2.0
private Method resolveMethod(PCollection<T> input) {
  if (getMethod() != Method.DEFAULT) {
    return getMethod();
  }
  // By default, when writing an Unbounded PCollection, we use StreamingInserts and
  // BigQuery's streaming import API.
  return (input.isBounded() == IsBounded.UNBOUNDED)
      ? Method.STREAMING_INSERTS
      : Method.FILE_LOADS;
}
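resolveMethod() only runs when the caller left the write method at Method.DEFAULT; the choice can also be made explicitly. A hedged sketch of forcing FILE_LOADS for an unbounded input; the table name, triggering frequency, and shard count are placeholder assumptions, and the destination table is assumed to already exist.

import com.google.api.services.bigquery.model.TableRow;
import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO;
import org.apache.beam.sdk.io.gcp.bigquery.WriteResult;
import org.apache.beam.sdk.values.PCollection;
import org.joda.time.Duration;

public class ExplicitWriteMethod {
  // withMethod(...) bypasses the UNBOUNDED -> STREAMING_INSERTS default above;
  // FILE_LOADS on an unbounded input additionally requires a triggering frequency.
  static WriteResult writeWithFileLoads(PCollection<TableRow> rows) {
    return rows.apply(
        BigQueryIO.writeTableRows()
            .to("my-project:my_dataset.my_table")
            .withMethod(BigQueryIO.Write.Method.FILE_LOADS)
            .withTriggeringFrequency(Duration.standardMinutes(5))
            .withNumFileShards(10));
  }
}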
 
Example 5
Source File: PubsubIO.java    From beam with Apache License 2.0
@Override
public PDone expand(PCollection<T> input) {
  if (getTopicProvider() == null) {
    throw new IllegalStateException("need to set the topic of a PubsubIO.Write transform");
  }

  switch (input.isBounded()) {
    case BOUNDED:
      input.apply(
          ParDo.of(
              new PubsubBoundedWriter(
                  MoreObjects.firstNonNull(getMaxBatchSize(), MAX_PUBLISH_BATCH_SIZE),
                  MoreObjects.firstNonNull(
                      getMaxBatchBytesSize(), MAX_PUBLISH_BATCH_BYTE_SIZE_DEFAULT))));
      return PDone.in(input.getPipeline());
    case UNBOUNDED:
      return input
          .apply(MapElements.into(new TypeDescriptor<PubsubMessage>() {}).via(getFormatFn()))
          .apply(
              new PubsubUnboundedSink(
                  getPubsubClientFactory(),
                  NestedValueProvider.of(getTopicProvider(), new TopicPathTranslator()),
                  getTimestampAttribute(),
                  getIdAttribute(),
                  100 /* numShards */,
                  MoreObjects.firstNonNull(
                      getMaxBatchSize(), PubsubUnboundedSink.DEFAULT_PUBLISH_BATCH_SIZE),
                  MoreObjects.firstNonNull(
                      getMaxBatchBytesSize(),
                      PubsubUnboundedSink.DEFAULT_PUBLISH_BATCH_BYTES)));
  }
  throw new RuntimeException(); // cases are exhaustive.
}
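A hedged usage sketch of the BOUNDED branch above: Create.of(...) yields a bounded PCollection, so expand() routes the write through the PubsubBoundedWriter. The project and topic names are placeholders.

import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.io.gcp.pubsub.PubsubIO;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.transforms.Create;

public class BoundedPubsubWrite {
  public static void main(String[] args) {
    Pipeline pipeline = Pipeline.create(PipelineOptionsFactory.fromArgs(args).create());

    // Create.of(...) produces a BOUNDED PCollection, so PubsubIO.Write.expand()
    // takes the "case BOUNDED" branch rather than building a PubsubUnboundedSink.
    pipeline
        .apply(Create.of("hello", "world"))
        .apply(PubsubIO.writeStrings().to("projects/my-project/topics/my-topic"));

    pipeline.run().waitUntilFinish();
  }
}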
 
Example 6
Source File: SimpleRecordFormatAvroIO.java    From components with Apache License 2.0
@Override
public PDone write(PCollection<IndexedRecord> in) {
    LazyAvroKeyWrapper lakw = LazyAvroKeyWrapper.of();
    AvroHdfsFileSink sink = new AvroHdfsFileSink(doAs, path, overwrite, mergeOutput);
    sink.getExtraHadoopConfiguration().addFrom(getExtraHadoopConfiguration());

    PCollection<KV<AvroKey<IndexedRecord>, NullWritable>> pc1 = in.apply(ParDo.of(new FormatAvro()));
    pc1 = pc1.setCoder(KvCoder.of(lakw, WritableCoder.of(NullWritable.class)));

    if (in.isBounded() == PCollection.IsBounded.BOUNDED) {
        return pc1.apply(Write.to(sink));
    } else {
        return pc1.apply(UnboundedWrite.of(sink));
    }
}
 
Example 7
Source File: SimpleRecordFormatParquetIO.java    From components with Apache License 2.0
@Override
public PDone write(PCollection<IndexedRecord> in) {
    ParquetHdfsFileSink sink = new ParquetHdfsFileSink(doAs, path, overwrite, mergeOutput);
    sink.getExtraHadoopConfiguration().addFrom(getExtraHadoopConfiguration());

    PCollection<KV<Void, IndexedRecord>> pc1 = in.apply(ParDo.of(new FormatParquet()));
    pc1 = pc1.setCoder(KvCoder.of(VoidCoder.of(), LazyAvroCoder.of()));
    if (in.isBounded() == PCollection.IsBounded.BOUNDED) {
        return pc1.apply(Write.to(sink));
    } else {
        return pc1.apply(UnboundedWrite.of(sink));
    }
}
 
Example 8
Source File: BeamSideInputJoinRel.java    From beam with Apache License 2.0
public PCollection<Row> sideInputJoin(
    PCollection<Row> leftRows,
    PCollection<Row> rightRows,
    FieldAccessDescriptor leftKeyFields,
    FieldAccessDescriptor rightKeyFields) {
  // we always make the Unbounded table on the left to do the sideInput join
  // (will convert the result accordingly before return)
  boolean swapped = (leftRows.isBounded() == PCollection.IsBounded.BOUNDED);
  JoinRelType realJoinType = joinType;
  if (swapped && joinType != JoinRelType.INNER) {
    Preconditions.checkArgument(realJoinType != JoinRelType.LEFT);
    realJoinType = JoinRelType.LEFT;
  }

  PCollection<Row> realLeftRows = swapped ? rightRows : leftRows;
  PCollection<Row> realRightRows = swapped ? leftRows : rightRows;
  FieldAccessDescriptor realLeftKeyFields = swapped ? rightKeyFields : leftKeyFields;
  FieldAccessDescriptor realRightKeyFields = swapped ? leftKeyFields : rightKeyFields;

  PCollection<Row> joined;
  switch (realJoinType) {
    case INNER:
      joined =
          realLeftRows.apply(
              org.apache.beam.sdk.schemas.transforms.Join.<Row, Row>innerBroadcastJoin(
                      realRightRows)
                  .on(FieldsEqual.left(realLeftKeyFields).right(realRightKeyFields)));
      break;
    case LEFT:
      joined =
          realLeftRows.apply(
              org.apache.beam.sdk.schemas.transforms.Join.<Row, Row>leftOuterBroadcastJoin(
                      realRightRows)
                  .on(FieldsEqual.left(realLeftKeyFields).right(realRightKeyFields)));
      break;
    default:
      throw new RuntimeException("Unexpected join type " + realJoinType);
  }
  Schema schema = CalciteUtils.toSchema(getRowType());

  String lhsSelect = org.apache.beam.sdk.schemas.transforms.Join.LHS_TAG + ".*";
  String rhsSelect = org.apache.beam.sdk.schemas.transforms.Join.RHS_TAG + ".*";
  PCollection<Row> selected =
      (!swapped)
          ? joined.apply(Select.<Row>fieldNames(lhsSelect, rhsSelect).withOutputSchema(schema))
          : joined.apply(Select.<Row>fieldNames(rhsSelect, lhsSelect).withOutputSchema(schema));
  return selected;
}
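The broadcast joins built above are also available directly as a public transform. A minimal sketch mirroring the INNER branch with the schema Join API; the "user_id" key field is an illustrative assumption about the row schema.

import org.apache.beam.sdk.schemas.FieldAccessDescriptor;
import org.apache.beam.sdk.schemas.transforms.Join;
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.values.Row;

public class BroadcastJoinSketch {
  // The smaller (typically bounded) side is passed to innerBroadcastJoin and used as
  // a side input, which is the same shape sideInputJoin() arranges above.
  static PCollection<Row> joinOnUserId(PCollection<Row> facts, PCollection<Row> smallDimension) {
    FieldAccessDescriptor key = FieldAccessDescriptor.withFieldNames("user_id");
    return facts.apply(
        Join.<Row, Row>innerBroadcastJoin(smallDimension)
            .on(Join.FieldsEqual.left(key).right(key)));
  }
}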
 
Example 9
Source File: WriteFiles.java    From beam with Apache License 2.0
@Override
public WriteFilesResult<DestinationT> expand(PCollection<UserT> input) {
  if (input.isBounded() == IsBounded.UNBOUNDED) {
    checkArgument(
        getWindowedWrites(),
        "Must use windowed writes when applying %s to an unbounded PCollection",
        WriteFiles.class.getSimpleName());
    // The reason for this is https://issues.apache.org/jira/browse/BEAM-1438
    // and similar behavior in other runners.
    checkArgument(
        getComputeNumShards() != null || getNumShardsProvider() != null,
        "When applying %s to an unbounded PCollection, "
            + "must specify number of output shards explicitly",
        WriteFiles.class.getSimpleName());
  }
  this.writeOperation = getSink().createWriteOperation();
  this.writeOperation.setWindowedWrites(getWindowedWrites());

  if (!getWindowedWrites()) {
    // Re-window the data into the global window and remove any existing triggers.
    input =
        input.apply(
            "RewindowIntoGlobal",
            Window.<UserT>into(new GlobalWindows())
                .triggering(DefaultTrigger.of())
                .discardingFiredPanes());
  }

  Coder<DestinationT> destinationCoder;
  try {
    destinationCoder =
        getDynamicDestinations()
            .getDestinationCoderWithDefault(input.getPipeline().getCoderRegistry());
    destinationCoder.verifyDeterministic();
  } catch (CannotProvideCoderException | NonDeterministicException e) {
    throw new RuntimeException(e);
  }
  @SuppressWarnings("unchecked")
  Coder<BoundedWindow> windowCoder =
      (Coder<BoundedWindow>) input.getWindowingStrategy().getWindowFn().windowCoder();
  FileResultCoder<DestinationT> fileResultCoder =
      FileResultCoder.of(windowCoder, destinationCoder);

  PCollectionView<Integer> numShardsView =
      (getComputeNumShards() == null) ? null : input.apply(getComputeNumShards());

  PCollection<FileResult<DestinationT>> tempFileResults =
      (getComputeNumShards() == null && getNumShardsProvider() == null)
          ? input.apply(
              "WriteUnshardedBundlesToTempFiles",
              new WriteUnshardedBundlesToTempFiles(destinationCoder, fileResultCoder))
          : input.apply(
              "WriteShardedBundlesToTempFiles",
              new WriteShardedBundlesToTempFiles(
                  destinationCoder, fileResultCoder, numShardsView));

  return tempFileResults
      .apply("GatherTempFileResults", new GatherResults<>(fileResultCoder))
      .apply(
          "FinalizeTempFileBundles",
          new FinalizeTempFileBundles(numShardsView, destinationCoder));
}
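WriteFiles is usually reached through a file-based sink such as TextIO. A hedged sketch that satisfies both checkArgument calls above for an UNBOUNDED input by enabling windowed writes and fixing the shard count; the window size, shard count, and output prefix are placeholder assumptions.

import org.apache.beam.sdk.io.TextIO;
import org.apache.beam.sdk.transforms.windowing.FixedWindows;
import org.apache.beam.sdk.transforms.windowing.Window;
import org.apache.beam.sdk.values.PCollection;
import org.joda.time.Duration;

public class UnboundedTextWrite {
  // Windowed writes plus an explicit shard count satisfy the two checkArgument
  // calls at the top of WriteFiles.expand() when the input is UNBOUNDED.
  static void writeWindowed(PCollection<String> unboundedLines) {
    unboundedLines
        .apply(Window.<String>into(FixedWindows.of(Duration.standardMinutes(5))))
        .apply(
            TextIO.write()
                .to("gs://my-bucket/output/part")
                .withWindowedWrites()
                .withNumShards(3));
  }
}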