Java Code Examples for org.apache.beam.sdk.values.PCollectionList#get()

The following examples show how to use org.apache.beam.sdk.values.PCollectionList#get(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: BeamUncollectRel.java    From beam with Apache License 2.0 6 votes vote down vote up
/**
 * Expands this relation over its single input collection.
 *
 * <p>Each input row carries a single array of things to be emitted; the rows are passed through
 * an {@code UncollectDoFn} and the result is tagged with the schema derived from this node's
 * Calcite row type.
 *
 * @param pinput the list of input collections; must contain exactly one element
 * @return the collection produced by the uncollect step, with its row schema set
 */
@Override
public PCollection<Row> expand(PCollectionList<Row> pinput) {
  checkArgument(
      pinput.size() == 1,
      "Wrong number of inputs for %s: %s",
      BeamUncollectRel.class.getSimpleName(),
      pinput);
  final PCollection<Row> source = pinput.get(0);

  // Calcite already knows what the emitted rows look like, so the output schema comes straight
  // from this node's row type.
  final Schema emittedSchema = CalciteUtils.toSchema(getRowType());

  return source.apply(ParDo.of(new UncollectDoFn(emittedSchema))).setRowSchema(emittedSchema);
}
 
Example 2
Source File: AbstractJoinTranslator.java    From beam with Apache License 2.0 5 votes vote down vote up
/**
 * Translates a join operator applied to exactly two input collections.
 *
 * <p>Keys are extracted from both sides, the operator's window (when one is present) is applied
 * to both keyed collections, and the actual join is delegated to the five-argument {@code
 * translate} overload. The operator's output type descriptor is set on the result.
 *
 * @param operator the join operator being translated
 * @param inputs the operator inputs; index 0 is treated as the left side, index 1 as the right
 * @return the translated collection with its type descriptor set
 * @throws IllegalStateException if the operator does not provide an output type
 */
@Override
public PCollection<KV<KeyT, OutputT>> translate(
    Join<LeftT, RightT, KeyT, OutputT> operator, PCollectionList<Object> inputs) {
  checkArgument(inputs.size() == 2, "Join expects exactly two inputs.");

  // The list is typed as Object, so each side has to be cast to its element type.
  @SuppressWarnings("unchecked")
  final PCollection<LeftT> leftInput = (PCollection) inputs.get(0);
  @SuppressWarnings("unchecked")
  final PCollection<RightT> rightInput = (PCollection) inputs.get(1);

  // Pair every element of each side with its join key.
  PCollection<KV<KeyT, LeftT>> keyedLeft =
      leftInput.apply(
          "extract-keys-left",
          new ExtractKey<>(
              operator.getLeftKeyExtractor(), TypeAwareness.orObjects(operator.getKeyType())));
  PCollection<KV<KeyT, RightT>> keyedRight =
      rightInput.apply(
          "extract-keys-right",
          new ExtractKey<>(
              operator.getRightKeyExtractor(), TypeAwareness.orObjects(operator.getKeyType())));

  // The same window, when specified, is applied to both keyed sides.
  if (operator.getWindow().isPresent()) {
    @SuppressWarnings("unchecked")
    final Window<KV<KeyT, LeftT>> windowForLeft = (Window) operator.getWindow().get();
    keyedLeft = keyedLeft.apply("window-left", windowForLeft);
    @SuppressWarnings("unchecked")
    final Window<KV<KeyT, RightT>> windowForRight = (Window) operator.getWindow().get();
    keyedRight = keyedRight.apply("window-right", windowForRight);
  }

  final PCollection<KV<KeyT, OutputT>> joined =
      translate(operator, leftInput, keyedLeft, rightInput, keyedRight);
  return joined.setTypeDescriptor(
      operator
          .getOutputType()
          .orElseThrow(
              () -> new IllegalStateException("Unable to infer output type descriptor.")));
}
 
Example 3
Source File: BeamIOSinkRel.java    From beam with Apache License 2.0 5 votes vote down vote up
/**
 * Expands this sink relation over its single input collection.
 *
 * <p>The input rows are handed to {@code sqlTable.buildIOWriter} and the same input collection is
 * returned unchanged.
 *
 * @param pinput the list of input collections; must contain exactly one element
 * @return the input collection that was passed to the table's writer
 */
@Override
public PCollection<Row> expand(PCollectionList<Row> pinput) {
  checkArgument(
      pinput.size() == 1,
      "Wrong number of inputs for %s: %s",
      BeamIOSinkRel.class.getSimpleName(),
      pinput);

  final PCollection<Row> rowsToWrite = pinput.get(0);
  sqlTable.buildIOWriter(rowsToWrite);
  return rowsToWrite;
}
 
Example 4
Source File: BeamSetOperatorRelBase.java    From beam with Apache License 2.0 5 votes vote down vote up
/**
 * Expands a set operator over exactly two input collections.
 *
 * <p>Both inputs must use compatible window functions. The two sides are co-grouped on all
 * fields and the grouped result is filtered by a {@code SetOperatorFilteringDoFn}; the output
 * schema is the row schema of the co-group result's {@code "key"} field.
 *
 * @param inputs the list of input collections; index 0 is the left side, index 1 the right side
 * @return the filtered rows with their row schema set
 * @throws IllegalArgumentException if the window functions of the two inputs are not compatible
 */
@Override
public PCollection<Row> expand(PCollectionList<Row> inputs) {
  checkArgument(
      inputs.size() == 2,
      "Wrong number of arguments to %s: %s",
      beamRelNode.getClass().getSimpleName(),
      inputs);
  final PCollection<Row> lhsRows = inputs.get(0);
  final PCollection<Row> rhsRows = inputs.get(1);

  // Refuse to combine inputs whose window functions are not compatible with each other.
  final WindowFn lhsWindowFn = lhsRows.getWindowingStrategy().getWindowFn();
  final WindowFn rhsWindowFn = rhsRows.getWindowingStrategy().getWindowFn();
  if (!lhsWindowFn.isCompatible(rhsWindowFn)) {
    final String message =
        "inputs of "
            + opType
            + " have different window strategy: "
            + lhsWindowFn
            + " VS "
            + rhsWindowFn;
    throw new IllegalArgumentException(message);
  }

  // TODO: We may want to preaggregate the counts first using Group instead of calling CoGroup
  // and measuring the iterable size. If on average there are duplicates in the input, this will
  // be faster.
  final String lhsTag = "lhs";
  final String rhsTag = "rhs";
  final PCollectionTuple taggedInputs = PCollectionTuple.of(lhsTag, lhsRows, rhsTag, rhsRows);
  final PCollection<Row> coGrouped =
      taggedInputs.apply("CoGroup", CoGroup.join(By.fieldNames("*")));

  return coGrouped
      .apply(
          "FilterResults",
          ParDo.of(
              new BeamSetOperatorsTransforms.SetOperatorFilteringDoFn(
                  lhsTag, rhsTag, opType, all)))
      .setRowSchema(coGrouped.getSchema().getField("key").getType().getRowSchema());
}
 
Example 5
Source File: BeamUnnestRel.java    From beam with Apache License 2.0 5 votes vote down vote up
/**
 * Expands this unnest relation by running an {@code UnnestFn} over the first input collection.
 *
 * @param pinput the list of input collections; only the element at index 0 is read
 * @return the unnested rows, tagged with the schema derived from {@code rowType}
 */
@Override
public PCollection<Row> expand(PCollectionList<Row> pinput) {
  // The correlated unnest is run once for each row of this collection.
  final PCollection<Row> outerRows = pinput.get(0);

  final Schema resultSchema = CalciteUtils.toSchema(rowType);
  final UnnestFn unnestFn = new UnnestFn(resultSchema, unnestIndex);

  return outerRows.apply(ParDo.of(unnestFn)).setRowSchema(resultSchema);
}
 
Example 6
Source File: BeamSideInputLookupJoinRel.java    From beam with Apache License 2.0 5 votes vote down vote up
/**
 * Expands this join by looking up each row of the non-seekable input in the seekable table.
 *
 * <p>The seekable side is not consumed as a collection: its relation is cast to {@code
 * BeamIOSourceRel} and its table to {@code BeamSqlSeekableTable}, and lookups are performed
 * through a {@code JoinAsLookup} transform applied to the non-seekable input.
 *
 * @param pinput the list of input collections; only the element at index 0 is read
 * @return the joined rows, tagged with the schema derived from this node's row type
 */
@Override
public PCollection<Row> expand(PCollectionList<Row> pinput) {
  final Schema outputSchema = CalciteUtils.toSchema(getRowType());

  final BeamRelNode seekableRel =
      BeamSqlRelUtils.getBeamRelInput(getInput(seekableInputIndex().get()));
  final BeamRelNode nonSeekableRel =
      BeamSqlRelUtils.getBeamRelInput(getInput(nonSeekableInputIndex().get()));

  // Field references are offset by the width of whichever table sits on the left.
  final int factColOffset;
  if (nonSeekableInputIndex().get() == 0) {
    factColOffset = 0;
  } else {
    factColOffset = CalciteUtils.toSchema(seekableRel.getRowType()).getFieldCount();
  }
  final int lkpColOffset;
  if (seekableInputIndex().get() == 0) {
    lkpColOffset = 0;
  } else {
    lkpColOffset = CalciteUtils.toSchema(nonSeekableRel.getRowType()).getFieldCount();
  }

  // HACK: if the input is an immediate instance of a seekable IO, we can do lookups
  // so we ignore the PCollection
  final BeamIOSourceRel seekableSource = (BeamIOSourceRel) seekableRel;
  final BeamSqlSeekableTable lookupTable =
      (BeamSqlSeekableTable) seekableSource.getBeamSqlTable();

  // getPCollectionInputs() ensures that there is only one and it is the non-seekable input
  final PCollection<Row> factRows = pinput.get(0);

  final BeamJoinTransforms.JoinAsLookup lookupJoin =
      new BeamJoinTransforms.JoinAsLookup(
          condition,
          lookupTable,
          CalciteUtils.toSchema(seekableSource.getRowType()),
          outputSchema,
          factColOffset,
          lkpColOffset);

  return factRows.apply("join_as_lookup", lookupJoin).setRowSchema(outputSchema);
}
 
Example 7
Source File: BeamZetaSqlCalcRel.java    From beam with Apache License 2.0 5 votes vote down vote up
/**
 * Expands this calc relation over its single input collection.
 *
 * <p>The program's projection list is wrapped in a {@code ROW} call; when the program has a
 * condition, that call is wrapped in a {@code CASE} that yields a null literal of the row type
 * for rows failing the condition. The resulting expression is rendered to SQL and evaluated per
 * row by a {@code CalcFn}.
 *
 * @param pinput the list of input collections; must contain exactly one element
 * @return the computed rows, tagged with the schema derived from this node's row type
 */
@Override
public PCollection<Row> expand(PCollectionList<Row> pinput) {
  // Preconditions message templates only substitute "%s"; a "%d" placeholder would be left
  // verbatim in the message, so the count is formatted with "%s" as well.
  Preconditions.checkArgument(
      pinput.size() == 1,
      "%s expected a single input PCollection, but received %s.",
      BeamZetaSqlCalcRel.class.getSimpleName(),
      pinput.size());
  PCollection<Row> upstream = pinput.get(0);

  // Project all output expressions as a single ROW value.
  final RexBuilder rexBuilder = getCluster().getRexBuilder();
  RexNode rex = rexBuilder.makeCall(SqlStdOperatorTable.ROW, getProgram().getProjectList());

  // Fold the filter into the expression: CASE WHEN condition THEN row ELSE NULL.
  final RexNode condition = getProgram().getCondition();
  if (condition != null) {
    rex =
        rexBuilder.makeCall(
            SqlStdOperatorTable.CASE, condition, rex, rexBuilder.makeNullLiteral(getRowType()));
  }

  boolean verifyRowValues =
      pinput.getPipeline().getOptions().as(BeamSqlPipelineOptions.class).getVerifyRowValues();
  Schema outputSchema = CalciteUtils.toSchema(getRowType());
  CalcFn calcFn =
      new CalcFn(
          context.toSql(getProgram(), rex).toSqlString(DIALECT).getSql(),
          upstream.getSchema(),
          outputSchema,
          verifyRowValues);

  // validate prepared expressions
  calcFn.setup();

  return upstream.apply(ParDo.of(calcFn)).setRowSchema(outputSchema);
}
 
Example 8
Source File: BeamAggregationRel.java    From beam with Apache License 2.0 4 votes vote down vote up
/**
 * Expands this aggregation relation over its single input collection.
 *
 * <p>The input is windowed first when {@code windowFn} is set, then grouped by the key fields.
 * Every configured field aggregation is chained onto one combiner; when there are none, a
 * constant combiner is used instead. The combined rows are finally passed through the {@code
 * mergeRecord} step.
 *
 * @param pinput the list of input collections; must contain exactly one element
 * @return the aggregated rows, tagged with {@code outputSchema}
 */
@Override
public PCollection<Row> expand(PCollectionList<Row> pinput) {
  checkArgument(
      pinput.size() == 1,
      "Wrong number of inputs for %s: %s",
      BeamAggregationRel.class.getSimpleName(),
      pinput);
  final PCollection<Row> source = pinput.get(0);

  PCollection<Row> windowed = source;
  if (windowFn != null) {
    windowed = assignTimestampsAndWindow(source);
  }

  validateWindowIsSupported(windowed);

  final org.apache.beam.sdk.schemas.transforms.Group.ByFields<Row> grouping =
      org.apache.beam.sdk.schemas.transforms.Group.byFieldIds(keyFieldsIds);
  org.apache.beam.sdk.schemas.transforms.Group.CombineFieldsByFields<Row> chained = null;
  for (FieldAggregation aggregation : fieldAggregations) {
    final List<Integer> fieldIds = aggregation.inputs;
    final CombineFn fn = aggregation.combineFn;
    final boolean singleField = fieldIds.size() <= 1 && !fieldIds.isEmpty();
    if (singleField) {
      // Combining over a single field, so extract just that field.
      if (chained == null) {
        chained = grouping.aggregateFieldBaseValue(fieldIds.get(0), fn, aggregation.outputField);
      } else {
        chained = chained.aggregateFieldBaseValue(fieldIds.get(0), fn, aggregation.outputField);
      }
    } else {
      // Several fields, or none at all: a Row is extracted (an empty one when there are no
      // input fields).
      if (chained == null) {
        chained = grouping.aggregateFieldsById(fieldIds, fn, aggregation.outputField);
      } else {
        chained = chained.aggregateFieldsById(fieldIds, fn, aggregation.outputField);
      }
    }
  }

  PTransform<PCollection<Row>, PCollection<Row>> combiner = chained;
  boolean ignoreValues = false;
  if (combiner == null) {
    // With no field aggregations, fall back to a constant combiner that always returns a single
    // empty row per key. The SELECT DISTINCT query plan relies on this: a group by is generated
    // to determine the unique keys and a constant null combiner is used.
    final Field placeholderField =
        Field.of("e", FieldType.row(AggregationCombineFnAdapter.EMPTY_SCHEMA).withNullable(true));
    combiner =
        grouping.aggregateField(
            "*", AggregationCombineFnAdapter.createConstantCombineFn(), placeholderField);
    ignoreValues = true;
  }

  final boolean verifyRowValues =
      pinput.getPipeline().getOptions().as(BeamSqlPipelineOptions.class).getVerifyRowValues();
  return windowed
      .apply(combiner)
      .apply(
          "mergeRecord",
          ParDo.of(mergeRecord(outputSchema, windowFieldIndex, ignoreValues, verifyRowValues)))
      .setRowSchema(outputSchema);
}
 
Example 9
Source File: BeamSortRel.java    From beam with Apache License 2.0 4 votes vote down vote up
/**
 * Expands this sort/limit relation over its single input collection.
 *
 * <p>When no sort fields are configured, the input is re-windowed into the global window and a
 * {@code LimitTransform} is applied. Otherwise the input must already be in the global window;
 * the top {@code startIndex + count} rows are selected with a reversed row comparator, the
 * leading offset is stripped when {@code startIndex > 0}, and the resulting list is flattened.
 *
 * @param pinput the list of input collections; must contain exactly one element
 * @return the limited (and, if requested, ordered) rows, tagged with the schema derived from
 *     this node's row type
 * @throws UnsupportedOperationException if sort fields are configured and the input's window
 *     function is not {@code GlobalWindows}
 */
@Override
public PCollection<Row> expand(PCollectionList<Row> pinput) {
  // Report this relation's own name in the failure message (previously it named BeamIOSinkRel).
  checkArgument(
      pinput.size() == 1,
      "Wrong number of inputs for %s: %s",
      BeamSortRel.class.getSimpleName(),
      pinput);
  PCollection<Row> upstream = pinput.get(0);

  // There is a need to separate ORDER BY LIMIT and LIMIT:
  //  - GroupByKey (used in Top) is not allowed on unbounded data in global window so ORDER BY
  // ... LIMIT
  //    works only on bounded data.
  //  - Just LIMIT operates on unbounded data, but across windows.
  if (fieldIndices.isEmpty()) {
    // TODO(https://issues.apache.org/jira/projects/BEAM/issues/BEAM-4702)
    // Figure out which operations are per-window and which are not.

    return upstream
        .apply(Window.into(new GlobalWindows()))
        .apply(new LimitTransform<>(startIndex))
        .setRowSchema(CalciteUtils.toSchema(getRowType()));
  } else {

    WindowingStrategy<?, ?> windowingStrategy = upstream.getWindowingStrategy();
    if (!(windowingStrategy.getWindowFn() instanceof GlobalWindows)) {
      throw new UnsupportedOperationException(
          String.format(
              "`ORDER BY` is only supported for %s, actual windowing strategy: %s",
              GlobalWindows.class.getSimpleName(), windowingStrategy));
    }

    ReversedBeamSqlRowComparator comparator =
        new ReversedBeamSqlRowComparator(fieldIndices, orientation, nullsFirst);

    // first find the top (offset + count)
    PCollection<List<Row>> rawStream =
        upstream
            .apply(
                "extractTopOffsetAndFetch",
                Top.of(startIndex + count, comparator).withoutDefaults())
            .setCoder(ListCoder.of(upstream.getCoder()));

    // strip the `leading offset`
    if (startIndex > 0) {
      rawStream =
          rawStream
              .apply(
                  "stripLeadingOffset",
                  ParDo.of(new SubListFn<>(startIndex, startIndex + count)))
              .setCoder(ListCoder.of(upstream.getCoder()));
    }

    return rawStream
        .apply("flatten", Flatten.iterables())
        .setRowSchema(CalciteUtils.toSchema(getRowType()));
  }
}