Java Code Examples for org.apache.beam.sdk.values.PCollection#getSchema()

The following examples show how to use org.apache.beam.sdk.values.PCollection#getSchema(). Each example notes its source file and originating project above the code.
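
Before the examples, here is a minimal, self-contained sketch of the basic pattern: attach a schema to a PCollection and read it back with getSchema(). The class and field names are illustrative, not taken from any of the projects below.

import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.schemas.Schema;
import org.apache.beam.sdk.transforms.Create;
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.values.Row;

public class GetSchemaSketch {
  public static void main(String[] args) {
    Pipeline pipeline = Pipeline.create();

    // Illustrative two-field schema.
    Schema schema = Schema.builder().addStringField("name").addInt32Field("age").build();

    PCollection<Row> people =
        pipeline.apply(
            Create.of(Row.withSchema(schema).addValues("alice", 30).build())
                .withRowSchema(schema));

    // getSchema() returns the schema attached above. It throws
    // IllegalStateException when no schema is present, so guard with
    // hasSchema() if the input may not be schema-aware.
    if (people.hasSchema()) {
      System.out.println(people.getSchema());
    }

    pipeline.run().waitUntilFinish();
  }
}
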
Example 1
Source File: BeamSqlDslAggregationNullableTest.java    From beam with Apache License 2.0
@Test
public void testAvgGroupByNullable() {
  String sql = "SELECT AVG(f_int1), f_int2 FROM PCOLLECTION GROUP BY f_int2";

  PCollection<Row> out = boundedInput.apply(SqlTransform.query(sql));
  Schema schema = out.getSchema();

  PAssert.that(out)
      .containsInAnyOrder(
          Row.withSchema(schema).addValues(null, null).build(),
          Row.withSchema(schema).addValues(2, 1).build(),
          Row.withSchema(schema).addValues(1, 5).build(),
          Row.withSchema(schema).addValues(3, 2).build());

  pipeline.run();
}
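
The `boundedInput` fixture is defined elsewhere in the test class and is not part of this excerpt. A plausible reconstruction, with rows chosen only to be consistent with the asserted output (the real fixture may differ), looks like:

// Assumed fixture: two nullable INT32 columns matching the query's
// column names; `pipeline` is the test's TestPipeline.
Schema inputSchema =
    Schema.builder()
        .addNullableField("f_int1", Schema.FieldType.INT32)
        .addNullableField("f_int2", Schema.FieldType.INT32)
        .build();

PCollection<Row> boundedInput =
    pipeline.apply(
        Create.of(
                Row.withSchema(inputSchema).addValues(1, 5).build(),
                Row.withSchema(inputSchema).addValues(2, 1).build(),
                Row.withSchema(inputSchema).addValues(3, 2).build(),
                Row.withSchema(inputSchema).addValues(null, null).build())
            .withRowSchema(inputSchema));
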
 
Example 2
Source File: BeamSqlDslAggregationNullableTest.java    From beam with Apache License 2.0
@Test
public void testCountGroupByNullable() {
  String sql = "SELECT COUNT(f_int1) as c, f_int2 FROM PCOLLECTION GROUP BY f_int2";

  PCollection<Row> out = boundedInput.apply(SqlTransform.query(sql));
  Schema schema = out.getSchema();

  PAssert.that(out)
      .containsInAnyOrder(
          Row.withSchema(schema).addValues(0L, null).build(),
          Row.withSchema(schema).addValues(1L, 1).build(),
          Row.withSchema(schema).addValues(1L, 5).build(),
          Row.withSchema(schema).addValues(1L, 2).build());

  assertEquals(
      Schema.builder()
          // COUNT() is never nullable, and Calcite knows it
          .addInt64Field("c")
          .addNullableField("f_int2", Schema.FieldType.INT32)
          .build(),
      schema);

  pipeline.run();
}
 
Example 3
Source File: ToJson.java    From beam with Apache License 2.0
@Override
public PCollection<String> expand(PCollection<T> rows) {
  Schema inputSchema = rows.getSchema();
  // Throw exception if this schema is not supported by RowJson
  RowJson.verifySchemaSupported(inputSchema);
  SerializableFunction<T, Row> toRow = rows.getToRowFunction();
  return rows.apply(
      ParDo.of(
          new DoFn<T, String>() {
            @ProcessElement
            public void processElement(ProcessContext context) {
              context.output(
                  rowToJson(objectMapper(inputSchema), toRow.apply(context.element())));
            }
          }));
}
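
The expand() above is the body of Beam's ToJson transform. From user code it is applied in one line to any schema-aware PCollection; the input collection here is assumed for illustration.

// `people` is assumed to be a schema-aware PCollection (e.g. of Rows or of a
// @DefaultSchema-annotated POJO); each element is serialized via its schema.
PCollection<String> json = people.apply(ToJson.of());
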
 
Example 4
Source File: Select.java    From beam with Apache License 2.0
@Override
public PCollection<Row> expand(PCollection<T> input) {
  Schema inputSchema = input.getSchema();
  FieldAccessDescriptor resolved = getFieldAccessDescriptor().resolve(inputSchema);
  Schema outputSchema = getOutputSchema();
  if (outputSchema == null) {
    outputSchema = SelectHelpers.getOutputSchema(inputSchema, resolved);
  } else {
    inputSchema = uniquifyNames(inputSchema);
    Schema inferredSchema = SelectHelpers.getOutputSchema(inputSchema, resolved);
    Preconditions.checkArgument(
        outputSchema.typesEqual(inferredSchema),
        "Types not equal. provided output schema: "
            + outputSchema
            + " Schema inferred from select: "
            + inferredSchema
            + " from input type: "
            + input.getSchema());
  }
  return input
      .apply(ParDo.of(new SelectDoFn<>(resolved, inputSchema, outputSchema)))
      .setRowSchema(outputSchema);
}
 
Example 5
Source File: Select.java    From beam with Apache License 2.0
@Override
public PCollection<Row> expand(PCollection<T> input) {
  Schema inputSchema = input.getSchema();

  FieldAccessDescriptor fieldAccessDescriptor =
      SelectHelpers.allLeavesDescriptor(
          inputSchema,
          n ->
              MoreObjects.firstNonNull(
                  getNameOverrides().get(String.join(".", n)), getNameFn().apply(n)));
  Schema inferredOutputSchema =
      SelectHelpers.getOutputSchema(inputSchema, fieldAccessDescriptor);
  Schema outputSchema = getOutputSchema();
  if (outputSchema != null) {
    Preconditions.checkArgument(outputSchema.typesEqual(inferredOutputSchema));
  } else {
    outputSchema = inferredOutputSchema;
  }
  return input
      .apply(ParDo.of(new SelectDoFn<>(fieldAccessDescriptor, inputSchema, outputSchema)))
      .setRowSchema(outputSchema);
}
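
Both Select variants above resolve a FieldAccessDescriptor against the input schema and tag the output with setRowSchema. From user code the transform is usually built through its static factories; the field names below are hypothetical.

// Project two fields; nested fields are addressed with dot notation.
PCollection<Row> projected =
    purchases.apply(Select.fieldNames("userId", "shippingAddress.city"));
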
 
Example 6
Source File: Group.java    From beam with Apache License 2.0
@Override
public PCollection<Row> expand(PCollection<InputT> input) {
  Schema schema = input.getSchema();
  Schema keySchema = getKeySchema(schema);
  Schema outputSchema =
      Schema.builder()
          .addRowField(getKeyField(), keySchema)
          .addIterableField(getValueField(), FieldType.row(schema))
          .build();

  return input
      .apply("ToKvs", getToKvs())
      .apply(
          "ToRow",
          ParDo.of(
              new DoFn<KV<Row, Iterable<Row>>, Row>() {
                @ProcessElement
                public void process(@Element KV<Row, Iterable<Row>> e, OutputReceiver<Row> o) {
                  o.output(
                      Row.withSchema(outputSchema)
                          .attachValues(Lists.newArrayList(e.getKey(), e.getValue())));
                }
              }))
      .setRowSchema(outputSchema);
}
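
The expand() above emits one row per key: the key as a nested row field and the grouped elements as an iterable of full input rows. A typical invocation (field name assumed) looks like:

// Group schema-aware elements by their "country" field.
PCollection<Row> grouped = users.apply(Group.byFieldNames("country"));
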
 
Example 7
Source File: RenameFields.java    From beam with Apache License 2.0
@Override
public PCollection<Row> expand(PCollection<T> input) {
  Schema inputSchema = input.getSchema();

  List<RenamePair> pairs =
      renames.stream().map(r -> r.resolve(inputSchema)).collect(Collectors.toList());
  final Schema outputSchema = renameSchema(inputSchema, pairs);
  return input
      .apply(
          ParDo.of(
              new DoFn<T, Row>() {
                @ProcessElement
                public void processElement(@Element Row row, OutputReceiver<Row> o) {
                  o.output(Row.withSchema(outputSchema).attachValues(row.getValues()));
                }
              }))
      .setRowSchema(outputSchema);
}
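
For reference, a typical call site for RenameFields (field names assumed):

// Rename a top-level field; the values are reused unchanged, only the
// output schema differs.
PCollection<Row> renamed =
    input.apply(RenameFields.<Row>create().rename("userName", "user_name"));
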
 
Example 8
Source File: JdbcIO.java    From beam with Apache License 2.0
@Override
public PDone expand(PCollection<T> input) {
  // fixme: validate invalid table input
  if (input.hasSchema() && !hasStatementAndSetter()) {
    checkArgument(
        inner.getTable() != null, "table cannot be null if statement is not provided");
    Schema schema = input.getSchema();
    List<SchemaUtil.FieldWithIndex> fields = getFilteredFields(schema);
    inner =
        inner.withStatement(
            JdbcUtil.generateStatement(
                inner.getTable(),
                fields.stream()
                    .map(SchemaUtil.FieldWithIndex::getField)
                    .collect(Collectors.toList())));
    inner =
        inner.withPreparedStatementSetter(
            new AutoGeneratedPreparedStatementSetter(fields, input.getToRowFunction()));
  }

  inner.expand(input);
  return PDone.in(input.getPipeline());
}
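
Because both the SQL statement and the PreparedStatement setter are derived from the schema, a schema-aware write only needs connection details and a table name. The configuration below is a sketch with placeholder values.

// Neither withStatement() nor withPreparedStatementSetter() is needed when
// the input PCollection has a schema; both are generated from it.
rows.apply(
    JdbcIO.<Row>write()
        .withDataSourceConfiguration(
            JdbcIO.DataSourceConfiguration.create(
                "org.postgresql.Driver", "jdbc:postgresql://localhost/mydb"))
        .withTable("my_table"));
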
 
Example 9
Source File: BeamPCollectionTable.java    From beam with Apache License 2.0
public BeamPCollectionTable(PCollection<InputT> upstream) {
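  // Note: super() must be the first statement in a Java constructor, so
  // getSchema() runs before the hasSchema() check below; when the input has
  // no schema, getSchema() itself throws an IllegalStateException first.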
  super(upstream.getSchema());
  if (!upstream.hasSchema()) {
    throw new IllegalArgumentException("SQL can only run over PCollections that have schemas.");
  }
  this.upstream = upstream;
}
 
Example 10
Source File: BeamZetaSqlCalcRel.java    From beam with Apache License 2.0
@Override
public PCollection<Row> expand(PCollectionList<Row> pinput) {
  // Guava's checkArgument substitutes only %s placeholders, so use %s here;
  // %d would be printed literally.
  Preconditions.checkArgument(
      pinput.size() == 1,
      "%s expected a single input PCollection, but received %s.",
      BeamZetaSqlCalcRel.class.getSimpleName(),
      pinput.size());
  PCollection<Row> upstream = pinput.get(0);

  final RexBuilder rexBuilder = getCluster().getRexBuilder();
  RexNode rex = rexBuilder.makeCall(SqlStdOperatorTable.ROW, getProgram().getProjectList());

  final RexNode condition = getProgram().getCondition();
  if (condition != null) {
    rex =
        rexBuilder.makeCall(
            SqlStdOperatorTable.CASE, condition, rex, rexBuilder.makeNullLiteral(getRowType()));
  }

  boolean verifyRowValues =
      pinput.getPipeline().getOptions().as(BeamSqlPipelineOptions.class).getVerifyRowValues();
  Schema outputSchema = CalciteUtils.toSchema(getRowType());
  CalcFn calcFn =
      new CalcFn(
          context.toSql(getProgram(), rex).toSqlString(DIALECT).getSql(),
          upstream.getSchema(),
          outputSchema,
          verifyRowValues);

  // validate prepared expressions
  calcFn.setup();

  return upstream.apply(ParDo.of(calcFn)).setRowSchema(outputSchema);
}
 
Example 11
Source File: Group.java    From beam with Apache License 2.0
@Override
public PCollection<KV<Row, Iterable<Row>>> expand(PCollection<InputT> input) {
  Schema schema = input.getSchema();
  FieldAccessDescriptor resolved = getFieldAccessDescriptor().resolve(schema);
  rowSelector = new RowSelectorContainer(schema, resolved, true);
  Schema keySchema = getKeySchema(schema);

  return input
      .apply("toRow", Convert.toRows())
      .apply(
          "selectKeys",
          WithKeys.of((Row e) -> rowSelector.select(e)).withKeyType(TypeDescriptors.rows()))
      .setCoder(KvCoder.of(SchemaCoder.of(keySchema), SchemaCoder.of(schema)))
      .apply("GroupByKey", GroupByKey.create());
}
 
Example 12
Source File: DropFields.java    From beam with Apache License 2.0
@Override
public PCollection<Row> expand(PCollection<T> input) {
  Schema inputSchema = input.getSchema();
  FieldAccessDescriptor selectDescriptor =
      complement(inputSchema, fieldsToDrop.resolve(inputSchema));

  return input.apply(Select.fieldAccess(selectDescriptor));
}
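
DropFields is the complement of Select: the named fields are removed and everything else is kept. A typical call (field names assumed):

PCollection<Row> sanitized = users.apply(DropFields.fields("ssn", "creditCard"));
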
 
Example 13
Source File: BigQueryChangeApplier.java    From DataflowTemplates with Apache License 2.0
@Override
public PDone expand(PCollection<Row> input) {
  Pipeline p = input.getPipeline();
  Schema inputCollectionSchema = input.getSchema();

  PCollection<KV<String, KV<Schema, Schema>>> tableSchemaCollection =
      buildTableSchemaCollection(input);
  PCollectionView<Map<String, KV<Schema, Schema>>> schemaMapView = tableSchemaCollection
      .apply(View.asMap());

  PCollection<TableRow> updatesToWrite = formatIntoTableRows(input);

  updatesToWrite.apply(
      BigQueryIO.writeTableRows()
          .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED)
          .withWriteDisposition(WriteDisposition.WRITE_APPEND)
          .withMethod(Method.STREAMING_INSERTS)
          .to(new ChangelogTableDynamicDestinations(changeLogDataset, gcpProjectId, schemaMapView)));

  String jobPrefix =
      String.format(
          "beam_cdc_%s_%s_", gcpProjectId.replace(':', '_').replace('.', '_'), replicaDataset);

  // If the input collection does not have a primary key field, then we do not need to issue
  // periodic merge requests.
  if (inputCollectionSchema.hasField(DataflowCdcRowFormat.PRIMARY_KEY)) {
    p.apply("MergeHeartbeat",
        GenerateSequence
            .from(0)
            .withRate(1, Duration.standardSeconds(updateFrequencySeconds)))
        .apply("KeyByTable", ParDo.of(new KeySchemasByTableFn(schemaMapView))
            .withSideInputs(schemaMapView))
        .apply("BuildMergeStatements",
            ParDo.of(
                new MergeStatementBuildingFn(changeLogDataset, replicaDataset, gcpProjectId)))
        .setCoder(SerializableCoder.of(
            TypeDescriptors.kvs(
                TypeDescriptors.strings(),
                TypeDescriptor.of(BigQueryAction.class))))
        .apply("IssueMergeStatements",
            ParDo.of(new BigQueryStatementIssuingFn(jobPrefix)));
  }
  return PDone.in(p);
}
 
Example 14
Source File: CoGroup.java    From beam with Apache License 2.0
private static JoinInformation from(
    PCollectionTuple input,
    Function<String, FieldAccessDescriptor> getFieldAccessDescriptor,
    Function<String, Boolean> getIsSideInput) {
  KeyedPCollectionTuple<Row> keyedPCollectionTuple =
      KeyedPCollectionTuple.empty(input.getPipeline());

  List<String> sortedTags =
      input.getAll().keySet().stream()
          .map(TupleTag::getId)
          .sorted()
          .collect(Collectors.toList());

  // Keep this in a TreeMap so that it's sorted. This way we get a deterministic output
  // schema.
  TreeMap<String, Schema> componentSchemas = Maps.newTreeMap();
  Map<Integer, SerializableFunction<Object, Row>> toRows = Maps.newHashMap();

  Map<String, PCollectionView<Map<Row, Iterable<Row>>>> sideInputs = Maps.newHashMap();
  Map<Integer, String> tagToKeyedTag = Maps.newHashMap();
  Schema keySchema = null;
  for (Map.Entry<TupleTag<?>, PCollection<?>> entry : input.getAll().entrySet()) {
    String tag = entry.getKey().getId();
    int tagIndex = sortedTags.indexOf(tag);
    PCollection<?> pc = entry.getValue();
    Schema schema = pc.getSchema();
    componentSchemas.put(tag, schema);
    toRows.put(tagIndex, (SerializableFunction<Object, Row>) pc.getToRowFunction());
    FieldAccessDescriptor fieldAccessDescriptor = getFieldAccessDescriptor.apply(tag);
    if (fieldAccessDescriptor == null) {
      throw new IllegalStateException("No fields were set for input " + tag);
    }
    // Resolve the key schema, keeping the fields in the order specified by the user.
    // Otherwise, if different field names are specified for different PCollections, they
    // might not match up.
    // The key schema contains the field names from the first PCollection specified.
    FieldAccessDescriptor resolved = fieldAccessDescriptor.resolve(schema);
    Schema currentKeySchema = SelectHelpers.getOutputSchema(schema, resolved);
    if (keySchema == null) {
      keySchema = currentKeySchema;
    } else {
      keySchema = SchemaUtils.mergeWideningNullable(keySchema, currentKeySchema);
    }

    // Create a new tag for the output.
    TupleTag randomTag = new TupleTag<>();
    String keyedTag = tag + "_" + randomTag;
    tagToKeyedTag.put(tagIndex, keyedTag);
    PCollection<KV<Row, Row>> keyedPCollection =
        extractKey(pc, schema, keySchema, resolved, tag);
    if (getIsSideInput.apply(tag)) {
      sideInputs.put(
          keyedTag, keyedPCollection.apply("computeSideInputView" + tag, View.asMultimap()));
    } else {
      keyedPCollectionTuple = keyedPCollectionTuple.and(keyedTag, keyedPCollection);
    }
  }
  return new JoinInformation(
      keyedPCollectionTuple,
      sideInputs,
      keySchema,
      componentSchemas,
      toRows,
      sortedTags,
      tagToKeyedTag);
}
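
JoinInformation is internal plumbing for CoGroup. From user code, the join keys are declared per input through CoGroup.join; the tags and field name below are hypothetical.

// Join two schema-aware PCollections on "userId"; the output rows contain
// one iterable field per input tag.
PCollection<Row> joined =
    PCollectionTuple.of("purchases", purchases)
        .and("users", users)
        .apply(CoGroup.join(CoGroup.By.fieldNames("userId")));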