Java Code Examples for org.apache.beam.sdk.schemas.Schema#hasField()

The following examples show how to use org.apache.beam.sdk.schemas.Schema#hasField(). Each example is taken from an open-source project; the line above each snippet names the source file, the project it comes from, and its license.
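Before the project examples, here is a minimal, self-contained sketch of the method's contract (the class name HasFieldDemo and the field names are ours, for illustration only): hasField(String) returns true exactly when the schema contains a field with the given name.

import org.apache.beam.sdk.schemas.Schema;

public class HasFieldDemo {
  public static void main(String[] args) {
    // Build a simple two-field schema.
    Schema schema =
        Schema.builder()
            .addStringField("name")
            .addInt64Field("id")
            .build();

    System.out.println(schema.hasField("name"));  // true
    System.out.println(schema.hasField("email")); // false
  }
}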
Example 1
Source File: BigQueryChangeApplier.java    From DataflowTemplates with Apache License 2.0
@Override
public PDone expand(PCollection<Row> input) {
  Pipeline p = input.getPipeline();
  Schema inputCollectionSchema = input.getSchema();

  PCollection<KV<String, KV<Schema, Schema>>> tableSchemaCollection =
      buildTableSchemaCollection(input);
  PCollectionView<Map<String, KV<Schema, Schema>>> schemaMapView = tableSchemaCollection
      .apply(View.asMap());

  PCollection<TableRow> updatesToWrite = formatIntoTableRows(input);

  updatesToWrite.apply(
      BigQueryIO.writeTableRows()
          .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED)
          .withWriteDisposition(WriteDisposition.WRITE_APPEND)
          .withMethod(Method.STREAMING_INSERTS)
          .to(new ChangelogTableDynamicDestinations(changeLogDataset, gcpProjectId, schemaMapView)));

  String jobPrefix =
      String.format(
          "beam_cdc_%s_%s_", gcpProjectId.replace(':', '_').replace('.', '_'), replicaDataset);

  // If the input collection does not have a primary key field, then we do not need to issue
  // periodic merge requests.
  if (inputCollectionSchema.hasField(DataflowCdcRowFormat.PRIMARY_KEY)) {
    p.apply("MergeHeartbeat",
        GenerateSequence
            .from(0)
            .withRate(1, Duration.standardSeconds(updateFrequencySeconds)))
        .apply("KeyByTable", ParDo.of(new KeySchemasByTableFn(schemaMapView))
            .withSideInputs(schemaMapView))
        .apply("BuildMergeStatements",
            ParDo.of(
                new MergeStatementBuildingFn(changeLogDataset, replicaDataset, gcpProjectId)))
        .setCoder(SerializableCoder.of(
            TypeDescriptors.kvs(
                TypeDescriptors.strings(),
                TypeDescriptor.of(BigQueryAction.class))))
        .apply("IssueMergeStatements",
            ParDo.of(new BigQueryStatementIssuingFn(jobPrefix)));
  }
  return PDone.in(p);
}
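In this example, hasField() gates an entire pipeline branch: the periodic merge stages are wired in only when the input schema carries a primary-key field. The same probe-before-read pattern applies at the field level, since Schema#getField(String) fails for unknown names. A minimal sketch (SchemaGuards and fieldTypeOrNull are hypothetical names, not part of the template):

import org.apache.beam.sdk.schemas.Schema;

class SchemaGuards {
  // Probe with hasField() before getField(), which throws for
  // field names that are not present in the schema.
  static Schema.FieldType fieldTypeOrNull(Schema schema, String name) {
    return schema.hasField(name) ? schema.getField(name).getType() : null;
  }
}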
 
Example 2
Source File: PubsubJsonTableProvider.java    From beam with Apache License 2.0
private boolean definesAttributeAndPayload(Schema schema) {
  return fieldPresent(
          schema, ATTRIBUTES_FIELD, Schema.FieldType.map(VARCHAR.withNullable(false), VARCHAR))
      && (schema.hasField(PAYLOAD_FIELD)
          && ROW.equals(schema.getField(PAYLOAD_FIELD).getType().getTypeName()));
}
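For reference, a schema shape that should satisfy this check, assuming ATTRIBUTES_FIELD and PAYLOAD_FIELD resolve to "attributes" and "payload" (a hedged sketch; the payload row's contents are illustrative):

Schema payloadSchema = Schema.builder().addStringField("id").build();
Schema messageSchema =
    Schema.builder()
        .addMapField("attributes", Schema.FieldType.STRING, Schema.FieldType.STRING)
        .addRowField("payload", payloadSchema)
        .build();
// hasField("payload") is true and the field's type name is ROW, so
// definesAttributeAndPayload(messageSchema) should return true.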
 
Example 3
Source File: PubsubJsonTableProvider.java    From beam with Apache License 2.0
private boolean fieldPresent(Schema schema, String field, Schema.FieldType expectedType) {
  return schema.hasField(field)
      && expectedType.equivalent(
          schema.getField(field).getType(), Schema.EquivalenceNullablePolicy.IGNORE);
}
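A short illustration of what the EquivalenceNullablePolicy.IGNORE argument buys (a hedged sketch; the schema is ours): a nullable field can still match a non-nullable expected type, because nullability is left out of the comparison.

Schema s = Schema.builder().addNullableField("name", Schema.FieldType.STRING).build();

// true: the field exists, and STRING vs. nullable STRING are treated
// as equivalent because nullability is ignored.
boolean present =
    s.hasField("name")
        && Schema.FieldType.STRING.equivalent(
            s.getField("name").getType(), Schema.EquivalenceNullablePolicy.IGNORE);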
 
Example 4
Source File: AddFields.java    From beam with Apache License 2.0
private static AddFieldsInformation getAddFieldsInformation(
    Schema inputSchema, Collection<NewField> fieldsToAdd) {
  List<NewField> newTopLevelFields =
      fieldsToAdd.stream()
          .filter(n -> !n.getDescriptor().getFieldsAccessed().isEmpty())
          .collect(Collectors.toList());
  List<NewField> newNestedFields =
      fieldsToAdd.stream()
          .filter(n -> !n.getDescriptor().getNestedFieldsAccessed().isEmpty())
          .collect(Collectors.toList());
  // Group all nested fields together by the field at the current level. For example, if
  // adding a.b, a.c, and a.d, this map will contain a -> {a.b, a.c, a.d}.
  Multimap<String, NewField> newNestedFieldsMap =
      Multimaps.index(newNestedFields, NewField::getName);

  Map<Integer, AddFieldsInformation> resolvedNestedNewValues = Maps.newHashMap();
  Schema.Builder builder = Schema.builder();
  for (int i = 0; i < inputSchema.getFieldCount(); ++i) {
    Schema.Field field = inputSchema.getField(i);
    Collection<NewField> nestedFields = newNestedFieldsMap.get(field.getName());

    // If this field is a nested field and new subfields are added further down the tree, add
    // those subfields before adding to the current schema. Otherwise we just add this field
    // as-is to the new schema.
    if (!nestedFields.isEmpty()) {
      nestedFields = nestedFields.stream().map(NewField::descend).collect(Collectors.toList());

      AddFieldsInformation nestedInformation =
          getAddFieldsInformation(field.getType(), nestedFields);
      field = field.withType(nestedInformation.getOutputFieldType());
      resolvedNestedNewValues.put(i, nestedInformation);
    }
    builder.addField(field);
  }

  // Add any new fields at this level.
  List<Object> newValuesThisLevel = new ArrayList<>(newTopLevelFields.size());
  for (NewField newField : newTopLevelFields) {
    builder.addField(newField.getName(), newField.getFieldType());
    newValuesThisLevel.add(newField.getDefaultValue());
  }

  // If there are any nested field additions left that are not already processed, it means the
  // root of the nested field doesn't exist in the schema. In this case we'll walk down the
  // new nested fields and recursively create each nested level as necessary.
  for (Map.Entry<String, Collection<NewField>> newNested :
      newNestedFieldsMap.asMap().entrySet()) {
    String fieldName = newNested.getKey();

    // If the user specifies the same nested field twice in different ways (e.g. a[].x and
    // a{}.x), getOnlyElement will throw.
    FieldAccessDescriptor.FieldDescriptor fieldDescriptor =
        Iterables.getOnlyElement(
            newNested.getValue().stream()
                .map(NewField::getFieldDescriptor)
                .distinct()
                .collect(Collectors.toList()));
    FieldType fieldType = Schema.FieldType.row(Schema.of()).withNullable(true);
    for (Qualifier qualifier : fieldDescriptor.getQualifiers()) {
      // The problem with adding recursive map fields is that we don't know what the map key
      // type should be. In a field descriptor of the form mapField{}.subField, the subField is
      // assumed to be in the map value. Since in this code path the mapField field does not
      // already exist, we need to create it, and we have no way of knowing what type the key
      // should be. Alternatives would be to always create a default key type (e.g.
      // FieldType.STRING) or extend our selector syntax to allow specifying key types.
      checkArgument(
          !qualifier.getKind().equals(Qualifier.Kind.MAP), "Map qualifiers not supported here");
      fieldType = FieldType.array(fieldType).withNullable(true);
    }
    if (!inputSchema.hasField(fieldName)) {
      // This is a brand-new nested field with no matching field in the input schema. We will
      // recursively create a nested schema to match it.
      Collection<NewField> nestedNewFields =
          newNested.getValue().stream().map(NewField::descend).collect(Collectors.toList());
      AddFieldsInformation addFieldsInformation =
          getAddFieldsInformation(fieldType, nestedNewFields);
      builder.addField(fieldName, addFieldsInformation.getOutputFieldType());
      resolvedNestedNewValues.put(builder.getLastFieldId(), addFieldsInformation);
    }
  }
  Schema schema = builder.build();

  List<AddFieldsInformation> nestedNewValueList =
      new ArrayList<>(Collections.nCopies(schema.getFieldCount(), null));
  for (Map.Entry<Integer, AddFieldsInformation> entry : resolvedNestedNewValues.entrySet()) {
    nestedNewValueList.set(entry.getKey(), entry.getValue());
  }
  return AddFieldsInformation.of(
      Schema.FieldType.row(schema), newValuesThisLevel, nestedNewValueList);
}
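This helper underpins Beam's public AddFields transform. A hedged usage sketch (rows is an assumed PCollection<Row> with a schema attached; the field names are illustrative): the hasField() branch above is what lets "details.source" work even when no "details" field exists in the input schema, by recursively creating the nested levels.

PCollection<Row> withExtra =
    rows.apply(
        AddFields.<Row>create()
            .field("processingTime", Schema.FieldType.DATETIME)
            .field("details.source", Schema.FieldType.STRING));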