Java Code Examples for org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO#TypedRead

The following examples show how to use org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO#TypedRead. They are taken from open source projects; the source file and license are listed above each example.
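Both ways of obtaining a TypedRead appear in the examples below: BigQueryIO.readTableRows() yields a TypedRead<TableRow>, while BigQueryIO.read(SerializableFunction) parses the underlying Avro SchemaAndRecord into a custom type. As a minimal sketch (the table spec, the "name" field, and the coder choice are placeholder assumptions, not taken from the examples):

// Sketch only: the table spec and the "name" STRING field are assumed.
BigQueryIO.TypedRead<TableRow> rows = BigQueryIO.readTableRows()
        .from("my-project:my_dataset.my_table");

BigQueryIO.TypedRead<String> names = BigQueryIO
        .read((SchemaAndRecord schemaAndRecord) -> schemaAndRecord.getRecord().get("name").toString())
        .withCoder(StringUtf8Coder.of())
        .from("my-project:my_dataset.my_table");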
Example 1
Source File: BigQueryInputRuntime.java    From components with Apache License 2.0
@Override
public PCollection<IndexedRecord> expand(PBegin in) {
    BigQueryIO.TypedRead<TableRow> bigQueryIOPTransform;
    switch (dataset.sourceType.getValue()) {
    case TABLE_NAME: {
        TableReference table = new TableReference();
        table.setProjectId(datastore.projectName.getValue());
        table.setDatasetId(dataset.bqDataset.getValue());
        table.setTableId(dataset.tableName.getValue());
        // TODO use {@link BigQueryIO#read(SerializableFunction)} instead of readTableRows for better performance:
        // it avoids a redundant type conversion, but take care of each field's type when applying it
        bigQueryIOPTransform = BigQueryIO.readTableRows().from(table);
        break;
    }
    case QUERY: {
        // TODO use {@link BigQueryIO#read(SerializableFunction)} instead of readTableRows for better performance:
        // it avoids a redundant type conversion, but take care of each field's type when applying it
        bigQueryIOPTransform = BigQueryIO.readTableRows().fromQuery(dataset.query.getValue());
        if (!dataset.useLegacySql.getValue()) {
            bigQueryIOPTransform = bigQueryIOPTransform.usingStandardSql();
        } else {
            // flattenResults only matters for legacy SQL:
            // standard SQL does not support flattening results, while legacy SQL flattens them by default.
            // withoutResultFlattening on legacy SQL does not work well until the schema issue is fixed,
            // because BigQueryDatasetRuntime.getSchema relies on the flattened result.
            // bigQueryIOPTransform = bigQueryIOPTransform.withoutResultFlattening();
        }
        break;
    }
    default:
        throw new RuntimeException("To be implemented: " + dataset.sourceType.getValue());
    }

    return in
            .apply(bigQueryIOPTransform)
            .apply(ParDo.of(new TableRowToIndexedRecordFn(defaultOutputCoder.getSchema())))
            .setCoder(defaultOutputCoder);
}
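
The TODO comments above point at BigQueryIO.read(SerializableFunction) as a way to skip the intermediate TableRow. A minimal sketch of that idea, assuming the table's Avro schema is available as avroSchema (not part of the component's code):

    // Sketch only: avroSchema is an assumed, externally supplied Avro schema for the table.
    BigQueryIO.TypedRead<GenericRecord> directRead = BigQueryIO
            .read((SchemaAndRecord schemaAndRecord) -> schemaAndRecord.getRecord())
            .withCoder(AvroCoder.of(avroSchema))
            .from(table);

The parse function then works directly on the Avro GenericRecord, so no TableRow has to be built and discarded for each element.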
 
Example 2
Source File: BeamBQInputTransform.java    From hop with Apache License 2.0
@Override public PCollection<HopRow> expand( PBegin input ) {
  try {
    // Only initialize once on this node/vm
    //
    BeamHop.init(transformPluginClasses, xpPluginClasses);

    // Function to convert from Avro to Hop rows
    //
    BQSchemaAndRecordToHopFn toHopFn = new BQSchemaAndRecordToHopFn( transformName, rowMetaJson, transformPluginClasses, xpPluginClasses );

    TableReference tableReference = new TableReference();
    if (StringUtils.isNotEmpty( projectId )) {
      tableReference.setProjectId( projectId );
    }
    tableReference.setDatasetId( datasetId );
    tableReference.setTableId( tableId );

    BigQueryIO.TypedRead<HopRow> bqTypedRead;

    if (StringUtils.isEmpty( query )) {
      bqTypedRead = BigQueryIO
        .read( toHopFn )
        .from( tableReference )
      ;
    } else {
      bqTypedRead = BigQueryIO
        .read( toHopFn )
        .fromQuery( query )
      ;
    }

    // Apply the function
    //
    PCollection<HopRow> output = input.apply( bqTypedRead );

    return output;

  } catch ( Exception e ) {
    numErrors.inc();
    LOG.error( "Error in beam input transform", e );
    throw new RuntimeException( "Error in beam input transform", e );
  }
}
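
BigQueryIO.read( toHopFn ) compiles because BQSchemaAndRecordToHopFn implements SerializableFunction<SchemaAndRecord, HopRow>. As a rough illustration of that contract (not the Hop implementation), such a parse function can walk the table schema and pull each field out of the Avro record:

  // Illustrative only: emits a List<Object> per record instead of a HopRow.
  SerializableFunction<SchemaAndRecord, List<Object>> parseFn = schemaAndRecord -> {
    TableSchema tableSchema = schemaAndRecord.getTableSchema();
    GenericRecord record = schemaAndRecord.getRecord();
    List<Object> row = new ArrayList<>();
    for ( TableFieldSchema field : tableSchema.getFields() ) {
      row.add( record.get( field.getName() ) );
    }
    return row;
  };
  // BigQueryIO.read(parseFn) would additionally need .withCoder(...),
  // since List<Object> has no default coder.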
 
Example 3
Source File: BeamBQInputTransform.java    From kettle-beam with Apache License 2.0
@Override public PCollection<KettleRow> expand( PBegin input ) {
  try {
    // Only initialize once on this node/vm
    //
    BeamKettle.init(stepPluginClasses, xpPluginClasses);

    // Function to convert from Avro to Kettle rows
    //
    BQSchemaAndRecordToKettleFn toKettleFn = new BQSchemaAndRecordToKettleFn( stepname, rowMetaJson, stepPluginClasses, xpPluginClasses );

    TableReference tableReference = new TableReference();
    if (StringUtils.isNotEmpty( projectId )) {
      tableReference.setProjectId( projectId );
    }
    tableReference.setDatasetId( datasetId );
    tableReference.setTableId( tableId );

    BigQueryIO.TypedRead<KettleRow> bqTypedRead;

    if (StringUtils.isEmpty( query )) {
      bqTypedRead = BigQueryIO
        .read( toKettleFn )
        .from( tableReference )
      ;
    } else {
      bqTypedRead = BigQueryIO
        .read( toKettleFn )
        .fromQuery( query )
      ;
    }

    // Apply the function
    //
    PCollection<KettleRow> output = input.apply( bqTypedRead );

    return output;

  } catch ( Exception e ) {
    numErrors.inc();
    LOG.error( "Error in beam input transform", e );
    throw new RuntimeException( "Error in beam input transform", e );
  }
}
 
Example 4
Source File: Read.java    From gcp-ingestion with Mozilla Public License 2.0
@Override
public PCollection<PubsubMessage> expand(PBegin input) {
  BigQueryIO.TypedRead<PubsubMessage> read = BigQueryIO
      .read((SchemaAndRecord schemaAndRecord) -> {
        TableSchema tableSchema = schemaAndRecord.getTableSchema();
        GenericRecord record = schemaAndRecord.getRecord();

        // We have to take care not to read additional bytes; see
        // https://github.com/mozilla/gcp-ingestion/issues/1266
        ByteBuffer byteBuffer = (ByteBuffer) record.get(FieldName.PAYLOAD);
        byte[] payload = new byte[byteBuffer.limit()];
        byteBuffer.get(payload);

        // We populate attributes for all simple string and timestamp fields, which is complete
        // for raw and error tables.
        // Decoded payload tables also have a top-level nested "metadata" struct; we can mostly
        // just drop this since the same metadata object is encoded in the payload, but we do
        // parse out the document namespace, type, and version since those are necessary in the
        // case of a Sink job that doesn't look at the payload but still may need those
        // attributes in order to route to the correct destination.
        Map<String, String> attributes = new HashMap<>();
        tableSchema.getFields().stream() //
            .filter(f -> !"REPEATED".equals(f.getMode())) //
            .forEach(f -> {
              Object value = record.get(f.getName());
              if (value != null) {
                switch (f.getType()) {
                  case "TIMESTAMP":
                    attributes.put(f.getName(), Time.epochMicrosToTimestamp((Long) value));
                    break;
                  case "STRING":
                  case "INTEGER":
                  case "INT64":
                    attributes.put(f.getName(), value.toString());
                    break;
                  case "RECORD":
                  case "STRUCT":
                    // The only struct we support is the top-level nested "metadata" and we
                    // extract only the attributes needed for destination routing.
                    GenericRecord metadata = (GenericRecord) value;
                    Arrays
                        .asList(Attribute.DOCUMENT_NAMESPACE, Attribute.DOCUMENT_TYPE,
                            Attribute.DOCUMENT_VERSION)
                        .forEach(v -> attributes.put(v, metadata.get(v).toString()));
                    break;
                  // Ignore any other types (only the payload BYTES field should hit this).
                  default:
                    break;
                }
              }
            });
        return new PubsubMessage(payload, attributes);
      }) //
      .withCoder(PubsubMessageWithAttributesCoder.of()) //
      .withTemplateCompatibility() //
      .withoutValidation() //
      .withMethod(method.method);
  switch (source) {
    case TABLE:
      read = read.from(tableSpec);
      break;
    default:
    case QUERY:
      read = read.fromQuery(tableSpec).usingStandardSql();
  }
  if (source == Source.TABLE && method == BigQueryReadMethod.storageapi) {
    if (rowRestriction != null) {
      read = read.withRowRestriction(rowRestriction);
    }
    if (selectedFields != null) {
      read = read.withSelectedFields(selectedFields);
    }
  }
  return input.apply(read);
}
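
Example 4 applies withRowRestriction and withSelectedFields only when reading a table through the BigQuery Storage API, since those options are intended for direct table reads. A minimal standalone sketch of such a read (table spec, column names, and filter are placeholder assumptions):

  // Sketch only: table, columns, and filter are assumed values.
  BigQueryIO.TypedRead<TableRow> storageRead = BigQueryIO.readTableRows()
      .from("my-project:my_dataset.my_table")
      .withMethod(BigQueryIO.TypedRead.Method.DIRECT_READ)
      .withSelectedFields(Arrays.asList("document_id", "submission_timestamp"))
      .withRowRestriction("submission_timestamp >= '2020-01-01'");

With these set, the Storage Read API performs the column projection and row filtering on the server side, so less data is scanned and shipped to the workers.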