org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.WriteDisposition Java Examples

The following examples show how to use org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.WriteDisposition. Each example is taken from an open-source project; the source file, project, and license are listed above it.
Example #1
Source File: BigQueryDeadletterSink.java    From feast with Apache License 2.0
@Override
public PDone expand(PCollection<FailedElement> input) {
  TimePartitioning partition = new TimePartitioning().setType("DAY");
  partition.setField(TIMESTAMP_COLUMN);
  input
      .apply("FailedElementToTableRow", ParDo.of(new FailedElementToTableRowFn()))
      .apply(
          "WriteFailedElementsToBigQuery",
          BigQueryIO.writeTableRows()
              .to(getTableSpec())
              .withJsonSchema(getJsonSchema())
              .withTimePartitioning(partition)
              .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED)
              .withWriteDisposition(WriteDisposition.WRITE_APPEND));
  return PDone.in(input.getPipeline());
}
 
Example #2
Source File: FakeJobService.java    From beam with Apache License 2.0
private boolean validateDispositions(
    Table table, CreateDisposition createDisposition, WriteDisposition writeDisposition)
    throws InterruptedException, IOException {
  if (table == null) {
    if (createDisposition == CreateDisposition.CREATE_NEVER) {
      return false;
    }
  } else if (writeDisposition == WriteDisposition.WRITE_TRUNCATE) {
    datasetService.deleteTable(table.getTableReference());
  } else if (writeDisposition == WriteDisposition.WRITE_EMPTY) {
    List<TableRow> allRows =
        datasetService.getAllRows(
            table.getTableReference().getProjectId(),
            table.getTableReference().getDatasetId(),
            table.getTableReference().getTableId());
    if (!allRows.isEmpty()) {
      return false;
    }
  }
  return true;
}
 
Example #3
Source File: WriteRename.java    From beam with Apache License 2.0
public WriteRename(
    BigQueryServices bqServices,
    PCollectionView<String> jobIdToken,
    WriteDisposition writeDisposition,
    CreateDisposition createDisposition,
    int maxRetryJobs,
    String kmsKey) {
  this.bqServices = bqServices;
  this.jobIdToken = jobIdToken;
  this.firstPaneWriteDisposition = writeDisposition;
  this.firstPaneCreateDisposition = createDisposition;
  this.maxRetryJobs = maxRetryJobs;
  this.kmsKey = kmsKey;
}
 
Example #4
Source File: WriteRename.java    From beam with Apache License 2.0
private PendingJobData startWriteRename(
    TableDestination finalTableDestination, Iterable<String> tempTableNames, ProcessContext c)
    throws Exception {
  WriteDisposition writeDisposition =
      (c.pane().getIndex() == 0) ? firstPaneWriteDisposition : WriteDisposition.WRITE_APPEND;
  CreateDisposition createDisposition =
      (c.pane().getIndex() == 0) ? firstPaneCreateDisposition : CreateDisposition.CREATE_NEVER;
  List<TableReference> tempTables =
      StreamSupport.stream(tempTableNames.spliterator(), false)
          .map(table -> BigQueryHelpers.fromJsonString(table, TableReference.class))
          .collect(Collectors.toList());

  // Make sure each destination table gets a unique job id.
  String jobIdPrefix =
      BigQueryHelpers.createJobId(
          c.sideInput(jobIdToken), finalTableDestination, -1, c.pane().getIndex());

  BigQueryHelpers.PendingJob retryJob =
      startCopy(
          bqServices.getJobService(c.getPipelineOptions().as(BigQueryOptions.class)),
          bqServices.getDatasetService(c.getPipelineOptions().as(BigQueryOptions.class)),
          jobIdPrefix,
          finalTableDestination.getTableReference(),
          tempTables,
          writeDisposition,
          createDisposition,
          kmsKey);
  return new PendingJobData(retryJob, finalTableDestination, tempTables);
}
 
Example #5
Source File: BatchLoads.java    From beam with Apache License 2.0
BatchLoads(
    WriteDisposition writeDisposition,
    CreateDisposition createDisposition,
    boolean singletonTable,
    DynamicDestinations<?, DestinationT> dynamicDestinations,
    Coder<DestinationT> destinationCoder,
    ValueProvider<String> customGcsTempLocation,
    @Nullable ValueProvider<String> loadJobProjectId,
    boolean ignoreUnknownValues,
    Coder<ElementT> elementCoder,
    RowWriterFactory<ElementT, DestinationT> rowWriterFactory,
    @Nullable String kmsKey,
    boolean clusteringEnabled,
    boolean useAvroLogicalTypes) {
  bigQueryServices = new BigQueryServicesImpl();
  this.writeDisposition = writeDisposition;
  this.createDisposition = createDisposition;
  this.singletonTable = singletonTable;
  this.dynamicDestinations = dynamicDestinations;
  this.destinationCoder = destinationCoder;
  this.maxNumWritersPerBundle = DEFAULT_MAX_NUM_WRITERS_PER_BUNDLE;
  this.maxFileSize = DEFAULT_MAX_FILE_SIZE;
  this.numFileShards = DEFAULT_NUM_FILE_SHARDS;
  this.maxFilesPerPartition = DEFAULT_MAX_FILES_PER_PARTITION;
  this.maxBytesPerPartition = DEFAULT_MAX_BYTES_PER_PARTITION;
  this.triggeringFrequency = null;
  this.customGcsTempLocation = customGcsTempLocation;
  this.loadJobProjectId = loadJobProjectId;
  this.ignoreUnknownValues = ignoreUnknownValues;
  this.useAvroLogicalTypes = useAvroLogicalTypes;
  this.elementCoder = elementCoder;
  this.kmsKey = kmsKey;
  this.rowWriterFactory = rowWriterFactory;
  this.clusteringEnabled = clusteringEnabled;
  schemaUpdateOptions = Collections.emptySet();
}
 
Example #6
Source File: BigQueryTableProviderTest.java    From beam with Apache License 2.0
@Test
public void testSelectWriteDispositionMethodEmpty() {
  Table table =
      fakeTableWithProperties(
          "hello",
          "{ "
              + WRITE_DISPOSITION_PROPERTY
              + ": "
              + "\""
              + WriteDisposition.WRITE_EMPTY.toString()
              + "\" }");
  BigQueryTable sqlTable = (BigQueryTable) provider.buildBeamSqlTable(table);

  assertEquals(WriteDisposition.WRITE_EMPTY, sqlTable.writeDisposition);
}
 
Example #7
Source File: BigQueryTableProviderTest.java    From beam with Apache License 2.0
@Test
public void testSelectWriteDispositionMethodAppend() {
  Table table =
      fakeTableWithProperties(
          "hello",
          "{ "
              + WRITE_DISPOSITION_PROPERTY
              + ": "
              + "\""
              + WriteDisposition.WRITE_APPEND.toString()
              + "\" }");
  BigQueryTable sqlTable = (BigQueryTable) provider.buildBeamSqlTable(table);

  assertEquals(WriteDisposition.WRITE_APPEND, sqlTable.writeDisposition);
}
 
Example #8
Source File: BigQueryTableProviderTest.java    From beam with Apache License 2.0
@Test
public void testSelectWriteDispositionMethodTruncate() {
  Table table =
      fakeTableWithProperties(
          "hello",
          "{ "
              + WRITE_DISPOSITION_PROPERTY
              + ": "
              + "\""
              + WriteDisposition.WRITE_TRUNCATE.toString()
              + "\" }");
  BigQueryTable sqlTable = (BigQueryTable) provider.buildBeamSqlTable(table);

  assertEquals(WriteDisposition.WRITE_TRUNCATE, sqlTable.writeDisposition);
}
 
Example #9
Source File: WriteWindowedToBigQuery.java    From beam with Apache License 2.0
@Override
public PDone expand(PCollection<T> teamAndScore) {
  teamAndScore
      .apply("ConvertToRow", ParDo.of(new BuildRowFn()))
      .apply(
          BigQueryIO.writeTableRows()
              .to(getTable(projectId, datasetId, tableName))
              .withSchema(getSchema())
              .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED)
              .withWriteDisposition(WriteDisposition.WRITE_APPEND));
  return PDone.in(teamAndScore.getPipeline());
}
 
Example #10
Source File: WriteToBigQuery.java    From beam with Apache License 2.0
@Override
public PDone expand(PCollection<InputT> teamAndScore) {
  teamAndScore
      .apply("ConvertToRow", ParDo.of(new BuildRowFn()))
      .apply(
          BigQueryIO.writeTableRows()
              .to(getTable(projectId, datasetId, tableName))
              .withSchema(getSchema())
              .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED)
              .withWriteDisposition(WriteDisposition.WRITE_APPEND));
  return PDone.in(teamAndScore.getPipeline());
}
 
Example #11
Source File: WriteTables.java    From beam with Apache License 2.0
public WriteTables(
    boolean tempTable,
    BigQueryServices bqServices,
    PCollectionView<String> loadJobIdPrefixView,
    WriteDisposition writeDisposition,
    CreateDisposition createDisposition,
    List<PCollectionView<?>> sideInputs,
    DynamicDestinations<?, DestinationT> dynamicDestinations,
    @Nullable ValueProvider<String> loadJobProjectId,
    int maxRetryJobs,
    boolean ignoreUnknownValues,
    String kmsKey,
    String sourceFormat,
    boolean useAvroLogicalTypes,
    Set<SchemaUpdateOption> schemaUpdateOptions) {

  this.tempTable = tempTable;
  this.bqServices = bqServices;
  this.loadJobIdPrefixView = loadJobIdPrefixView;
  this.firstPaneWriteDisposition = writeDisposition;
  this.firstPaneCreateDisposition = createDisposition;
  this.sideInputs = sideInputs;
  this.dynamicDestinations = dynamicDestinations;
  this.mainOutputTag = new TupleTag<>("WriteTablesMainOutput");
  this.temporaryFilesTag = new TupleTag<>("TemporaryFiles");
  this.loadJobProjectId = loadJobProjectId;
  this.maxRetryJobs = maxRetryJobs;
  this.ignoreUnknownValues = ignoreUnknownValues;
  this.kmsKey = kmsKey;
  this.sourceFormat = sourceFormat;
  this.useAvroLogicalTypes = useAvroLogicalTypes;
  this.schemaUpdateOptions = schemaUpdateOptions;
}
 
Example #12
Source File: DatastoreToBigQuery.java    From DataflowTemplates with Apache License 2.0
/**
 * Runs a pipeline which reads in Entities from Datastore, passes in the JSON encoded Entities
 * to a Javascript UDF that returns JSON that conforms to the BigQuery TableRow spec and writes
 * the TableRows to BigQuery.
 *
 * @param args arguments to the pipeline
 */
public static void main(String[] args) {
  DatastoreToBigQueryOptions options = PipelineOptionsFactory.fromArgs(args)
      .withValidation()
      .as(DatastoreToBigQueryOptions.class);

  Pipeline pipeline = Pipeline.create(options);

  pipeline
      .apply(
          ReadJsonEntities.newBuilder()
              .setGqlQuery(options.getDatastoreReadGqlQuery())
              .setProjectId(options.getDatastoreReadProjectId())
              .setNamespace(options.getDatastoreReadNamespace())
              .build())
      .apply(
          TransformTextViaJavascript.newBuilder()
              .setFileSystemPath(options.getJavascriptTextTransformGcsPath())
              .setFunctionName(options.getJavascriptTextTransformFunctionName())
              .build())
      .apply(BigQueryConverters.jsonToTableRow())
      .apply(
          "WriteBigQuery",
          BigQueryIO.writeTableRows()
              .withoutValidation()
              .to(options.getOutputTableSpec())
              .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED)
              .withWriteDisposition(WriteDisposition.WRITE_TRUNCATE)
              .withCustomGcsTempLocation(options.getBigQueryLoadingTemporaryDirectory()));

  pipeline.run();
}
 
Example #13
Source File: ErrorConverters.java    From DataflowTemplates with Apache License 2.0
@Override
public WriteResult expand(PCollection<FailsafeElement<String, String>> failedRecords) {

  return failedRecords
      .apply("FailedRecordToTableRow", ParDo.of(new FailedStringToTableRowFn()))
      .apply(
          "WriteFailedRecordsToBigQuery",
          BigQueryIO.writeTableRows()
              .to(getErrorRecordsTable())
              .withJsonSchema(getErrorRecordsTableSchema())
              .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED)
              .withWriteDisposition(WriteDisposition.WRITE_APPEND));
}
 
Example #14
Source File: ErrorConverters.java    From DataflowTemplates with Apache License 2.0
@Override
public WriteResult expand(PCollection<FailsafeElement<PubsubMessage, String>> failedRecords) {

  return failedRecords
      .apply("FailedRecordToTableRow", ParDo.of(new FailedPubsubMessageToTableRowFn()))
      .apply(
          "WriteFailedRecordsToBigQuery",
          BigQueryIO.writeTableRows()
              .to(getErrorRecordsTable())
              .withJsonSchema(getErrorRecordsTableSchema())
              .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED)
              .withWriteDisposition(WriteDisposition.WRITE_APPEND));
}
 
Example #15
Source File: KafkaToBigQuery.java    From DataflowTemplates with Apache License 2.0
@Override
public WriteResult expand(
    PCollection<FailsafeElement<KV<String, String>, String>> failedRecords) {

  return failedRecords
      .apply("FailedRecordToTableRow", ParDo.of(new FailedMessageToTableRowFn()))
      .apply(
          "WriteFailedRecordsToBigQuery",
          BigQueryIO.writeTableRows()
              .to(getErrorRecordsTable())
              .withJsonSchema(getErrorRecordsTableSchema())
              .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED)
              .withWriteDisposition(WriteDisposition.WRITE_APPEND));
}
 
Example #16
Source File: ErrorConverters.java    From DataflowTemplates with Apache License 2.0
@Override
public WriteResult expand(
    PCollection<FailsafeElement<PubsubMessage, String>> failedRecords) {

  return failedRecords
      .apply("FailedRecordToTableRow", ParDo.of(new FailedPubsubMessageToTableRowFn()))
      .apply(
          "WriteFailedRecordsToBigQuery",
          BigQueryIO.writeTableRows()
              .to(getErrorRecordsTable())
              .withJsonSchema(getErrorRecordsTableSchema())
              .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED)
              .withWriteDisposition(WriteDisposition.WRITE_APPEND));
}
 
Example #17
Source File: ErrorConverters.java    From DataflowTemplates with Apache License 2.0
@Override
public WriteResult expand(
    PCollection<FailsafeElement<KV<String, String>, String>> failedRecords) {

  return failedRecords
      .apply("FailedRecordToTableRow", ParDo.of(new FailedMessageToTableRowFn()))
      .apply(
          "WriteFailedRecordsToBigQuery",
          BigQueryIO.writeTableRows()
              .to(getErrorRecordsTable())
              .withJsonSchema(getErrorRecordsTableSchema())
              .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED)
              .withWriteDisposition(WriteDisposition.WRITE_APPEND));
}
 
Example #18
Source File: ErrorConverters.java    From DataflowTemplates with Apache License 2.0
@Override
public WriteResult expand(PCollection<FailsafeElement<String, String>> failedRecords) {

  return failedRecords
      .apply("FailedRecordToTableRow", ParDo.of(new FailedStringToTableRowFn()))
      .apply(
          "WriteFailedRecordsToBigQuery",
          BigQueryIO.writeTableRows()
              .to(getErrorRecordsTable())
              .withJsonSchema(getErrorRecordsTableSchema())
              .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED)
              .withWriteDisposition(WriteDisposition.WRITE_APPEND));
}
 
Example #19
Source File: WriteFailedElementToBigQuery.java    From feast with Apache License 2.0
@Override
public WriteResult expand(PCollection<FailedElement> failedElements) {
  return failedElements
      .apply("FailedElementToTableRow", ParDo.of(new FailedElementToTableRowFn()))
      .apply(
          "WriteFailedElementsToBigQuery",
          BigQueryIO.writeTableRows()
              .to(getTableSpec())
              .withJsonSchema(getJsonSchema())
              .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED)
              .withWriteDisposition(WriteDisposition.WRITE_APPEND));
}
 
Example #20
Source File: WriteWindowedToBigQuery.java    From deployment-examples with MIT License
@Override
public PDone expand(PCollection<T> teamAndScore) {
  teamAndScore
      .apply("ConvertToRow", ParDo.of(new BuildRowFn()))
      .apply(
          BigQueryIO.writeTableRows()
              .to(getTable(projectId, datasetId, tableName))
              .withSchema(getSchema())
              .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED)
              .withWriteDisposition(WriteDisposition.WRITE_APPEND));
  return PDone.in(teamAndScore.getPipeline());
}
 
Example #21
Source File: WriteToBigQuery.java    From deployment-examples with MIT License
@Override
public PDone expand(PCollection<InputT> teamAndScore) {
  teamAndScore
      .apply("ConvertToRow", ParDo.of(new BuildRowFn()))
      .apply(
          BigQueryIO.writeTableRows()
              .to(getTable(projectId, datasetId, tableName))
              .withSchema(getSchema())
              .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED)
              .withWriteDisposition(WriteDisposition.WRITE_APPEND));
  return PDone.in(teamAndScore.getPipeline());
}
 
Example #22
Source File: FakeJobService.java    From beam with Apache License 2.0
private JobStatus runCopyJob(JobConfigurationTableCopy copy)
    throws InterruptedException, IOException {
  List<TableReference> sources = copy.getSourceTables();
  TableReference destination = copy.getDestinationTable();
  WriteDisposition writeDisposition = WriteDisposition.valueOf(copy.getWriteDisposition());
  CreateDisposition createDisposition = CreateDisposition.valueOf(copy.getCreateDisposition());
  Table existingTable = datasetService.getTable(destination);
  if (!validateDispositions(existingTable, createDisposition, writeDisposition)) {
    return new JobStatus().setState("FAILED").setErrorResult(new ErrorProto());
  }
  TimePartitioning partitioning = null;
  Clustering clustering = null;
  TableSchema schema = null;
  boolean first = true;
  List<TableRow> allRows = Lists.newArrayList();
  for (TableReference source : sources) {
    Table table = checkNotNull(datasetService.getTable(source));
    if (!first) {
      if (!Objects.equals(partitioning, table.getTimePartitioning())) {
        return new JobStatus().setState("FAILED").setErrorResult(new ErrorProto());
      }
      if (!Objects.equals(clustering, table.getClustering())) {
        return new JobStatus().setState("FAILED").setErrorResult(new ErrorProto());
      }
      if (!Objects.equals(schema, table.getSchema())) {
        return new JobStatus().setState("FAILED").setErrorResult(new ErrorProto());
      }
    }
    partitioning = table.getTimePartitioning();
    clustering = table.getClustering();
    schema = table.getSchema();
    first = false;
    allRows.addAll(
        datasetService.getAllRows(
            source.getProjectId(), source.getDatasetId(), source.getTableId()));
  }
  datasetService.createTable(
      new Table()
          .setTableReference(destination)
          .setSchema(schema)
          .setTimePartitioning(partitioning)
          .setClustering(clustering)
          .setEncryptionConfiguration(copy.getDestinationEncryptionConfiguration()));
  datasetService.insertAll(destination, allRows, null);
  return new JobStatus().setState("DONE");
}
 
Example #23
Source File: FakeJobService.java    From beam with Apache License 2.0
private JobStatus runLoadJob(JobReference jobRef, JobConfigurationLoad load)
    throws InterruptedException, IOException {
  TableReference destination = load.getDestinationTable();
  TableSchema schema = load.getSchema();
  checkArgument(schema != null, "No schema specified");
  List<ResourceId> sourceFiles = filesForLoadJobs.get(jobRef.getProjectId(), jobRef.getJobId());
  WriteDisposition writeDisposition = WriteDisposition.valueOf(load.getWriteDisposition());
  CreateDisposition createDisposition = CreateDisposition.valueOf(load.getCreateDisposition());

  Table existingTable = datasetService.getTable(destination);
  if (!validateDispositions(existingTable, createDisposition, writeDisposition)) {
    return new JobStatus().setState("FAILED").setErrorResult(new ErrorProto());
  }
  if (existingTable == null) {
    TableReference strippedDestination =
        destination
            .clone()
            .setTableId(BigQueryHelpers.stripPartitionDecorator(destination.getTableId()));
    existingTable = new Table().setTableReference(strippedDestination).setSchema(schema);
    if (load.getTimePartitioning() != null) {
      existingTable = existingTable.setTimePartitioning(load.getTimePartitioning());
    }
    if (load.getClustering() != null) {
      existingTable = existingTable.setClustering(load.getClustering());
    }
    datasetService.createTable(existingTable);
  }

  List<TableRow> rows = Lists.newArrayList();
  for (ResourceId filename : sourceFiles) {
    if (load.getSourceFormat().equals("NEWLINE_DELIMITED_JSON")) {
      rows.addAll(readJsonTableRows(filename.toString()));
    } else if (load.getSourceFormat().equals("AVRO")) {
      rows.addAll(readAvroTableRows(filename.toString(), schema));
    }
  }

  datasetService.insertAll(destination, rows, null);
  FileSystems.delete(sourceFiles);
  return new JobStatus().setState("DONE");
}
 
Example #24
Source File: BigQueryKmsKey.java    From java-docs-samples with Apache License 2.0
public static void main(String[] args) {
  // [START dataflow_cmek]
  // Query from the NASA wildfires public dataset:
  // https://console.cloud.google.com/bigquery?p=bigquery-public-data&d=nasa_wildfire&t=past_week&page=table
  String query =
      "SELECT latitude,longitude,acq_date,acq_time,bright_ti4,confidence " +
      "FROM `bigquery-public-data.nasa_wildfire.past_week` " +
      "LIMIT 10";

  // Schema for the output BigQuery table.
  final TableSchema outputSchema = new TableSchema().setFields(Arrays.asList(
      new TableFieldSchema().setName("latitude").setType("FLOAT"),
      new TableFieldSchema().setName("longitude").setType("FLOAT"),
      new TableFieldSchema().setName("acq_date").setType("DATE"),
      new TableFieldSchema().setName("acq_time").setType("TIME"),
      new TableFieldSchema().setName("bright_ti4").setType("FLOAT"),
      new TableFieldSchema().setName("confidence").setType("STRING")));

  // Create the BigQuery options from the command line arguments.
  BigQueryKmsKeyOptions options = PipelineOptionsFactory.fromArgs(args)
      .withValidation().as(BigQueryKmsKeyOptions.class);

  // String outputBigQueryTable = "<project>:<dataset>.<table>";
  String outputBigQueryTable = options.getOutputBigQueryTable();

  // String kmsKey = "projects/<project>/locations/<kms-location>/keyRings/<kms-keyring>/cryptoKeys/<kms-key>";
  String kmsKey = options.getKmsKey();
  
  // Create and run an Apache Beam pipeline.
  Pipeline pipeline = Pipeline.create(options);
  pipeline
      .apply("Read from BigQuery with KMS key",
          BigQueryIO.readTableRows()
              .fromQuery(query)
              .usingStandardSql()
              .withKmsKey(kmsKey))
      .apply("Write to BigQuery with KMS key",
          BigQueryIO.writeTableRows()
              .to(outputBigQueryTable)
              .withSchema(outputSchema)
              .withWriteDisposition(WriteDisposition.WRITE_TRUNCATE)
              .withKmsKey(kmsKey));
  pipeline.run().waitUntilFinish();
  // [END dataflow_cmek]
}
 
Example #25
Source File: WriteTables.java    From beam with Apache License 2.0
@ProcessElement
public void processElement(ProcessContext c, BoundedWindow window) throws Exception {
  dynamicDestinations.setSideInputAccessorFromProcessContext(c);
  DestinationT destination = c.element().getKey().getKey();
  TableSchema tableSchema;
  if (firstPaneCreateDisposition == CreateDisposition.CREATE_NEVER) {
    tableSchema = null;
  } else if (jsonSchemas.containsKey(destination)) {
    tableSchema =
        BigQueryHelpers.fromJsonString(jsonSchemas.get(destination), TableSchema.class);
  } else {
    tableSchema = dynamicDestinations.getSchema(destination);
    checkArgument(
        tableSchema != null,
        "Unless create disposition is %s, a schema must be specified, i.e. "
            + "DynamicDestinations.getSchema() may not return null. "
            + "However, create disposition is %s, and %s returned null for destination %s",
        CreateDisposition.CREATE_NEVER,
        firstPaneCreateDisposition,
        dynamicDestinations,
        destination);
    jsonSchemas.put(destination, BigQueryHelpers.toJsonString(tableSchema));
  }

  TableDestination tableDestination = dynamicDestinations.getTable(destination);
  checkArgument(
      tableDestination != null,
      "DynamicDestinations.getTable() may not return null, "
          + "but %s returned null for destination %s",
      dynamicDestinations,
      destination);
  boolean destinationCoderSupportsClustering =
      !(dynamicDestinations.getDestinationCoder() instanceof TableDestinationCoderV2);
  checkArgument(
      tableDestination.getClustering() == null || destinationCoderSupportsClustering,
      "DynamicDestinations.getTable() may only return destinations with clustering configured"
          + " if a destination coder is supplied that supports clustering, but %s is configured"
          + " to use TableDestinationCoderV2. Set withClustering() on BigQueryIO.write() and, "
          + " if you provided a custom DynamicDestinations instance, override"
          + " getDestinationCoder() to return TableDestinationCoderV3.",
      dynamicDestinations);
  TableReference tableReference = tableDestination.getTableReference();
  if (Strings.isNullOrEmpty(tableReference.getProjectId())) {
    tableReference.setProjectId(c.getPipelineOptions().as(BigQueryOptions.class).getProject());
    tableDestination = tableDestination.withTableReference(tableReference);
  }

  Integer partition = c.element().getKey().getShardNumber();
  List<String> partitionFiles = Lists.newArrayList(c.element().getValue());
  String jobIdPrefix =
      BigQueryHelpers.createJobId(
          c.sideInput(loadJobIdPrefixView), tableDestination, partition, c.pane().getIndex());

  if (tempTable) {
    // This is a temp table. Create a new one for each partition and each pane.
    tableReference.setTableId(jobIdPrefix);
  }

  WriteDisposition writeDisposition = firstPaneWriteDisposition;
  CreateDisposition createDisposition = firstPaneCreateDisposition;
  if (c.pane().getIndex() > 0 && !tempTable) {
    // If writing directly to the destination, then the table is created on the first write
    // and we should change the disposition for subsequent writes.
    writeDisposition = WriteDisposition.WRITE_APPEND;
    createDisposition = CreateDisposition.CREATE_NEVER;
  } else if (tempTable) {
    // In this case, we are writing to a temp table and always need to create it.
    // WRITE_TRUNCATE is set so that we properly handle retries of this pane.
    writeDisposition = WriteDisposition.WRITE_TRUNCATE;
    createDisposition = CreateDisposition.CREATE_IF_NEEDED;
  }

  BigQueryHelpers.PendingJob retryJob =
      startLoad(
          bqServices.getJobService(c.getPipelineOptions().as(BigQueryOptions.class)),
          bqServices.getDatasetService(c.getPipelineOptions().as(BigQueryOptions.class)),
          jobIdPrefix,
          tableReference,
          tableDestination.getTimePartitioning(),
          tableDestination.getClustering(),
          tableSchema,
          partitionFiles,
          writeDisposition,
          createDisposition,
          schemaUpdateOptions);
  pendingJobs.add(
      new PendingJobData(window, retryJob, partitionFiles, tableDestination, tableReference));
}
 
Example #26
Source File: BatchLoads.java    From beam with Apache License 2.0
private PCollection<KV<TableDestination, String>> writeTempTables(
    PCollection<KV<ShardedKey<DestinationT>, List<String>>> input,
    PCollectionView<String> jobIdTokenView) {
  List<PCollectionView<?>> sideInputs = Lists.newArrayList(jobIdTokenView);
  sideInputs.addAll(dynamicDestinations.getSideInputs());

  Coder<KV<ShardedKey<DestinationT>, List<String>>> partitionsCoder =
      KvCoder.of(
          ShardedKeyCoder.of(NullableCoder.of(destinationCoder)),
          ListCoder.of(StringUtf8Coder.of()));

  // If the final destination table exists already (and we're appending to it), then the temp
  // tables must exactly match schema, partitioning, etc. Wrap the DynamicDestinations object
  // with one that makes this happen.
  @SuppressWarnings("unchecked")
  DynamicDestinations<?, DestinationT> destinations = dynamicDestinations;
  if (createDisposition.equals(CreateDisposition.CREATE_IF_NEEDED)
      || createDisposition.equals(CreateDisposition.CREATE_NEVER)) {
    destinations =
        DynamicDestinationsHelpers.matchTableDynamicDestinations(destinations, bigQueryServices);
  }

  Coder<TableDestination> tableDestinationCoder =
      clusteringEnabled ? TableDestinationCoderV3.of() : TableDestinationCoderV2.of();

  // If WriteBundlesToFiles produced more than DEFAULT_MAX_FILES_PER_PARTITION files or
  // DEFAULT_MAX_BYTES_PER_PARTITION bytes, then
  // the import needs to be split into multiple partitions, and those partitions will be
  // specified in multiPartitionsTag.
  return input
      .setCoder(partitionsCoder)
      // Reshuffle will distribute this among multiple workers, and also guard against
      // reexecution of the WritePartitions step once WriteTables has begun.
      .apply("MultiPartitionsReshuffle", Reshuffle.of())
      .apply(
          "MultiPartitionsWriteTables",
          new WriteTables<>(
              true,
              bigQueryServices,
              jobIdTokenView,
              WriteDisposition.WRITE_EMPTY,
              CreateDisposition.CREATE_IF_NEEDED,
              sideInputs,
              destinations,
              loadJobProjectId,
              maxRetryJobs,
              ignoreUnknownValues,
              kmsKey,
              rowWriterFactory.getSourceFormat(),
              useAvroLogicalTypes,
              schemaUpdateOptions))
      .setCoder(KvCoder.of(tableDestinationCoder, StringUtf8Coder.of()));
}
 
Example #27
Source File: StreamingBeamSQL.java    From java-docs-samples with Apache License 2.0
public static void main(final String[] args) {
  Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
  options.setStreaming(true);

  var project = options.as(GcpOptions.class).getProject();
  var subscription = ProjectSubscriptionName.of(project, options.getInputSubscription()).toString();

  var schema = Schema.builder()
      .addStringField("url")
      .addDoubleField("page_score")
      .addDateTimeField("processing_time")
      .build();

  var pipeline = Pipeline.create(options);
  pipeline
      // Read, parse, and validate messages from Pub/Sub.
      .apply("Read messages from Pub/Sub", PubsubIO.readStrings().fromSubscription(subscription))
      .apply("Parse JSON into SQL rows", MapElements.into(TypeDescriptor.of(Row.class)).via(message -> {
        // This is a good place to add error handling.
        // The first transform should act as a validation layer to make sure
        // that any data coming to the processing pipeline must be valid.
        // See `MapElements.MapWithFailures` for more details.
        LOG.info("message: {}", message);
        var msg = GSON.fromJson(message, PageReviewMessage.class);
        return Row.withSchema(schema).addValues(
            msg.url,                                    // row url
            msg.review.equals("positive") ? 1.0 : 0.0,  // row page_score
            new Instant()                               // row processing_time
        ).build();
      })).setRowSchema(schema) // make sure to set the row schema for the PCollection

      // Add timestamps and bundle elements into windows.
      .apply("Add processing time", WithTimestamps.of((row) -> row.getDateTime("processing_time").toInstant()))
      .apply("Fixed-size windows", Window.into(FixedWindows.of(Duration.standardMinutes(1))))

      // Apply a SQL query for every window of elements.
      .apply("Run Beam SQL query", SqlTransform.query(
          "SELECT " +
          "  url, " +
          "  COUNT(page_score) AS num_reviews, " +
          "  AVG(page_score) AS score, " +
          "  MIN(processing_time) AS first_date, " +
          "  MAX(processing_time) AS last_date " +
          "FROM PCOLLECTION " +
          "GROUP BY url"
      ))

      // Convert the SQL Rows into BigQuery TableRows and write them to BigQuery.
      .apply("Convert to BigQuery TableRow", MapElements.into(TypeDescriptor.of(TableRow.class)).via(row -> {
        LOG.info("rating summary: {} {} ({} reviews)", row.getDouble("score"), row.getString("url"),
            row.getInt64("num_reviews"));
        return new TableRow()
            .set("url", row.getString("url"))
            .set("num_reviews", row.getInt64("num_reviews"))
            .set("score", row.getDouble("score"))
            .set("first_date", row.getDateTime("first_date").toInstant().toString())
            .set("last_date", row.getDateTime("last_date").toInstant().toString());
      }))
      .apply("Write to BigQuery", BigQueryIO.writeTableRows()
          .to(options.getOutputTable())
          .withSchema(new TableSchema().setFields(Arrays.asList(
              // To learn more about the valid BigQuery types:
              //   https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types
              new TableFieldSchema().setName("url").setType("STRING"),
              new TableFieldSchema().setName("num_reviews").setType("INTEGER"),
              new TableFieldSchema().setName("score").setType("FLOAT64"),
              new TableFieldSchema().setName("first_date").setType("TIMESTAMP"),
              new TableFieldSchema().setName("last_date").setType("TIMESTAMP"))))
          .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED)
          .withWriteDisposition(WriteDisposition.WRITE_APPEND));

  // For a Dataflow Flex Template, do NOT waitUntilFinish().
  pipeline.run();
}
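The parse step in the example above points at MapElements.MapWithFailures as the place to add validation and error handling. The following is a hedged sketch of that pattern, not part of the original sample; `messages`, `toRow`, and the downstream handling of `failedMessages` are illustrative assumptions.

// Sketch only: route JSON parse failures to a failures PCollection instead of failing the bundle.
// Assumes Beam's MapElements.MapWithFailures API (exceptionsInto/exceptionsVia).
WithFailures.Result<PCollection<Row>, String> parsed =
    messages.apply(
        "Parse JSON into SQL rows",
        MapElements.into(TypeDescriptor.of(Row.class))
            .via((String message) -> toRow(message))      // toRow is a hypothetical parse helper
            .exceptionsInto(TypeDescriptors.strings())
            .exceptionsVia(ee -> ee.element()));          // keep the raw message for a dead-letter sink
PCollection<Row> validRows = parsed.output().setRowSchema(schema);
PCollection<String> failedMessages = parsed.failures();   // write these to a dead-letter table or topic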
 
Example #28
Source File: KafkaToBigQuery.java    From java-docs-samples with Apache License 2.0
public static void main(final String[] args) {
  Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
  options.setStreaming(true);

  var pipeline = Pipeline.create(options);
  pipeline
      .apply("Read messages from Kafka",
          KafkaIO.<String, String>read()
              .withBootstrapServers(options.getBootstrapServer())
              .withTopic(options.getInputTopic())
              .withKeyDeserializer(StringDeserializer.class)
              .withValueDeserializer(StringDeserializer.class)
              .withoutMetadata())
      .apply("Get message contents", Values.<String>create())
      .apply("Log messages", MapElements.into(TypeDescriptor.of(String.class))
          .via(message -> {
            LOG.info("Received: {}", message);
            return message;
          }))
      .apply("Parse JSON", MapElements.into(TypeDescriptor.of(PageRating.class))
          .via(message -> GSON.fromJson(message, PageRating.class)))

      .apply("Add processing time", WithTimestamps.of((pageRating) -> new Instant(pageRating.processingTime)))
      .apply("Fixed-size windows", Window.into(FixedWindows.of(Duration.standardMinutes(1))))

      .apply("Convert to BigQuery TableRow", MapElements.into(TypeDescriptor.of(TableRow.class))
          .via(pageRating -> new TableRow()
              .set("processing_time", pageRating.processingTime.toString())
              .set("url", pageRating.url)
              .set("rating", pageRating.rating)))
      .apply("Write to BigQuery", BigQueryIO.writeTableRows()
          .to(options.getOutputTable())
          .withSchema(new TableSchema().setFields(Arrays.asList(
              new TableFieldSchema().setName("processing_time").setType("TIMESTAMP"),
              new TableFieldSchema().setName("url").setType("STRING"),
              new TableFieldSchema().setName("rating").setType("STRING"))))
          .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED)
          .withWriteDisposition(WriteDisposition.WRITE_APPEND));

  // For a Dataflow Flex Template, do NOT waitUntilFinish().
  pipeline.run();
}
 
Example #29
Source File: BigQueryReadWriteIT.java    From beam with Apache License 2.0
@Test
public void testSQLWriteAndRead_WithWriteDispositionAppend() throws IOException {
  bigQueryTestingTypes.insertRows(
      SOURCE_SCHEMA_TWO,
      row(
          SOURCE_SCHEMA_TWO,
          8223372036854775807L,
          (byte) 256,
          (short) 26892,
          1462973245,
          (float) 2.0,
          2.0,
          true,
          parseTimestampWithUTCTimeZone("2018-05-28 20:17:40.123"),
          "varchar",
          "char",
          Arrays.asList("123", "456")));

  BeamSqlEnv sqlEnv = BeamSqlEnv.inMemory(new BigQueryTableProvider());

  String createTableStatement =
      "CREATE EXTERNAL TABLE TEST( \n"
          + "   c_bigint BIGINT, \n"
          + "   c_tinyint TINYINT, \n"
          + "   c_smallint SMALLINT, \n"
          + "   c_integer INTEGER, \n"
          + "   c_float FLOAT, \n"
          + "   c_double DOUBLE, \n"
          + "   c_boolean BOOLEAN, \n"
          + "   c_timestamp TIMESTAMP, \n"
          + "   c_varchar VARCHAR, \n "
          + "   c_char CHAR, \n"
          + "   c_arr ARRAY<VARCHAR> \n"
          + ") \n"
          + "TYPE 'bigquery' \n"
          + "LOCATION '"
          + bigQueryTestingTypes.tableSpec()
          + "' \n"
          + "TBLPROPERTIES "
          + "'{ "
          + WRITE_DISPOSITION_PROPERTY
          + ": \""
          + WriteDisposition.WRITE_APPEND.toString()
          + "\" }'";
  sqlEnv.executeDdl(createTableStatement);

  String insertStatement =
      "INSERT INTO TEST VALUES ("
          + "9223372036854775807, "
          + "127, "
          + "32767, "
          + "2147483647, "
          + "1.0, "
          + "1.0, "
          + "TRUE, "
          + "TIMESTAMP '2018-05-28 20:17:40.123', "
          + "'varchar', "
          + "'char', "
          + "ARRAY['123', '456']"
          + ")";

  sqlEnv.parseQuery(insertStatement);
  BeamSqlRelUtils.toPCollection(pipeline, sqlEnv.parseQuery(insertStatement));
  pipeline.run().waitUntilFinish(Duration.standardMinutes(5));

  bigQueryTestingTypes
      .assertThatAllRows(SOURCE_SCHEMA_TWO)
      .now(
          containsInAnyOrder(
              row(
                  SOURCE_SCHEMA_TWO,
                  9223372036854775807L,
                  (byte) 127,
                  (short) 32767,
                  2147483647,
                  (float) 1.0,
                  1.0,
                  true,
                  parseTimestampWithUTCTimeZone("2018-05-28 20:17:40.123"),
                  "varchar",
                  "char",
                  Arrays.asList("123", "456")),
              row(
                  SOURCE_SCHEMA_TWO,
                  8223372036854775807L,
                  (byte) 256,
                  (short) 26892,
                  1462973245,
                  (float) 2.0,
                  2.0,
                  true,
                  parseTimestampWithUTCTimeZone("2018-05-28 20:17:40.123"),
                  "varchar",
                  "char",
                  Arrays.asList("123", "456"))));
}
 
Example #30
Source File: BigQueryReadWriteIT.java    From beam with Apache License 2.0
@Test
public void testSQLWriteAndRead_WithWriteDispositionTruncate() throws IOException {
  bigQueryTestingTypes.insertRows(
      SOURCE_SCHEMA_TWO,
      row(
          SOURCE_SCHEMA_TWO,
          8223372036854775807L,
          (byte) 256,
          (short) 26892,
          1462973245,
          (float) 2.0,
          2.0,
          true,
          parseTimestampWithUTCTimeZone("2018-05-28 20:17:40.123"),
          "varchar",
          "char",
          Arrays.asList("123", "456")));

  BeamSqlEnv sqlEnv = BeamSqlEnv.inMemory(new BigQueryTableProvider());

  String createTableStatement =
      "CREATE EXTERNAL TABLE TEST( \n"
          + "   c_bigint BIGINT, \n"
          + "   c_tinyint TINYINT, \n"
          + "   c_smallint SMALLINT, \n"
          + "   c_integer INTEGER, \n"
          + "   c_float FLOAT, \n"
          + "   c_double DOUBLE, \n"
          + "   c_boolean BOOLEAN, \n"
          + "   c_timestamp TIMESTAMP, \n"
          + "   c_varchar VARCHAR, \n "
          + "   c_char CHAR, \n"
          + "   c_arr ARRAY<VARCHAR> \n"
          + ") \n"
          + "TYPE 'bigquery' \n"
          + "LOCATION '"
          + bigQueryTestingTypes.tableSpec()
          + "'"
          + "TBLPROPERTIES "
          + "'{ "
          + WRITE_DISPOSITION_PROPERTY
          + ": \""
          + WriteDisposition.WRITE_TRUNCATE.toString()
          + "\" }'";
  sqlEnv.executeDdl(createTableStatement);

  String insertStatement =
      "INSERT INTO TEST VALUES ("
          + "9223372036854775807, "
          + "127, "
          + "32767, "
          + "2147483647, "
          + "1.0, "
          + "1.0, "
          + "TRUE, "
          + "TIMESTAMP '2018-05-28 20:17:40.123', "
          + "'varchar', "
          + "'char', "
          + "ARRAY['123', '456']"
          + ")";

  sqlEnv.parseQuery(insertStatement);
  BeamSqlRelUtils.toPCollection(pipeline, sqlEnv.parseQuery(insertStatement));
  pipeline.run().waitUntilFinish(Duration.standardMinutes(5));

  bigQueryTestingTypes
      .assertThatAllRows(SOURCE_SCHEMA_TWO)
      .now(
          containsInAnyOrder(
              row(
                  SOURCE_SCHEMA_TWO,
                  9223372036854775807L,
                  (byte) 127,
                  (short) 32767,
                  2147483647,
                  (float) 1.0,
                  1.0,
                  true,
                  parseTimestampWithUTCTimeZone("2018-05-28 20:17:40.123"),
                  "varchar",
                  "char",
                  Arrays.asList("123", "456"))));
}