org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.CreateDisposition Java Examples

The following examples show how to use org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.CreateDisposition. They are taken from open source projects; the source file and project for each example are noted in its header.
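CreateDisposition controls whether the connector may create the destination table: CREATE_IF_NEEDED creates the table if it does not exist (a schema must be supplied, for example via withSchema() or withJsonSchema()), while CREATE_NEVER fails the write when the table is missing. As a minimal sketch of the typical call chain (the input PCollection, table spec, and schema variable below are placeholders, not taken from any of the projects listed):

// rows is an existing PCollection<TableRow>; tableSchema is a
// com.google.api.services.bigquery.model.TableSchema — both are placeholders.
rows.apply(
    "WriteToBigQuery",
    BigQueryIO.writeTableRows()
        .to("my-project:my_dataset.my_table")   // placeholder table spec
        .withSchema(tableSchema)                // needed so CREATE_IF_NEEDED can create the table
        .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED)
        .withWriteDisposition(WriteDisposition.WRITE_APPEND));
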
Example #1
Source File: BigQueryDeadletterSink.java    From feast with Apache License 2.0
@Override
public PDone expand(PCollection<FailedElement> input) {
  TimePartitioning partition = new TimePartitioning().setType("DAY");
  partition.setField(TIMESTAMP_COLUMN);
  input
      .apply("FailedElementToTableRow", ParDo.of(new FailedElementToTableRowFn()))
      .apply(
          "WriteFailedElementsToBigQuery",
          BigQueryIO.writeTableRows()
              .to(getTableSpec())
              .withJsonSchema(getJsonSchema())
              .withTimePartitioning(partition)
              .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED)
              .withWriteDisposition(WriteDisposition.WRITE_APPEND));
  return PDone.in(input.getPipeline());
}
 
Example #2
Source File: FakeJobService.java    From beam with Apache License 2.0
private boolean validateDispositions(
    Table table, CreateDisposition createDisposition, WriteDisposition writeDisposition)
    throws InterruptedException, IOException {
  if (table == null) {
    if (createDisposition == CreateDisposition.CREATE_NEVER) {
      return false;
    }
  } else if (writeDisposition == WriteDisposition.WRITE_TRUNCATE) {
    datasetService.deleteTable(table.getTableReference());
  } else if (writeDisposition == WriteDisposition.WRITE_EMPTY) {
    List<TableRow> allRows =
        datasetService.getAllRows(
            table.getTableReference().getProjectId(),
            table.getTableReference().getDatasetId(),
            table.getTableReference().getTableId());
    if (!allRows.isEmpty()) {
      return false;
    }
  }
  return true;
}
 
Example #3
Source File: StreamingInserts.java    From beam with Apache License 2.0
/** Constructor. */
private StreamingInserts(
    CreateDisposition createDisposition,
    DynamicDestinations<?, DestinationT> dynamicDestinations,
    BigQueryServices bigQueryServices,
    InsertRetryPolicy retryPolicy,
    boolean extendedErrorInfo,
    boolean skipInvalidRows,
    boolean ignoreUnknownValues,
    boolean ignoreInsertIds,
    Coder<ElementT> elementCoder,
    SerializableFunction<ElementT, TableRow> toTableRow,
    String kmsKey) {
  this.createDisposition = createDisposition;
  this.dynamicDestinations = dynamicDestinations;
  this.bigQueryServices = bigQueryServices;
  this.retryPolicy = retryPolicy;
  this.extendedErrorInfo = extendedErrorInfo;
  this.skipInvalidRows = skipInvalidRows;
  this.ignoreUnknownValues = ignoreUnknownValues;
  this.ignoreInsertIds = ignoreInsertIds;
  this.elementCoder = elementCoder;
  this.toTableRow = toTableRow;
  this.kmsKey = kmsKey;
}
 
Example #4
Source File: StreamingInserts.java    From beam with Apache License 2.0
/** Constructor. */
public StreamingInserts(
    CreateDisposition createDisposition,
    DynamicDestinations<?, DestinationT> dynamicDestinations,
    Coder<ElementT> elementCoder,
    SerializableFunction<ElementT, TableRow> toTableRow) {
  this(
      createDisposition,
      dynamicDestinations,
      new BigQueryServicesImpl(),
      InsertRetryPolicy.alwaysRetry(),
      false,
      false,
      false,
      false,
      elementCoder,
      toTableRow,
      null);
}
 
Example #5
Source File: WriteWindowedToBigQuery.java    From beam with Apache License 2.0
@Override
public PDone expand(PCollection<T> teamAndScore) {
  teamAndScore
      .apply("ConvertToRow", ParDo.of(new BuildRowFn()))
      .apply(
          BigQueryIO.writeTableRows()
              .to(getTable(projectId, datasetId, tableName))
              .withSchema(getSchema())
              .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED)
              .withWriteDisposition(WriteDisposition.WRITE_APPEND));
  return PDone.in(teamAndScore.getPipeline());
}
 
Example #6
Source File: CreateTables.java    From beam with Apache License 2.0
private CreateTables(
    CreateDisposition createDisposition,
    BigQueryServices bqServices,
    DynamicDestinations<?, DestinationT> dynamicDestinations,
    String kmsKey) {
  this.createDisposition = createDisposition;
  this.bqServices = bqServices;
  this.dynamicDestinations = dynamicDestinations;
  this.kmsKey = kmsKey;
}
 
Example #7
Source File: WriteTables.java    From beam with Apache License 2.0
public WriteTables(
    boolean tempTable,
    BigQueryServices bqServices,
    PCollectionView<String> loadJobIdPrefixView,
    WriteDisposition writeDisposition,
    CreateDisposition createDisposition,
    List<PCollectionView<?>> sideInputs,
    DynamicDestinations<?, DestinationT> dynamicDestinations,
    @Nullable ValueProvider<String> loadJobProjectId,
    int maxRetryJobs,
    boolean ignoreUnknownValues,
    String kmsKey,
    String sourceFormat,
    boolean useAvroLogicalTypes,
    Set<SchemaUpdateOption> schemaUpdateOptions) {

  this.tempTable = tempTable;
  this.bqServices = bqServices;
  this.loadJobIdPrefixView = loadJobIdPrefixView;
  this.firstPaneWriteDisposition = writeDisposition;
  this.firstPaneCreateDisposition = createDisposition;
  this.sideInputs = sideInputs;
  this.dynamicDestinations = dynamicDestinations;
  this.mainOutputTag = new TupleTag<>("WriteTablesMainOutput");
  this.temporaryFilesTag = new TupleTag<>("TemporaryFiles");
  this.loadJobProjectId = loadJobProjectId;
  this.maxRetryJobs = maxRetryJobs;
  this.ignoreUnknownValues = ignoreUnknownValues;
  this.kmsKey = kmsKey;
  this.sourceFormat = sourceFormat;
  this.useAvroLogicalTypes = useAvroLogicalTypes;
  this.schemaUpdateOptions = schemaUpdateOptions;
}
 
Example #8
Source File: BatchLoads.java    From beam with Apache License 2.0
BatchLoads(
    WriteDisposition writeDisposition,
    CreateDisposition createDisposition,
    boolean singletonTable,
    DynamicDestinations<?, DestinationT> dynamicDestinations,
    Coder<DestinationT> destinationCoder,
    ValueProvider<String> customGcsTempLocation,
    @Nullable ValueProvider<String> loadJobProjectId,
    boolean ignoreUnknownValues,
    Coder<ElementT> elementCoder,
    RowWriterFactory<ElementT, DestinationT> rowWriterFactory,
    @Nullable String kmsKey,
    boolean clusteringEnabled,
    boolean useAvroLogicalTypes) {
  bigQueryServices = new BigQueryServicesImpl();
  this.writeDisposition = writeDisposition;
  this.createDisposition = createDisposition;
  this.singletonTable = singletonTable;
  this.dynamicDestinations = dynamicDestinations;
  this.destinationCoder = destinationCoder;
  this.maxNumWritersPerBundle = DEFAULT_MAX_NUM_WRITERS_PER_BUNDLE;
  this.maxFileSize = DEFAULT_MAX_FILE_SIZE;
  this.numFileShards = DEFAULT_NUM_FILE_SHARDS;
  this.maxFilesPerPartition = DEFAULT_MAX_FILES_PER_PARTITION;
  this.maxBytesPerPartition = DEFAULT_MAX_BYTES_PER_PARTITION;
  this.triggeringFrequency = null;
  this.customGcsTempLocation = customGcsTempLocation;
  this.loadJobProjectId = loadJobProjectId;
  this.ignoreUnknownValues = ignoreUnknownValues;
  this.useAvroLogicalTypes = useAvroLogicalTypes;
  this.elementCoder = elementCoder;
  this.kmsKey = kmsKey;
  this.rowWriterFactory = rowWriterFactory;
  this.clusteringEnabled = clusteringEnabled;
  schemaUpdateOptions = Collections.emptySet();
}
 
Example #9
Source File: WriteRename.java    From beam with Apache License 2.0
private PendingJobData startWriteRename(
    TableDestination finalTableDestination, Iterable<String> tempTableNames, ProcessContext c)
    throws Exception {
  WriteDisposition writeDisposition =
      (c.pane().getIndex() == 0) ? firstPaneWriteDisposition : WriteDisposition.WRITE_APPEND;
  CreateDisposition createDisposition =
      (c.pane().getIndex() == 0) ? firstPaneCreateDisposition : CreateDisposition.CREATE_NEVER;
  List<TableReference> tempTables =
      StreamSupport.stream(tempTableNames.spliterator(), false)
          .map(table -> BigQueryHelpers.fromJsonString(table, TableReference.class))
          .collect(Collectors.toList());

  // Make sure each destination table gets a unique job id.
  String jobIdPrefix =
      BigQueryHelpers.createJobId(
          c.sideInput(jobIdToken), finalTableDestination, -1, c.pane().getIndex());

  BigQueryHelpers.PendingJob retryJob =
      startCopy(
          bqServices.getJobService(c.getPipelineOptions().as(BigQueryOptions.class)),
          bqServices.getDatasetService(c.getPipelineOptions().as(BigQueryOptions.class)),
          jobIdPrefix,
          finalTableDestination.getTableReference(),
          tempTables,
          writeDisposition,
          createDisposition,
          kmsKey);
  return new PendingJobData(retryJob, finalTableDestination, tempTables);
}
 
Example #10
Source File: WriteRename.java    From beam with Apache License 2.0
public WriteRename(
    BigQueryServices bqServices,
    PCollectionView<String> jobIdToken,
    WriteDisposition writeDisposition,
    CreateDisposition createDisposition,
    int maxRetryJobs,
    String kmsKey) {
  this.bqServices = bqServices;
  this.jobIdToken = jobIdToken;
  this.firstPaneWriteDisposition = writeDisposition;
  this.firstPaneCreateDisposition = createDisposition;
  this.maxRetryJobs = maxRetryJobs;
  this.kmsKey = kmsKey;
}
 
Example #11
Source File: WriteFailedElementToBigQuery.java    From feast with Apache License 2.0
@Override
public WriteResult expand(PCollection<FailedElement> failedElements) {
  return failedElements
      .apply("FailedElementToTableRow", ParDo.of(new FailedElementToTableRowFn()))
      .apply(
          "WriteFailedElementsToBigQuery",
          BigQueryIO.writeTableRows()
              .to(getTableSpec())
              .withJsonSchema(getJsonSchema())
              .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED)
              .withWriteDisposition(WriteDisposition.WRITE_APPEND));
}
 
Example #12
Source File: WriteToBigQuery.java    From beam with Apache License 2.0
@Override
public PDone expand(PCollection<InputT> teamAndScore) {
  teamAndScore
      .apply("ConvertToRow", ParDo.of(new BuildRowFn()))
      .apply(
          BigQueryIO.writeTableRows()
              .to(getTable(projectId, datasetId, tableName))
              .withSchema(getSchema())
              .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED)
              .withWriteDisposition(WriteDisposition.WRITE_APPEND));
  return PDone.in(teamAndScore.getPipeline());
}
 
Example #13
Source File: DatastoreToBigQuery.java    From DataflowTemplates with Apache License 2.0
/**
 * Runs a pipeline which reads in Entities from Datastore, passes in the JSON encoded Entities
 * to a Javascript UDF that returns JSON that conforms to the BigQuery TableRow spec and writes
 * the TableRows to BigQuery.
 *
 * @param args arguments to the pipeline
 */
public static void main(String[] args) {
  DatastoreToBigQueryOptions options = PipelineOptionsFactory.fromArgs(args)
      .withValidation()
      .as(DatastoreToBigQueryOptions.class);

  Pipeline pipeline = Pipeline.create(options);

  pipeline
      .apply(
          ReadJsonEntities.newBuilder()
              .setGqlQuery(options.getDatastoreReadGqlQuery())
              .setProjectId(options.getDatastoreReadProjectId())
              .setNamespace(options.getDatastoreReadNamespace())
              .build())
      .apply(
          TransformTextViaJavascript.newBuilder()
              .setFileSystemPath(options.getJavascriptTextTransformGcsPath())
              .setFunctionName(options.getJavascriptTextTransformFunctionName())
              .build())
      .apply(BigQueryConverters.jsonToTableRow())
      .apply(
          "WriteBigQuery",
          BigQueryIO.writeTableRows()
              .withoutValidation()
              .to(options.getOutputTableSpec())
              .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED)
              .withWriteDisposition(WriteDisposition.WRITE_TRUNCATE)
              .withCustomGcsTempLocation(options.getBigQueryLoadingTemporaryDirectory()));

  pipeline.run();
}
 
Example #14
Source File: ErrorConverters.java    From DataflowTemplates with Apache License 2.0
@Override
public WriteResult expand(PCollection<FailsafeElement<PubsubMessage, String>> failedRecords) {

  return failedRecords
      .apply("FailedRecordToTableRow", ParDo.of(new FailedPubsubMessageToTableRowFn()))
      .apply(
          "WriteFailedRecordsToBigQuery",
          BigQueryIO.writeTableRows()
              .to(getErrorRecordsTable())
              .withJsonSchema(getErrorRecordsTableSchema())
              .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED)
              .withWriteDisposition(WriteDisposition.WRITE_APPEND));
}
 
Example #15
Source File: WriteToBigQuery.java    From deployment-examples with MIT License
@Override
public PDone expand(PCollection<InputT> teamAndScore) {
  teamAndScore
      .apply("ConvertToRow", ParDo.of(new BuildRowFn()))
      .apply(
          BigQueryIO.writeTableRows()
              .to(getTable(projectId, datasetId, tableName))
              .withSchema(getSchema())
              .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED)
              .withWriteDisposition(WriteDisposition.WRITE_APPEND));
  return PDone.in(teamAndScore.getPipeline());
}
 
Example #16
Source File: WriteWindowedToBigQuery.java    From deployment-examples with MIT License
@Override
public PDone expand(PCollection<T> teamAndScore) {
  teamAndScore
      .apply("ConvertToRow", ParDo.of(new BuildRowFn()))
      .apply(
          BigQueryIO.writeTableRows()
              .to(getTable(projectId, datasetId, tableName))
              .withSchema(getSchema())
              .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED)
              .withWriteDisposition(WriteDisposition.WRITE_APPEND));
  return PDone.in(teamAndScore.getPipeline());
}
 
Example #17
Source File: ErrorConverters.java    From DataflowTemplates with Apache License 2.0
@Override
public WriteResult expand(PCollection<FailsafeElement<String, String>> failedRecords) {

  return failedRecords
      .apply("FailedRecordToTableRow", ParDo.of(new FailedStringToTableRowFn()))
      .apply(
          "WriteFailedRecordsToBigQuery",
          BigQueryIO.writeTableRows()
              .to(getErrorRecordsTable())
              .withJsonSchema(getErrorRecordsTableSchema())
              .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED)
              .withWriteDisposition(WriteDisposition.WRITE_APPEND));
}
 
Example #18
Source File: ErrorConverters.java    From DataflowTemplates with Apache License 2.0
@Override
public WriteResult expand(
    PCollection<FailsafeElement<KV<String, String>, String>> failedRecords) {

  return failedRecords
      .apply("FailedRecordToTableRow", ParDo.of(new FailedMessageToTableRowFn()))
      .apply(
          "WriteFailedRecordsToBigQuery",
          BigQueryIO.writeTableRows()
              .to(getErrorRecordsTable())
              .withJsonSchema(getErrorRecordsTableSchema())
              .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED)
              .withWriteDisposition(WriteDisposition.WRITE_APPEND));
}
 
Example #19
Source File: ErrorConverters.java    From DataflowTemplates with Apache License 2.0
@Override
public WriteResult expand(
    PCollection<FailsafeElement<PubsubMessage, String>> failedRecords) {

  return failedRecords
      .apply("FailedRecordToTableRow", ParDo.of(new FailedPubsubMessageToTableRowFn()))
      .apply(
          "WriteFailedRecordsToBigQuery",
          BigQueryIO.writeTableRows()
              .to(getErrorRecordsTable())
              .withJsonSchema(getErrorRecordsTableSchema())
              .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED)
              .withWriteDisposition(WriteDisposition.WRITE_APPEND));
}
 
Example #20
Source File: KafkaToBigQuery.java    From DataflowTemplates with Apache License 2.0
@Override
public WriteResult expand(
    PCollection<FailsafeElement<KV<String, String>, String>> failedRecords) {

  return failedRecords
      .apply("FailedRecordToTableRow", ParDo.of(new FailedMessageToTableRowFn()))
      .apply(
          "WriteFailedRecordsToBigQuery",
          BigQueryIO.writeTableRows()
              .to(getErrorRecordsTable())
              .withJsonSchema(getErrorRecordsTableSchema())
              .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED)
              .withWriteDisposition(WriteDisposition.WRITE_APPEND));
}
 
Example #21
Source File: ErrorConverters.java    From DataflowTemplates with Apache License 2.0
@Override
public WriteResult expand(PCollection<FailsafeElement<String, String>> failedRecords) {

  return failedRecords
      .apply("FailedRecordToTableRow", ParDo.of(new FailedStringToTableRowFn()))
      .apply(
          "WriteFailedRecordsToBigQuery",
          BigQueryIO.writeTableRows()
              .to(getErrorRecordsTable())
              .withJsonSchema(getErrorRecordsTableSchema())
              .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED)
              .withWriteDisposition(WriteDisposition.WRITE_APPEND));
}
 
Example #22
Source File: CreateTables.java    From beam with Apache License 2.0
private TableDestination getTableDestination(ProcessContext context, DestinationT destination) {
  TableDestination tableDestination = dynamicDestinations.getTable(destination);
  checkArgument(
      tableDestination != null,
      "DynamicDestinations.getTable() may not return null, "
          + "but %s returned null for destination %s",
      dynamicDestinations,
      destination);
  checkArgument(
      tableDestination.getTableSpec() != null,
      "DynamicDestinations.getTable() must return a TableDestination "
          + "with a non-null table spec, but %s returned %s for destination %s,"
          + "which has a null table spec",
      dynamicDestinations,
      tableDestination,
      destination);
  boolean destinationCoderSupportsClustering =
      !(dynamicDestinations.getDestinationCoder() instanceof TableDestinationCoderV2);
  checkArgument(
      tableDestination.getClustering() == null || destinationCoderSupportsClustering,
      "DynamicDestinations.getTable() may only return destinations with clustering configured"
          + " if a destination coder is supplied that supports clustering, but %s is configured"
          + " to use TableDestinationCoderV2. Set withClustering() on BigQueryIO.write() and, "
          + " if you provided a custom DynamicDestinations instance, override"
          + " getDestinationCoder() to return TableDestinationCoderV3.",
      dynamicDestinations);
  TableReference tableReference = tableDestination.getTableReference().clone();
  if (Strings.isNullOrEmpty(tableReference.getProjectId())) {
    tableReference.setProjectId(
        context.getPipelineOptions().as(BigQueryOptions.class).getProject());
    tableDestination = tableDestination.withTableReference(tableReference);
  }
  if (createDisposition == CreateDisposition.CREATE_NEVER) {
    return tableDestination;
  }

  String tableSpec = BigQueryHelpers.stripPartitionDecorator(tableDestination.getTableSpec());
  if (!createdTables.contains(tableSpec)) {
    // Another thread may have succeeded in creating the table in the meanwhile, so
    // check again. This check isn't needed for correctness, but we add it to prevent
    // every thread from attempting a create and overwhelming our BigQuery quota.
    synchronized (createdTables) {
      if (!createdTables.contains(tableSpec)) {
        tryCreateTable(context, destination, tableDestination, tableSpec, kmsKey);
      }
    }
  }
  return tableDestination;
}
 
Example #23
Source File: KafkaToBigQuery.java    From java-docs-samples with Apache License 2.0
public static void main(final String[] args) {
  Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
  options.setStreaming(true);

  var pipeline = Pipeline.create(options);
  pipeline
      .apply("Read messages from Kafka",
          KafkaIO.<String, String>read()
              .withBootstrapServers(options.getBootstrapServer())
              .withTopic(options.getInputTopic())
              .withKeyDeserializer(StringDeserializer.class)
              .withValueDeserializer(StringDeserializer.class)
              .withoutMetadata())
      .apply("Get message contents", Values.<String>create())
      .apply("Log messages", MapElements.into(TypeDescriptor.of(String.class))
          .via(message -> {
            LOG.info("Received: {}", message);
            return message;
          }))
      .apply("Parse JSON", MapElements.into(TypeDescriptor.of(PageRating.class))
          .via(message -> GSON.fromJson(message, PageRating.class)))

      .apply("Add processing time", WithTimestamps.of((pageRating) -> new Instant(pageRating.processingTime)))
      .apply("Fixed-size windows", Window.into(FixedWindows.of(Duration.standardMinutes(1))))

      .apply("Convert to BigQuery TableRow", MapElements.into(TypeDescriptor.of(TableRow.class))
          .via(pageRating -> new TableRow()
              .set("processing_time", pageRating.processingTime.toString())
              .set("url", pageRating.url)
              .set("rating", pageRating.rating)))
      .apply("Write to BigQuery", BigQueryIO.writeTableRows()
          .to(options.getOutputTable())
          .withSchema(new TableSchema().setFields(Arrays.asList(
              new TableFieldSchema().setName("processing_time").setType("TIMESTAMP"),
              new TableFieldSchema().setName("url").setType("STRING"),
              new TableFieldSchema().setName("rating").setType("STRING"))))
          .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED)
          .withWriteDisposition(WriteDisposition.WRITE_APPEND));

  // For a Dataflow Flex Template, do NOT waitUntilFinish().
  pipeline.run();
}
 
Example #24
Source File: StreamingBeamSQL.java    From java-docs-samples with Apache License 2.0
public static void main(final String[] args) {
  Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
  options.setStreaming(true);

  var project = options.as(GcpOptions.class).getProject();
  var subscription = ProjectSubscriptionName.of(project, options.getInputSubscription()).toString();

  var schema = Schema.builder()
      .addStringField("url")
      .addDoubleField("page_score")
      .addDateTimeField("processing_time")
      .build();

  var pipeline = Pipeline.create(options);
  pipeline
      // Read, parse, and validate messages from Pub/Sub.
      .apply("Read messages from Pub/Sub", PubsubIO.readStrings().fromSubscription(subscription))
      .apply("Parse JSON into SQL rows", MapElements.into(TypeDescriptor.of(Row.class)).via(message -> {
        // This is a good place to add error handling.
        // The first transform should act as a validation layer to make sure
        // that any data coming to the processing pipeline must be valid.
        // See `MapElements.MapWithFailures` for more details.
        LOG.info("message: {}", message);
        var msg = GSON.fromJson(message, PageReviewMessage.class);
        return Row.withSchema(schema).addValues(
            msg.url,                                    // row url
            msg.review.equals("positive") ? 1.0 : 0.0,  // row page_score
            new Instant()                               // row processing_time
        ).build();
      })).setRowSchema(schema) // make sure to set the row schema for the PCollection

      // Add timestamps and bundle elements into windows.
      .apply("Add processing time", WithTimestamps.of((row) -> row.getDateTime("processing_time").toInstant()))
      .apply("Fixed-size windows", Window.into(FixedWindows.of(Duration.standardMinutes(1))))

      // Apply a SQL query for every window of elements.
      .apply("Run Beam SQL query", SqlTransform.query(
          "SELECT " +
          "  url, " +
          "  COUNT(page_score) AS num_reviews, " +
          "  AVG(page_score) AS score, " +
          "  MIN(processing_time) AS first_date, " +
          "  MAX(processing_time) AS last_date " +
          "FROM PCOLLECTION " +
          "GROUP BY url"
      ))

      // Convert the SQL Rows into BigQuery TableRows and write them to BigQuery.
      .apply("Convert to BigQuery TableRow", MapElements.into(TypeDescriptor.of(TableRow.class)).via(row -> {
        LOG.info("rating summary: {} {} ({} reviews)", row.getDouble("score"), row.getString("url"),
            row.getInt64("num_reviews"));
        return new TableRow()
            .set("url", row.getString("url"))
            .set("num_reviews", row.getInt64("num_reviews"))
            .set("score", row.getDouble("score"))
            .set("first_date", row.getDateTime("first_date").toInstant().toString())
            .set("last_date", row.getDateTime("last_date").toInstant().toString());
      }))
      .apply("Write to BigQuery", BigQueryIO.writeTableRows()
          .to(options.getOutputTable())
          .withSchema(new TableSchema().setFields(Arrays.asList(
              // To learn more about the valid BigQuery types:
              //   https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types
              new TableFieldSchema().setName("url").setType("STRING"),
              new TableFieldSchema().setName("num_reviews").setType("INTEGER"),
              new TableFieldSchema().setName("score").setType("FLOAT64"),
              new TableFieldSchema().setName("first_date").setType("TIMESTAMP"),
              new TableFieldSchema().setName("last_date").setType("TIMESTAMP"))))
          .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED)
          .withWriteDisposition(WriteDisposition.WRITE_APPEND));

  // For a Dataflow Flex Template, do NOT waitUntilFinish().
  pipeline.run();
}
 
Example #25
Source File: FakeJobService.java    From beam with Apache License 2.0
private JobStatus runCopyJob(JobConfigurationTableCopy copy)
    throws InterruptedException, IOException {
  List<TableReference> sources = copy.getSourceTables();
  TableReference destination = copy.getDestinationTable();
  WriteDisposition writeDisposition = WriteDisposition.valueOf(copy.getWriteDisposition());
  CreateDisposition createDisposition = CreateDisposition.valueOf(copy.getCreateDisposition());
  Table existingTable = datasetService.getTable(destination);
  if (!validateDispositions(existingTable, createDisposition, writeDisposition)) {
    return new JobStatus().setState("FAILED").setErrorResult(new ErrorProto());
  }
  TimePartitioning partitioning = null;
  Clustering clustering = null;
  TableSchema schema = null;
  boolean first = true;
  List<TableRow> allRows = Lists.newArrayList();
  for (TableReference source : sources) {
    Table table = checkNotNull(datasetService.getTable(source));
    if (!first) {
      if (!Objects.equals(partitioning, table.getTimePartitioning())) {
        return new JobStatus().setState("FAILED").setErrorResult(new ErrorProto());
      }
      if (!Objects.equals(clustering, table.getClustering())) {
        return new JobStatus().setState("FAILED").setErrorResult(new ErrorProto());
      }
      if (!Objects.equals(schema, table.getSchema())) {
        return new JobStatus().setState("FAILED").setErrorResult(new ErrorProto());
      }
    }
    partitioning = table.getTimePartitioning();
    clustering = table.getClustering();
    schema = table.getSchema();
    first = false;
    allRows.addAll(
        datasetService.getAllRows(
            source.getProjectId(), source.getDatasetId(), source.getTableId()));
  }
  datasetService.createTable(
      new Table()
          .setTableReference(destination)
          .setSchema(schema)
          .setTimePartitioning(partitioning)
          .setClustering(clustering)
          .setEncryptionConfiguration(copy.getDestinationEncryptionConfiguration()));
  datasetService.insertAll(destination, allRows, null);
  return new JobStatus().setState("DONE");
}
 
Example #26
Source File: FakeJobService.java    From beam with Apache License 2.0
private JobStatus runLoadJob(JobReference jobRef, JobConfigurationLoad load)
    throws InterruptedException, IOException {
  TableReference destination = load.getDestinationTable();
  TableSchema schema = load.getSchema();
  checkArgument(schema != null, "No schema specified");
  List<ResourceId> sourceFiles = filesForLoadJobs.get(jobRef.getProjectId(), jobRef.getJobId());
  WriteDisposition writeDisposition = WriteDisposition.valueOf(load.getWriteDisposition());
  CreateDisposition createDisposition = CreateDisposition.valueOf(load.getCreateDisposition());

  Table existingTable = datasetService.getTable(destination);
  if (!validateDispositions(existingTable, createDisposition, writeDisposition)) {
    return new JobStatus().setState("FAILED").setErrorResult(new ErrorProto());
  }
  if (existingTable == null) {
    TableReference strippedDestination =
        destination
            .clone()
            .setTableId(BigQueryHelpers.stripPartitionDecorator(destination.getTableId()));
    existingTable = new Table().setTableReference(strippedDestination).setSchema(schema);
    if (load.getTimePartitioning() != null) {
      existingTable = existingTable.setTimePartitioning(load.getTimePartitioning());
    }
    if (load.getClustering() != null) {
      existingTable = existingTable.setClustering(load.getClustering());
    }
    datasetService.createTable(existingTable);
  }

  List<TableRow> rows = Lists.newArrayList();
  for (ResourceId filename : sourceFiles) {
    if (load.getSourceFormat().equals("NEWLINE_DELIMITED_JSON")) {
      rows.addAll(readJsonTableRows(filename.toString()));
    } else if (load.getSourceFormat().equals("AVRO")) {
      rows.addAll(readAvroTableRows(filename.toString(), schema));
    }
  }

  datasetService.insertAll(destination, rows, null);
  FileSystems.delete(sourceFiles);
  return new JobStatus().setState("DONE");
}
 
Example #27
Source File: OpinionAnalysisPipeline.java    From dataflow-opinion-analysis with Apache License 2.0
/**
 * This function creates the DAG graph of transforms. It can be called from main()
 * as well as from the ControlPipeline.
 * @param options
 * @return
 * @throws Exception
 */
public static Pipeline createNLPPipeline(IndexerPipelineOptions options) throws Exception {
	
    IndexerPipelineUtils.validateIndexerPipelineOptions(options);
	Pipeline pipeline = Pipeline.create(options);
	
	PCollection<InputContent> readContent;
	PCollection<String> rawInput;
	
	if (options.isStreaming()) {
		
		// Continuously read from a Pub/Sub topic
		rawInput = pipeline.apply("Read from PubSub", 
			PubsubIO.readStrings().fromTopic(
				options.getPubsubTopic())); 
		
	
	} else {
		// Read from GCS files

		rawInput = pipeline.apply("Read from GCS files", 
			Read.from(new RecordFileSource<String>(
				ValueProvider.StaticValueProvider.of(options.getInputFile()), 
				StringUtf8Coder.of(), 
				RecordFileSource.DEFAULT_RECORD_SEPARATOR)));
	}

	readContent = rawInput.apply(ParDo.of(new ParseRawInput()));
	
	// Extract opinions from online opinions
	PCollection<ContentIndexSummary> indexes = readContent
		.apply(ParDo.of(new IndexDocument())) 
		.setCoder(AvroCoder.of(ContentIndexSummary.class));
	

	// Write into BigQuery 
	PCollectionTuple bqrows = indexes
		.apply(ParDo.of(new CreateTableRowsFromIndexSummaryFn())
			.withOutputTags(webresourceTag, // main output collection
				TupleTagList.of(documentTag).and(sentimentTag)) // 2 side output collections
			); 
	
	PCollection<TableRow> webresourceRows = bqrows.get(webresourceTag);
	PCollection<TableRow> documentRows = bqrows.get(documentTag);
	PCollection<TableRow> sentimentRows = bqrows.get(sentimentTag);

	// Append or Overwrite
	WriteDisposition dispo = options.getWriteTruncate() ? 
			WriteDisposition.WRITE_TRUNCATE : WriteDisposition.WRITE_APPEND;
	
		
	webresourceRows
		.apply("Write to webresource", 
			BigQueryIO.writeTableRows()
				.to(getWebResourceTableReference(options)) 
				.withSchema(getWebResourceSchema())
				.withCreateDisposition(CreateDisposition.CREATE_NEVER)
				.withWriteDisposition(dispo)); 
	
	documentRows
		.apply("Write to document", 
			BigQueryIO.writeTableRows()
				.to(getDocumentTableReference(options))
				.withSchema(getDocumentTableSchema())
				.withCreateDisposition(CreateDisposition.CREATE_NEVER)
				.withWriteDisposition(dispo)); 
	
	sentimentRows
		.apply("Write to sentiment", 
			BigQueryIO.writeTableRows()
				.to(getSentimentTableReference(options)) 
				.withSchema(getSentimentSchema())
				.withCreateDisposition(CreateDisposition.CREATE_NEVER)
				.withWriteDisposition(dispo));

	
	return pipeline;
}
 
Example #28
Source File: PubsubAvroToBigQuery.java    From DataflowTemplates with Apache License 2.0
/**
 * Runs the pipeline with the supplied options.
 *
 * @param options execution parameters to the pipeline
 * @return result of the pipeline execution as a {@link PipelineResult}
 */
private static PipelineResult run(PubsubAvroToBigQueryOptions options) {

  // Create the pipeline.
  Pipeline pipeline = Pipeline.create(options);

  Schema schema = SchemaUtils.getAvroSchema(options.getSchemaPath());

  WriteResult writeResults =
      pipeline
          .apply(
              "Read Avro records",
              PubsubIO
                  .readAvroGenericRecords(schema)
                  .fromSubscription(options.getInputSubscription()))

          .apply(
              "Write to BigQuery",
              BigQueryIO.<GenericRecord>write()
                  .to(options.getOutputTableSpec())
                  .useBeamSchema()
                  .withMethod(Method.STREAMING_INSERTS)
                  .withWriteDisposition(WriteDisposition.valueOf(options.getWriteDisposition()))
                  .withCreateDisposition(
                      CreateDisposition.valueOf(options.getCreateDisposition()))
                  .withFailedInsertRetryPolicy(InsertRetryPolicy.retryTransientErrors())
                  .withExtendedErrorInfo());

  writeResults
      .getFailedInsertsWithErr()
      .apply(
          "Create error payload",
          ErrorConverters.BigQueryInsertErrorToPubsubMessage.<GenericRecord>newBuilder()
              .setPayloadCoder(AvroCoder.of(schema))
              .setTranslateFunction(
                  BigQueryConverters.TableRowToGenericRecordFn.of(schema))
              .build())
      .apply(
          "Write failed records",
          PubsubIO.writeMessages().to(options.getOutputTopic()));

  // Execute the pipeline and return the result.
  return pipeline.run();
}
 
Example #29
Source File: CreateTables.java    From beam with Apache License 2.0
private void tryCreateTable(
    ProcessContext context,
    DestinationT destination,
    TableDestination tableDestination,
    String tableSpec,
    String kmsKey) {
  DatasetService datasetService =
      bqServices.getDatasetService(context.getPipelineOptions().as(BigQueryOptions.class));
  TableReference tableReference = tableDestination.getTableReference().clone();
  tableReference.setTableId(
      BigQueryHelpers.stripPartitionDecorator(tableReference.getTableId()));
  try {
    if (datasetService.getTable(tableReference) == null) {
      TableSchema tableSchema = dynamicDestinations.getSchema(destination);
      checkArgument(
          tableSchema != null,
          "Unless create disposition is %s, a schema must be specified, i.e. "
              + "DynamicDestinations.getSchema() may not return null. "
              + "However, create disposition is %s, and "
              + " %s returned null for destination %s",
          CreateDisposition.CREATE_NEVER,
          createDisposition,
          dynamicDestinations,
          destination);
      Table table =
          new Table()
              .setTableReference(tableReference)
              .setSchema(tableSchema)
              .setDescription(tableDestination.getTableDescription());
      if (tableDestination.getTimePartitioning() != null) {
        table.setTimePartitioning(tableDestination.getTimePartitioning());
        if (tableDestination.getClustering() != null) {
          table.setClustering(tableDestination.getClustering());
        }
      }
      if (kmsKey != null) {
        table.setEncryptionConfiguration(new EncryptionConfiguration().setKmsKeyName(kmsKey));
      }
      datasetService.createTable(table);
    }
  } catch (Exception e) {
    throw new RuntimeException(e);
  }
  createdTables.add(tableSpec);
}
 
Example #30
Source File: Snippets.java    From beam with Apache License 2.0
public static void main(String[] args) {

      // [START BigQueryIODeadLetter]

      PipelineOptions options =
          PipelineOptionsFactory.fromArgs(args).withValidation().as(BigQueryOptions.class);

      Pipeline p = Pipeline.create(options);

      // Create a bug by writing the 2nd value as null. The API will correctly
      // throw an error when trying to insert a null value into a REQUIRED field.
      WriteResult result =
          p.apply(Create.of(1, 2))
              .apply(
                  BigQueryIO.<Integer>write()
                      .withSchema(
                          new TableSchema()
                              .setFields(
                                  ImmutableList.of(
                                      new TableFieldSchema()
                                          .setName("num")
                                          .setType("INTEGER")
                                          .setMode("REQUIRED"))))
                      .to("Test.dummyTable")
                      .withFormatFunction(x -> new TableRow().set("num", (x == 2) ? null : x))
                      .withFailedInsertRetryPolicy(InsertRetryPolicy.retryTransientErrors())
                      // Forcing the bounded pipeline to use streaming inserts
                      .withMethod(BigQueryIO.Write.Method.STREAMING_INSERTS)
                      // set the withExtendedErrorInfo property.
                      .withExtendedErrorInfo()
                      .withCreateDisposition(BigQueryIO.Write.CreateDisposition.CREATE_IF_NEEDED)
                      .withWriteDisposition(BigQueryIO.Write.WriteDisposition.WRITE_APPEND));

      result
          .getFailedInsertsWithErr()
          .apply(
              MapElements.into(TypeDescriptors.strings())
                  .via(
                      x -> {
                        System.out.println(" The table was " + x.getTable());
                        System.out.println(" The row was " + x.getRow());
                        System.out.println(" The error was " + x.getError());
                        return "";
                      }));
      p.run();

      /*  Sample Output From the pipeline:
       <p>The table was GenericData{classInfo=[datasetId, projectId, tableId], {datasetId=Test,projectId=<>, tableId=dummyTable}}
       <p>The row was GenericData{classInfo=[f], {num=null}}
       <p>The error was GenericData{classInfo=[errors, index],{errors=[GenericData{classInfo=[debugInfo, location, message, reason], {debugInfo=,location=, message=Missing required field: Msg_0_CLOUD_QUERY_TABLE.num., reason=invalid}}],index=0}}
      */
    }