org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO Java Examples

The following examples show how to use org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO. Each example is taken from an open-source project; the source file, project, and license are listed above the code.
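Before the project-specific examples, here is a minimal, self-contained sketch of the two most common entry points, BigQueryIO.readTableRows() and BigQueryIO.writeTableRows(). It is not taken from any of the projects below; the project, dataset, and table names are placeholders, and a real run needs GCP options such as --project and --tempLocation on the command line.

import com.google.api.services.bigquery.model.TableFieldSchema;
import com.google.api.services.bigquery.model.TableSchema;
import java.util.Collections;
import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO;
import org.apache.beam.sdk.options.PipelineOptions;
import org.apache.beam.sdk.options.PipelineOptionsFactory;

public class BigQueryIOSketch {
  public static void main(String[] args) {
    // Placeholder options; BigQuery load jobs additionally require --project and --tempLocation.
    PipelineOptions options = PipelineOptionsFactory.fromArgs(args).withValidation().create();
    Pipeline p = Pipeline.create(options);

    // Schema for the hypothetical destination table.
    TableSchema schema =
        new TableSchema()
            .setFields(
                Collections.singletonList(
                    new TableFieldSchema().setName("word").setType("STRING")));

    // Read all rows from a source table and copy them into a destination table.
    p.apply("ReadRows", BigQueryIO.readTableRows().from("my-project:my_dataset.source_table"))
        .apply(
            "WriteRows",
            BigQueryIO.writeTableRows()
                .to("my-project:my_dataset.target_table")
                .withSchema(schema)
                .withCreateDisposition(BigQueryIO.Write.CreateDisposition.CREATE_IF_NEEDED)
                .withWriteDisposition(BigQueryIO.Write.WriteDisposition.WRITE_TRUNCATE));

    p.run().waitUntilFinish();
  }
}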
Example #1
Source File: IndexerPipeline.java    From dataflow-opinion-analysis with Apache License 2.0
/**
 * Filters out input items whose URLs have already been processed, using a map of
 * processed URLs read from BigQuery as a side input.
 *
 * @param readContent the input content to filter
 * @param pipeline the pipeline used to read the processed-URL table
 * @param options pipeline options used to build the BigQuery query
 * @return the subset of readContent whose URLs have not been processed yet
 */
private static PCollection<InputContent> filterAlreadyProcessedUrls(
		PCollection<InputContent> readContent, Pipeline pipeline, 
		IndexerPipelineOptions options) {
	PCollection<InputContent> contentToProcess;
	String query = IndexerPipelineUtils.buildBigQueryProcessedUrlsQuery(options);
	PCollection<KV<String,Long>> alreadyProcessedUrls = pipeline
		.apply("Get processed URLs",BigQueryIO.read().fromQuery(query))
		.apply(ParDo.of(new GetUrlFn()));

	final PCollectionView<Map<String,Long>> alreadyProcessedUrlsSideInput =
		alreadyProcessedUrls.apply(View.<String,Long>asMap());
	  
	contentToProcess = readContent
		.apply(ParDo.of(new FilterProcessedUrls(alreadyProcessedUrlsSideInput))
			.withSideInputs(alreadyProcessedUrlsSideInput));
	return contentToProcess;
}
 
Example #2
Source File: FilterExamples.java    From beam with Apache License 2.0
public static void main(String[] args) throws Exception {

    Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
    Pipeline p = Pipeline.create(options);

    TableSchema schema = buildWeatherSchemaProjection();

    p.apply(BigQueryIO.readTableRows().from(options.getInput()))
        .apply(ParDo.of(new ProjectionFn()))
        .apply(new BelowGlobalMean(options.getMonthFilter()))
        .apply(
            BigQueryIO.writeTableRows()
                .to(options.getOutput())
                .withSchema(schema)
                .withCreateDisposition(BigQueryIO.Write.CreateDisposition.CREATE_IF_NEEDED)
                .withWriteDisposition(BigQueryIO.Write.WriteDisposition.WRITE_TRUNCATE));

    p.run().waitUntilFinish();
  }
 
Example #3
Source File: BigQueryConverters.java    From DataflowTemplates with Apache License 2.0
@Override
public PCollectionTuple expand(PBegin begin) {
  return begin
      .apply(
          "AvroToEntity",
          BigQueryIO.read(
                  AvroToEntity.newBuilder()
                      .setEntityKind(entityKind())
                      .setUniqueNameColumn(uniqueNameColumn())
                      .setNamespace(namespace())
                      .build())
              .fromQuery(query())
              .withoutValidation()
              .withTemplateCompatibility()
              .usingStandardSql())
      .apply(
          "CheckNoKey",
          CheckNoKey.newBuilder()
              .setFailureTag(failureTag())
              .setSuccessTag(successTag())
              .build());
}
 
Example #4
Source File: CombinePerKeyExamples.java    From beam with Apache License 2.0
public static void main(String[] args) throws Exception {

    Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
    Pipeline p = Pipeline.create(options);

    // Build the table schema for the output table.
    List<TableFieldSchema> fields = new ArrayList<>();
    fields.add(new TableFieldSchema().setName("word").setType("STRING"));
    fields.add(new TableFieldSchema().setName("all_plays").setType("STRING"));
    TableSchema schema = new TableSchema().setFields(fields);

    p.apply(BigQueryIO.readTableRows().from(options.getInput()))
        .apply(new PlaysForWord())
        .apply(
            BigQueryIO.writeTableRows()
                .to(options.getOutput())
                .withSchema(schema)
                .withCreateDisposition(BigQueryIO.Write.CreateDisposition.CREATE_IF_NEEDED)
                .withWriteDisposition(BigQueryIO.Write.WriteDisposition.WRITE_TRUNCATE));

    p.run().waitUntilFinish();
  }
 
Example #5
Source File: MaxPerKeyExamples.java    From beam with Apache License 2.0
public static void main(String[] args) throws Exception {

    Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
    Pipeline p = Pipeline.create(options);

    // Build the table schema for the output table.
    List<TableFieldSchema> fields = new ArrayList<>();
    fields.add(new TableFieldSchema().setName("month").setType("INTEGER"));
    fields.add(new TableFieldSchema().setName("max_mean_temp").setType("FLOAT"));
    TableSchema schema = new TableSchema().setFields(fields);

    p.apply(BigQueryIO.readTableRows().from(options.getInput()))
        .apply(new MaxMeanTemp())
        .apply(
            BigQueryIO.writeTableRows()
                .to(options.getOutput())
                .withSchema(schema)
                .withCreateDisposition(BigQueryIO.Write.CreateDisposition.CREATE_IF_NEEDED)
                .withWriteDisposition(BigQueryIO.Write.WriteDisposition.WRITE_TRUNCATE));

    p.run().waitUntilFinish();
  }
 
Example #6
Source File: BigQueryIOIT.java    From beam with Apache License 2.0
private void testWrite(BigQueryIO.Write<byte[]> writeIO, String metricName) {
  Pipeline pipeline = Pipeline.create(options);

  BigQueryIO.Write.Method method = BigQueryIO.Write.Method.valueOf(options.getWriteMethod());
  pipeline
      .apply("Read from source", Read.from(new SyntheticBoundedSource(sourceOptions)))
      .apply("Gather time", ParDo.of(new TimeMonitor<>(NAMESPACE, metricName)))
      .apply("Map records", ParDo.of(new MapKVToV()))
      .apply(
          "Write to BQ",
          writeIO
              .to(tableQualifier)
              .withCustomGcsTempLocation(ValueProvider.StaticValueProvider.of(tempRoot))
              .withMethod(method)
              .withSchema(
                  new TableSchema()
                      .setFields(
                          Collections.singletonList(
                              new TableFieldSchema().setName("data").setType("BYTES")))));

  PipelineResult pipelineResult = pipeline.run();
  pipelineResult.waitUntilFinish();
  extractAndPublishTime(pipelineResult, metricName);
}
 
Example #7
Source File: BigQueryDeadletterSink.java    From feast with Apache License 2.0
@Override
public PDone expand(PCollection<FailedElement> input) {
  TimePartitioning partition = new TimePartitioning().setType("DAY");
  partition.setField(TIMESTAMP_COLUMN);
  input
      .apply("FailedElementToTableRow", ParDo.of(new FailedElementToTableRowFn()))
      .apply(
          "WriteFailedElementsToBigQuery",
          BigQueryIO.writeTableRows()
              .to(getTableSpec())
              .withJsonSchema(getJsonSchema())
              .withTimePartitioning(partition)
              .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED)
              .withWriteDisposition(WriteDisposition.WRITE_APPEND));
  return PDone.in(input.getPipeline());
}
 
Example #8
Source File: BigQueryLoader.java    From quetzal with Eclipse Public License 2.0
public static void write(String table, TableSchema schema, PCollection<JSONObject> data) {
	data.apply("convert to TableRow", ParDo.of(new DoFn<JSONObject,TableRow>() {
		private static final long serialVersionUID = -4204128594221801617L;
		@SuppressWarnings("unchecked")
		@ProcessElement
		public void processElement(ProcessContext c) {
			JSONObject obj = c.element();
			TableRow x = new TableRow();
			obj.keySet().forEach((Object key) -> {
				x.set((String) key, obj.get(key));
			});
			c.output(x);
		}
	})).apply(BigQueryIO.writeTableRows()
			.withTableDescription(table)
			.withWriteDisposition(BigQueryIO.Write.WriteDisposition.WRITE_TRUNCATE)
			.withCreateDisposition(BigQueryIO.Write.CreateDisposition.CREATE_IF_NEEDED)
			.withSchema(schema)
			.to(table));
}
 
Example #9
Source File: LogAnalyticsPipeline.java    From processing-logs-using-dataflow with Apache License 2.0
public PCollection<TableRow> expand(PCollection<KV<String,Double>> input) {
    PCollection<TableRow> output = input.
      apply( "aggregateToTableRow", ParDo.of(new DoFn<KV<String, Double>, TableRow>() {
          @ProcessElement
          public void processElement(ProcessContext c) {
              KV<String, Double> e = c.element();

              TableRow row = new TableRow()
                .set("destination", e.getKey())
                .set("aggResponseTime", e.getValue());

              c.output(row);
          }
      }));

    output.apply("tableRowToBigQuery", BigQueryIO.writeTableRows()
      .to(this.tableName)
      .withSchema(createTableSchema(this.tableSchema))
      .withWriteDisposition(BigQueryIO.Write.WriteDisposition.WRITE_APPEND)
      .withCreateDisposition(BigQueryIO.Write.CreateDisposition.CREATE_IF_NEEDED));

    return output;
}
 
Example #10
Source File: BigQueryOutputRuntime.java    From components with Apache License 2.0
private BigQueryIO.Write setWriteOperation(BigQueryIO.Write bigQueryIOPTransform) {
    if (properties.tableOperation.getValue() == BigQueryOutputProperties.TableOperation.NONE
            || properties.tableOperation
                    .getValue() == BigQueryOutputProperties.TableOperation.CREATE_IF_NOT_EXISTS) {
        switch (properties.writeOperation.getValue()) {
        case APPEND:
            bigQueryIOPTransform =
                    bigQueryIOPTransform.withWriteDisposition(BigQueryIO.Write.WriteDisposition.WRITE_APPEND);
            break;
        case WRITE_TO_EMPTY:
            bigQueryIOPTransform =
                    bigQueryIOPTransform.withWriteDisposition(BigQueryIO.Write.WriteDisposition.WRITE_EMPTY);
            break;
        default:
            throw new RuntimeException("To be implemented: " + properties.writeOperation.getValue());
        }
    } else {
        if (properties.writeOperation.getValue() != null) {
            LOG.info("Write operation " + properties.writeOperation.getValue()
                    + " be ignored when Table operation is " + properties.tableOperation.getValue());
        }
    }
    return bigQueryIOPTransform;
}
 
Example #11
Source File: BigQueryBigtableTransfer.java    From cloud-bigtable-examples with Apache License 2.0
/**
 * <p>Creates a dataflow pipeline that creates the following chain:</p>
 * <ol>
 *   <li> Gets the records into the Pipeline
 *   <li> Creates Puts from each of the records
 *   <li> Performs a Bigtable Put on the records
 * </ol>
 *
 * @param args Arguments to use to configure the Dataflow Pipeline.  The first three are required
 *   when running via managed resource in Google Cloud Platform.  Those options should be omitted
 *   for LOCAL runs.  The last four arguments are to configure the Bigtable connection.
 *        --runner=BlockingDataflowPipelineRunner
 *        --project=[dataflow project] \\
 *        --stagingLocation=gs://[your google storage bucket] \\
 *        --bigtableProject=[bigtable project] \\
 *        --bigtableInstanceId=[bigtable instance id] \\
 *        --bigtableTableId=[bigtable tableName]
 *
 * <p>Note: The HBase-Bigtable client currently supports up to 100K columns in a single {@link Put}.
 *       If your data exceeds 100K columns, please create multiple {@link Put} objects.
 */

public static void main(String[] args) {
  // CloudBigtableOptions is one way to retrieve the options.  It's not required.
  BigQueryBigtableTransferOptions options =
      PipelineOptionsFactory.fromArgs(args).withValidation().as(BigQueryBigtableTransferOptions.class);

  // CloudBigtableTableConfiguration contains the project, instance and table to connect to.
  CloudBigtableTableConfiguration config =
      new CloudBigtableTableConfiguration.Builder()
      .withProjectId(options.getBigtableProjectId())
      .withInstanceId(options.getBigtableInstanceId())
      .withTableId(options.getBigtableTableId())
      .build();

  Pipeline p = Pipeline.create(options);

  p
      .apply("ReadSourceTable", BigQueryIO.read().fromQuery(options.getBqQuery())
          .usingStandardSql())
      .apply(ParDo.of(MUTATION_TRANSFORM))
      .apply(CloudBigtableIO.writeToTable(config));

  p.run().waitUntilFinish();

}
 
Example #12
Source File: BigQueryRowCountIT.java    From beam with Apache License 2.0
/** Tests whether the pipeline options are injected along the path of the SqlTransform. */
@Test
public void testPipelineOptionInjection() {
  BigQueryTestTableProvider provider = new BigQueryTestTableProvider();
  Table table = getTable("testTable", bigQuery.tableSpec());
  provider.addTable("testTable", table);

  pipeline
      .apply(
          Create.of(
                  new TableRow().set("id", 1).set("name", "name1"),
                  new TableRow().set("id", 2).set("name", "name2"),
                  new TableRow().set("id", 3).set("name", "name3"))
              .withCoder(TableRowJsonCoder.of()))
      .apply(
          BigQueryIO.writeTableRows()
              .to(bigQuery.tableSpec())
              .withSchema(
                  new TableSchema()
                      .setFields(
                          ImmutableList.of(
                              new TableFieldSchema().setName("id").setType("INTEGER"),
                              new TableFieldSchema().setName("name").setType("STRING"))))
              .withoutValidation());
  pipeline.run().waitUntilFinish();

  // changing pipeline options
  readingPipeline.getOptions().setJobName(FAKE_JOB_NAME);

  // Reading from the table should update the statistics of the BigQuery table
  readingPipeline.apply(
      SqlTransform.query(" select * from testTable ")
          .withDefaultTableProvider("bigquery", provider));

  readingPipeline.run().waitUntilFinish();

  BigQueryTestTable sqlTable = (BigQueryTestTable) provider.buildBeamSqlTable(table);
  assertEquals(FAKE_JOB_NAME, sqlTable.getJobName());
}
 
Example #13
Source File: DataCatalogBigQueryIT.java    From beam with Apache License 2.0
@Test
public void testRead() throws Exception {
  TableReference bqTable = bigQuery.tableReference();

  // Streaming inserts do not work with DIRECT_READ mode; there is a several-hour lag.
  PCollection<Row> data =
      writePipeline.apply(Create.of(row(1, "name1"), row(2, "name2"), row(3, "name3")));
  data.apply(
      BigQueryIO.<Row>write()
          .withSchema(BigQueryUtils.toTableSchema(ID_NAME_SCHEMA))
          .withFormatFunction(BigQueryUtils.toTableRow())
          .withMethod(Method.FILE_LOADS)
          .to(bqTable));
  writePipeline.run().waitUntilFinish(Duration.standardMinutes(2));

  String tableId =
      String.format(
          "bigquery.`table`.`%s`.`%s`.`%s`",
          bqTable.getProjectId(), bqTable.getDatasetId(), bqTable.getTableId());

  readPipeline
      .getOptions()
      .as(BeamSqlPipelineOptions.class)
      .setPlannerName(queryPlanner.getCanonicalName());

  try (DataCatalogTableProvider tableProvider =
      DataCatalogTableProvider.create(
          readPipeline.getOptions().as(DataCatalogPipelineOptions.class))) {
    PCollection<Row> result =
        readPipeline.apply(
            "query",
            SqlTransform.query("SELECT id, name FROM " + tableId)
                .withDefaultTableProvider("datacatalog", tableProvider));

    PAssert.that(result).containsInAnyOrder(row(1, "name1"), row(2, "name2"), row(3, "name3"));
    readPipeline.run().waitUntilFinish(Duration.standardMinutes(2));
  }
}
 
Example #14
Source File: InvoicingPipeline.java    From nomulus with Apache License 2.0
/** Deploys the invoicing pipeline as a template on GCS, for a given projectID and GCS bucket. */
public void deploy() {
  // We can't store options as a member variable due to serialization concerns.
  InvoicingPipelineOptions options = PipelineOptionsFactory.as(InvoicingPipelineOptions.class);
  options.setProject(projectId);
  options.setRunner(DataflowRunner.class);
  // This causes p.run() to stage the pipeline as a template on GCS, as opposed to running it.
  options.setTemplateLocation(invoiceTemplateUrl);
  options.setStagingLocation(beamStagingUrl);
  // This credential is used when Dataflow deploys the template to GCS in target GCP project.
  // So, make sure the credential has write permission to GCS in that project.
  options.setGcpCredential(googleCredentials);

  Pipeline p = Pipeline.create(options);

  PCollection<BillingEvent> billingEvents =
      p.apply(
          "Read BillingEvents from Bigquery",
          BigQueryIO.read(BillingEvent::parseFromRecord)
              .fromQuery(InvoicingUtils.makeQueryProvider(options.getYearMonth(), projectId))
              .withCoder(SerializableCoder.of(BillingEvent.class))
              .usingStandardSql()
              .withoutValidation()
              .withTemplateCompatibility());
  applyTerminalTransforms(billingEvents, options.getYearMonth());
  p.run();
}
 
Example #15
Source File: BigQueryIOIT.java    From beam with Apache License 2.0
private void testRead() {
  Pipeline pipeline = Pipeline.create(options);
  pipeline
      .apply("Read from BQ", BigQueryIO.readTableRows().from(tableQualifier))
      .apply("Gather time", ParDo.of(new TimeMonitor<>(NAMESPACE, READ_TIME_METRIC_NAME)));
  PipelineResult result = pipeline.run();
  result.waitUntilFinish();
  extractAndPublishTime(result, READ_TIME_METRIC_NAME);
}
 
Example #16
Source File: BigQueryIOIT.java    From beam with Apache License 2.0
private void testAvroWrite() {
  BigQueryIO.Write<byte[]> writeIO =
      BigQueryIO.<byte[]>write()
          .withWriteDisposition(BigQueryIO.Write.WriteDisposition.WRITE_TRUNCATE)
          .withAvroFormatFunction(
              writeRequest -> {
                byte[] data = writeRequest.getElement();
                GenericRecord record = new GenericData.Record(writeRequest.getSchema());
                record.put("data", ByteBuffer.wrap(data));
                return record;
              });
  testWrite(writeIO, AVRO_WRITE_TIME_METRIC_NAME);
}
 
Example #17
Source File: BigQueryIOIT.java    From beam with Apache License 2.0
private void testJsonWrite() {
  BigQueryIO.Write<byte[]> writeIO =
      BigQueryIO.<byte[]>write()
          .withFormatFunction(
              input -> {
                TableRow tableRow = new TableRow();
                tableRow.set("data", input);
                return tableRow;
              });
  testWrite(writeIO, WRITE_TIME_METRIC_NAME);
}
 
Example #18
Source File: NexmarkLauncher.java    From beam with Apache License 2.0
/** Send {@code formattedResults} to BigQuery. */
private void sinkResultsToBigQuery(
    PCollection<String> formattedResults, long now, String version) {
  String tableSpec = NexmarkUtils.tableSpec(options, queryName, now, version);
  TableSchema tableSchema =
      new TableSchema()
          .setFields(
              ImmutableList.of(
                  new TableFieldSchema().setName("result").setType("STRING"),
                  new TableFieldSchema()
                      .setName("records")
                      .setMode("REPEATED")
                      .setType("RECORD")
                      .setFields(
                          ImmutableList.of(
                              new TableFieldSchema().setName("index").setType("INTEGER"),
                              new TableFieldSchema().setName("value").setType("STRING")))));
  NexmarkUtils.console("Writing results to BigQuery table %s", tableSpec);
  BigQueryIO.Write io =
      BigQueryIO.write()
          .to(tableSpec)
          .withSchema(tableSchema)
          .withCreateDisposition(BigQueryIO.Write.CreateDisposition.CREATE_IF_NEEDED)
          .withWriteDisposition(BigQueryIO.Write.WriteDisposition.WRITE_APPEND);
  formattedResults
      .apply(queryName + ".StringToTableRow", ParDo.of(new StringToTableRow()))
      .apply(queryName + ".WriteBigQueryResults", io);
}
 
Example #19
Source File: BigQueryTable.java    From beam with Apache License 2.0
@Override
public POutput buildIOWriter(PCollection<Row> input) {
  return input.apply(
      BigQueryIO.<Row>write()
          .withSchema(BigQueryUtils.toTableSchema(getSchema()))
          .withFormatFunction(BigQueryUtils.toTableRow())
          .withWriteDisposition(writeDisposition)
          .to(bqLocation));
}
 
Example #20
Source File: Spec11Pipeline.java    From nomulus with Apache License 2.0
/** Deploys the spec11 pipeline as a template on GCS. */
public void deploy() {
  // We can't store options as a member variable due to serialization concerns.
  Spec11PipelineOptions options = PipelineOptionsFactory.as(Spec11PipelineOptions.class);
  options.setProject(projectId);
  options.setRunner(DataflowRunner.class);
  // This causes p.run() to stage the pipeline as a template on GCS, as opposed to running it.
  options.setTemplateLocation(spec11TemplateUrl);
  options.setStagingLocation(beamStagingUrl);
  // This credential is used when Dataflow deploys the template to GCS in target GCP project.
  // So, make sure the credential has write permission to GCS in that project.
  options.setGcpCredential(googleCredentials);

  Pipeline p = Pipeline.create(options);
  PCollection<Subdomain> domains =
      p.apply(
          "Read active domains from BigQuery",
          BigQueryIO.read(Subdomain::parseFromRecord)
              .fromQuery(
                  SqlTemplate.create(getQueryFromFile(Spec11Pipeline.class, "subdomains.sql"))
                      .put("PROJECT_ID", projectId)
                      .put("DATASTORE_EXPORT_DATASET", "latest_datastore_export")
                      .put("REGISTRAR_TABLE", "Registrar")
                      .put("DOMAIN_BASE_TABLE", "DomainBase")
                      .build())
              .withCoder(SerializableCoder.of(Subdomain.class))
              .usingStandardSql()
              .withoutValidation()
              .withTemplateCompatibility());

  evaluateUrlHealth(
      domains,
      new EvaluateSafeBrowsingFn(options.getSafeBrowsingApiKey(), retrier),
      options.getDate());
  p.run();
}
 
Example #21
Source File: BigQueryInputRuntime.java    From components with Apache License 2.0
@Override
public PCollection<IndexedRecord> expand(PBegin in) {
    BigQueryIO.TypedRead<TableRow> bigQueryIOPTransform;
    switch (dataset.sourceType.getValue()) {
    case TABLE_NAME: {
        TableReference table = new TableReference();
        table.setProjectId(datastore.projectName.getValue());
        table.setDatasetId(dataset.bqDataset.getValue());
        table.setTableId(dataset.tableName.getValue());
        // TODO use {@link BigQueryIO#read(SerializableFunction)} instead of readTableRows for better performance:
        // it avoids a redundant type conversion, but take care with each field's type when applying it
        bigQueryIOPTransform = BigQueryIO.readTableRows().from(table);
        break;
    }
    case QUERY: {
        // TODO use {@link BigQueryIO#read(SerializableFunction)} instead of readTableRows for better performance:
        // it avoids a redundant type conversion, but take care with each field's type when applying it
        bigQueryIOPTransform = BigQueryIO.readTableRows().fromQuery(dataset.query.getValue());
        if (!dataset.useLegacySql.getValue()) {
            bigQueryIOPTransform = bigQueryIOPTransform.usingStandardSql();
        } else {
            // flattenResults only needs to be considered for legacy SQL:
            // standard SQL does not support flattening results, while legacy SQL flattens them by default.
            // withoutResultFlattening on legacy SQL does not work well until the schema issue is fixed,
            // because BigQueryDatasetRuntime.getSchema relies on flattened results.
            // bigQueryIOPTransform = bigQueryIOPTransform.withoutResultFlattening();
        }
        break;
    }
    default:
        throw new RuntimeException("To be implemented: " + dataset.sourceType.getValue());
    }

    return in
            .apply(bigQueryIOPTransform)
            .apply(ParDo.of(new TableRowToIndexedRecordFn(defaultOutputCoder.getSchema())))
            .setCoder(defaultOutputCoder);
}
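The TODO comments in the example above point at BigQueryIO.read(SerializableFunction) as the faster alternative to readTableRows(), since it parses the underlying Avro GenericRecord directly and skips the intermediate TableRow. Below is a minimal, hedged sketch of that alternative, not part of the original runtime: the pipeline variable p, the table spec, and the "name" field are placeholder assumptions. SchemaAndRecord comes from org.apache.beam.sdk.io.gcp.bigquery, StringUtf8Coder from org.apache.beam.sdk.coders, and PCollection from org.apache.beam.sdk.values.

// Hypothetical fragment: parse each record into a String instead of a TableRow.
// Assumes an existing Pipeline p and a source table with a STRING column named "name".
PCollection<String> names =
    p.apply(
        "ReadNames",
        BigQueryIO.read(
                (SchemaAndRecord schemaAndRecord) ->
                    String.valueOf(schemaAndRecord.getRecord().get("name")))
            .from("my-project:my_dataset.my_table")
            .withCoder(StringUtf8Coder.of()));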
 
Example #22
Source File: JoinExamples.java    From beam with Apache License 2.0
public static void main(String[] args) throws Exception {
  Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
  Pipeline p = Pipeline.create(options);
  // the following two 'applys' create multiple inputs to our pipeline, one for each
  // of our two input sources.
  PCollection<TableRow> eventsTable =
      p.apply(BigQueryIO.readTableRows().from(GDELT_EVENTS_TABLE));
  PCollection<TableRow> countryCodes = p.apply(BigQueryIO.readTableRows().from(COUNTRY_CODES));
  PCollection<String> formattedResults = joinEvents(eventsTable, countryCodes);
  formattedResults.apply(TextIO.write().to(options.getOutput()));
  p.run().waitUntilFinish();
}
 
Example #23
Source File: BigQueryOutputRuntime.java    From components with Apache License 2.0
@Override
public PDone expand(PCollection<IndexedRecord> in) {
    TableReference table = new TableReference();
    table.setProjectId(datastore.projectName.getValue());
    table.setDatasetId(dataset.bqDataset.getValue());
    table.setTableId(dataset.tableName.getValue());

    BigQueryIO.Write bigQueryIOPTransform = BigQueryIO.writeTableRows().to(table);

    bigQueryIOPTransform = setTableOperation(bigQueryIOPTransform);
    bigQueryIOPTransform = setWriteOperation(bigQueryIOPTransform);

    in.apply(ParDo.of(new IndexedRecordToTableRowFn())).apply(bigQueryIOPTransform);
    return PDone.in(in.getPipeline());
}
 
Example #24
Source File: TriggerExample.java    From beam with Apache License 2.0
public static void main(String[] args) throws Exception {
  TrafficFlowOptions options =
      PipelineOptionsFactory.fromArgs(args).withValidation().as(TrafficFlowOptions.class);
  options.setStreaming(true);

  options.setBigQuerySchema(getSchema());

  ExampleUtils exampleUtils = new ExampleUtils(options);
  exampleUtils.setup();

  Pipeline pipeline = Pipeline.create(options);

  TableReference tableRef =
      getTableReference(
          options.getProject(), options.getBigQueryDataset(), options.getBigQueryTable());

  PCollectionList<TableRow> resultList =
      pipeline
          .apply("ReadMyFile", TextIO.read().from(options.getInput()))
          .apply("InsertRandomDelays", ParDo.of(new InsertDelays()))
          .apply(ParDo.of(new ExtractFlowInfo()))
          .apply(new CalculateTotalFlow(options.getWindowDuration()));

  for (int i = 0; i < resultList.size(); i++) {
    resultList.get(i).apply(BigQueryIO.writeTableRows().to(tableRef).withSchema(getSchema()));
  }

  PipelineResult result = pipeline.run();

  // ExampleUtils will try to cancel the pipeline and the injector before the program exits.
  exampleUtils.waitToFinish(result);
}
 
Example #25
Source File: TrafficMaxLaneFlow.java    From beam with Apache License 2.0
public static void runTrafficMaxLaneFlow(TrafficMaxLaneFlowOptions options) throws IOException {
  // Using ExampleUtils to set up required resources.
  ExampleUtils exampleUtils = new ExampleUtils(options);
  exampleUtils.setup();

  Pipeline pipeline = Pipeline.create(options);
  TableReference tableRef = new TableReference();
  tableRef.setProjectId(options.getProject());
  tableRef.setDatasetId(options.getBigQueryDataset());
  tableRef.setTableId(options.getBigQueryTable());

  pipeline
      .apply("ReadLines", new ReadFileAndExtractTimestamps(options.getInputFile()))
      // row... => <station route, station speed> ...
      .apply(ParDo.of(new ExtractFlowInfoFn()))
      // map the incoming data stream into sliding windows.
      .apply(
          Window.into(
              SlidingWindows.of(Duration.standardMinutes(options.getWindowDuration()))
                  .every(Duration.standardMinutes(options.getWindowSlideEvery()))))
      .apply(new MaxLaneFlow())
      .apply(BigQueryIO.writeTableRows().to(tableRef).withSchema(FormatMaxesFn.getSchema()));

  // Run the pipeline.
  PipelineResult result = pipeline.run();

  // ExampleUtils will try to cancel the pipeline and the injector before the program exits.
  exampleUtils.waitToFinish(result);
}
 
Example #26
Source File: TrafficRoutes.java    From beam with Apache License 2.0
public static void runTrafficRoutes(TrafficRoutesOptions options) throws IOException {
  // Using ExampleUtils to set up required resources.
  ExampleUtils exampleUtils = new ExampleUtils(options);
  exampleUtils.setup();

  Pipeline pipeline = Pipeline.create(options);
  TableReference tableRef = new TableReference();
  tableRef.setProjectId(options.getProject());
  tableRef.setDatasetId(options.getBigQueryDataset());
  tableRef.setTableId(options.getBigQueryTable());

  pipeline
      .apply("ReadLines", new ReadFileAndExtractTimestamps(options.getInputFile()))
      // row... => <station route, station speed> ...
      .apply(ParDo.of(new ExtractStationSpeedFn()))
      // map the incoming data stream into sliding windows.
      .apply(
          Window.into(
              SlidingWindows.of(Duration.standardMinutes(options.getWindowDuration()))
                  .every(Duration.standardMinutes(options.getWindowSlideEvery()))))
      .apply(new TrackSpeed())
      .apply(BigQueryIO.writeTableRows().to(tableRef).withSchema(FormatStatsFn.getSchema()));

  // Run the pipeline.
  PipelineResult result = pipeline.run();

  // ExampleUtils will try to cancel the pipeline and the injector before the program exits.
  exampleUtils.waitToFinish(result);
}
 
Example #27
Source File: StreamingWordExtract.java    From beam with Apache License 2.0
/**
 * Sets up and starts streaming pipeline.
 *
 * @throws IOException if there is a problem setting up resources
 */
public static void main(String[] args) throws IOException {
  StreamingWordExtractOptions options =
      PipelineOptionsFactory.fromArgs(args)
          .withValidation()
          .as(StreamingWordExtractOptions.class);
  options.setStreaming(true);

  options.setBigQuerySchema(StringToRowConverter.getSchema());
  ExampleUtils exampleUtils = new ExampleUtils(options);
  exampleUtils.setup();

  Pipeline pipeline = Pipeline.create(options);

  String tableSpec =
      new StringBuilder()
          .append(options.getProject())
          .append(":")
          .append(options.getBigQueryDataset())
          .append(".")
          .append(options.getBigQueryTable())
          .toString();
  pipeline
      .apply("ReadLines", TextIO.read().from(options.getInputFile()))
      .apply(ParDo.of(new ExtractWords()))
      .apply(ParDo.of(new Uppercase()))
      .apply(ParDo.of(new StringToRowConverter()))
      .apply(
          BigQueryIO.writeTableRows().to(tableSpec).withSchema(StringToRowConverter.getSchema()));

  PipelineResult result = pipeline.run();

  // ExampleUtils will try to cancel the pipeline before the program exits.
  exampleUtils.waitToFinish(result);
}
 
Example #28
Source File: WriteWindowedToBigQuery.java    From beam with Apache License 2.0
@Override
public PDone expand(PCollection<T> teamAndScore) {
  teamAndScore
      .apply("ConvertToRow", ParDo.of(new BuildRowFn()))
      .apply(
          BigQueryIO.writeTableRows()
              .to(getTable(projectId, datasetId, tableName))
              .withSchema(getSchema())
              .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED)
              .withWriteDisposition(WriteDisposition.WRITE_APPEND));
  return PDone.in(teamAndScore.getPipeline());
}
 
Example #29
Source File: WriteToBigQuery.java    From beam with Apache License 2.0
@Override
public PDone expand(PCollection<InputT> teamAndScore) {
  teamAndScore
      .apply("ConvertToRow", ParDo.of(new BuildRowFn()))
      .apply(
          BigQueryIO.writeTableRows()
              .to(getTable(projectId, datasetId, tableName))
              .withSchema(getSchema())
              .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED)
              .withWriteDisposition(WriteDisposition.WRITE_APPEND));
  return PDone.in(teamAndScore.getPipeline());
}
 
Example #30
Source File: BigQueryHllSketchCompatibilityIT.java    From beam with Apache License 2.0
private void writeSketchToBigQuery(List<String> testData, String expectedChecksum) {
  String tableSpec = String.format("%s.%s", DATASET_ID, SKETCH_TABLE_ID);
  String query =
      String.format("SELECT HLL_COUNT.EXTRACT(%s) FROM %s", SKETCH_FIELD_NAME, tableSpec);
  TableSchema tableSchema =
      new TableSchema()
          .setFields(
              Collections.singletonList(
                  new TableFieldSchema().setName(SKETCH_FIELD_NAME).setType(SKETCH_FIELD_TYPE)));

  TestPipelineOptions options =
      TestPipeline.testingPipelineOptions().as(TestPipelineOptions.class);
  Pipeline p = Pipeline.create(options);
  p.apply(Create.of(testData).withType(TypeDescriptor.of(String.class)))
      .apply(HllCount.Init.forStrings().globally())
      .apply(
          BigQueryIO.<byte[]>write()
              .to(tableSpec)
              .withSchema(tableSchema)
              .withFormatFunction(
                  sketch ->
                      // Empty sketch is represented by empty byte array in Beam and by null in
                      // BigQuery
                      new TableRow().set(SKETCH_FIELD_NAME, sketch.length == 0 ? null : sketch))
              .withWriteDisposition(BigQueryIO.Write.WriteDisposition.WRITE_TRUNCATE));
  p.run().waitUntilFinish();

  // BigqueryMatcher will send a query to retrieve the estimated count and verify its
  // correctness using the checksum.
  assertThat(
      createQueryUsingStandardSql(APP_NAME, PROJECT_ID, query),
      queryResultHasChecksum(expectedChecksum));
}