Java Code Examples for org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO

The following examples show how to use org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO. These examples are extracted from open source projects; the source project, source file, and license are noted above each example where known.
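As a quick orientation before the individual examples, here is a minimal end-to-end sketch using the two most common entry points, BigQueryIO.readTableRows() and BigQueryIO.writeTableRows(). It is a sketch only: the project, dataset, and table names (my-project:my_dataset.source_table and my-project:my_dataset.dest_table) and the single-column schema are placeholders, not taken from any project below.

import com.google.api.services.bigquery.model.TableFieldSchema;
import com.google.api.services.bigquery.model.TableRow;
import com.google.api.services.bigquery.model.TableSchema;
import java.util.Collections;
import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.values.PCollection;

public class BigQueryIOMinimalExample {
  public static void main(String[] args) {
    Pipeline p = Pipeline.create(PipelineOptionsFactory.fromArgs(args).create());

    // Read every row of the source table (placeholder table spec).
    PCollection<TableRow> rows =
        p.apply("ReadRows",
            BigQueryIO.readTableRows().from("my-project:my_dataset.source_table"));

    // Write the rows back out, creating the destination table if needed.
    TableSchema schema =
        new TableSchema()
            .setFields(
                Collections.singletonList(
                    new TableFieldSchema().setName("data").setType("STRING")));
    rows.apply("WriteRows",
        BigQueryIO.writeTableRows()
            .to("my-project:my_dataset.dest_table")
            .withSchema(schema)
            .withCreateDisposition(BigQueryIO.Write.CreateDisposition.CREATE_IF_NEEDED)
            .withWriteDisposition(BigQueryIO.Write.WriteDisposition.WRITE_APPEND));

    p.run().waitUntilFinish();
  }
}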
Example 1
Source Project: feast   Source File: BigQueryDeadletterSink.java    License: Apache License 2.0
@Override
public PDone expand(PCollection<FailedElement> input) {
  TimePartitioning partition = new TimePartitioning().setType("DAY");
  partition.setField(TIMESTAMP_COLUMN);
  input
      .apply("FailedElementToTableRow", ParDo.of(new FailedElementToTableRowFn()))
      .apply(
          "WriteFailedElementsToBigQuery",
          BigQueryIO.writeTableRows()
              .to(getTableSpec())
              .withJsonSchema(getJsonSchema())
              .withTimePartitioning(partition)
              .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED)
              .withWriteDisposition(WriteDisposition.WRITE_APPEND));
  return PDone.in(input.getPipeline());
}
 
Example 2
Source Project: DataflowTemplates   Source File: BigQueryConverters.java    License: Apache License 2.0
@Override
public PCollectionTuple expand(PBegin begin) {
  return begin
      .apply(
          "AvroToEntity",
          BigQueryIO.read(
                  AvroToEntity.newBuilder()
                      .setEntityKind(entityKind())
                      .setUniqueNameColumn(uniqueNameColumn())
                      .setNamespace(namespace())
                      .build())
              .fromQuery(query())
              .withoutValidation()
              .withTemplateCompatibility()
              .usingStandardSql())
      .apply(
          "CheckNoKey",
          CheckNoKey.newBuilder()
              .setFailureTag(failureTag())
              .setSuccessTag(successTag())
              .build());
}
 
Example 3
Source Project: dataflow-opinion-analysis   Source File: IndexerPipeline.java    License: Apache License 2.0
/**
 * Filters out URLs that have already been processed, using a BigQuery table
 * of processed URLs as a map-valued side input.
 *
 * @param readContent the content read from the input source
 * @param pipeline the pipeline on which to run the BigQuery read
 * @param options pipeline options used to build the processed-URLs query
 * @return the content that still needs to be processed
 */
private static PCollection<InputContent> filterAlreadyProcessedUrls(
		PCollection<InputContent> readContent, Pipeline pipeline,
		IndexerPipelineOptions options) {
	String query = IndexerPipelineUtils.buildBigQueryProcessedUrlsQuery(options);
	PCollection<KV<String, Long>> alreadyProcessedUrls = pipeline
		.apply("Get processed URLs", BigQueryIO.read().fromQuery(query))
		.apply(ParDo.of(new GetUrlFn()));

	final PCollectionView<Map<String, Long>> alreadyProcessedUrlsSideInput =
		alreadyProcessedUrls.apply(View.<String, Long>asMap());

	return readContent
		.apply(ParDo.of(new FilterProcessedUrls(alreadyProcessedUrlsSideInput))
			.withSideInputs(alreadyProcessedUrlsSideInput));
}
 
Example 4
Source Project: beam   Source File: FilterExamples.java    License: Apache License 2.0
public static void main(String[] args) throws Exception {

    Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
    Pipeline p = Pipeline.create(options);

    TableSchema schema = buildWeatherSchemaProjection();

    p.apply(BigQueryIO.readTableRows().from(options.getInput()))
        .apply(ParDo.of(new ProjectionFn()))
        .apply(new BelowGlobalMean(options.getMonthFilter()))
        .apply(
            BigQueryIO.writeTableRows()
                .to(options.getOutput())
                .withSchema(schema)
                .withCreateDisposition(BigQueryIO.Write.CreateDisposition.CREATE_IF_NEEDED)
                .withWriteDisposition(BigQueryIO.Write.WriteDisposition.WRITE_TRUNCATE));

    p.run().waitUntilFinish();
  }
 
Example 5
Source Project: beam   Source File: CombinePerKeyExamples.java    License: Apache License 2.0
public static void main(String[] args) throws Exception {

    Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
    Pipeline p = Pipeline.create(options);

    // Build the table schema for the output table.
    List<TableFieldSchema> fields = new ArrayList<>();
    fields.add(new TableFieldSchema().setName("word").setType("STRING"));
    fields.add(new TableFieldSchema().setName("all_plays").setType("STRING"));
    TableSchema schema = new TableSchema().setFields(fields);

    p.apply(BigQueryIO.readTableRows().from(options.getInput()))
        .apply(new PlaysForWord())
        .apply(
            BigQueryIO.writeTableRows()
                .to(options.getOutput())
                .withSchema(schema)
                .withCreateDisposition(BigQueryIO.Write.CreateDisposition.CREATE_IF_NEEDED)
                .withWriteDisposition(BigQueryIO.Write.WriteDisposition.WRITE_TRUNCATE));

    p.run().waitUntilFinish();
  }
 
Example 6
Source Project: beam   Source File: MaxPerKeyExamples.java    License: Apache License 2.0
public static void main(String[] args) throws Exception {

    Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
    Pipeline p = Pipeline.create(options);

    // Build the table schema for the output table.
    List<TableFieldSchema> fields = new ArrayList<>();
    fields.add(new TableFieldSchema().setName("month").setType("INTEGER"));
    fields.add(new TableFieldSchema().setName("max_mean_temp").setType("FLOAT"));
    TableSchema schema = new TableSchema().setFields(fields);

    p.apply(BigQueryIO.readTableRows().from(options.getInput()))
        .apply(new MaxMeanTemp())
        .apply(
            BigQueryIO.writeTableRows()
                .to(options.getOutput())
                .withSchema(schema)
                .withCreateDisposition(BigQueryIO.Write.CreateDisposition.CREATE_IF_NEEDED)
                .withWriteDisposition(BigQueryIO.Write.WriteDisposition.WRITE_TRUNCATE));

    p.run().waitUntilFinish();
  }
 
Example 7
Source Project: beam   Source File: BigQueryIOIT.java    License: Apache License 2.0
private void testWrite(BigQueryIO.Write<byte[]> writeIO, String metricName) {
  Pipeline pipeline = Pipeline.create(options);

  BigQueryIO.Write.Method method = BigQueryIO.Write.Method.valueOf(options.getWriteMethod());
  pipeline
      .apply("Read from source", Read.from(new SyntheticBoundedSource(sourceOptions)))
      .apply("Gather time", ParDo.of(new TimeMonitor<>(NAMESPACE, metricName)))
      .apply("Map records", ParDo.of(new MapKVToV()))
      .apply(
          "Write to BQ",
          writeIO
              .to(tableQualifier)
              .withCustomGcsTempLocation(ValueProvider.StaticValueProvider.of(tempRoot))
              .withMethod(method)
              .withSchema(
                  new TableSchema()
                      .setFields(
                          Collections.singletonList(
                              new TableFieldSchema().setName("data").setType("BYTES")))));

  PipelineResult pipelineResult = pipeline.run();
  pipelineResult.waitUntilFinish();
  extractAndPublishTime(pipelineResult, metricName);
}
 
Example 8
public PCollection<TableRow> expand(PCollection<KV<String, Double>> input) {
    PCollection<TableRow> output = input
        .apply("aggregateToTableRow", ParDo.of(new DoFn<KV<String, Double>, TableRow>() {
            @ProcessElement
            public void processElement(ProcessContext c) {
                KV<String, Double> e = c.element();

                TableRow row = new TableRow()
                    .set("destination", e.getKey())
                    .set("aggResponseTime", e.getValue());

                c.output(row);
            }
        }));

    output.apply("tableRowToBigQuery", BigQueryIO.writeTableRows()
        .to(this.tableName)
        .withSchema(createTableSchema(this.tableSchema))
        .withWriteDisposition(BigQueryIO.Write.WriteDisposition.WRITE_APPEND)
        .withCreateDisposition(BigQueryIO.Write.CreateDisposition.CREATE_IF_NEEDED));

    return output;
}
 
Example 9
Source Project: components   Source File: BigQueryOutputRuntime.java    License: Apache License 2.0
private BigQueryIO.Write setWriteOperation(BigQueryIO.Write bigQueryIOPTransform) {
    if (properties.tableOperation.getValue() == BigQueryOutputProperties.TableOperation.NONE
            || properties.tableOperation
                    .getValue() == BigQueryOutputProperties.TableOperation.CREATE_IF_NOT_EXISTS) {
        switch (properties.writeOperation.getValue()) {
        case APPEND:
            bigQueryIOPTransform =
                    bigQueryIOPTransform.withWriteDisposition(BigQueryIO.Write.WriteDisposition.WRITE_APPEND);
            break;
        case WRITE_TO_EMPTY:
            bigQueryIOPTransform =
                    bigQueryIOPTransform.withWriteDisposition(BigQueryIO.Write.WriteDisposition.WRITE_EMPTY);
            break;
        default:
            throw new RuntimeException("To be implemented: " + properties.writeOperation.getValue());
        }
    } else {
        if (properties.writeOperation.getValue() != null) {
            LOG.info("Write operation " + properties.writeOperation.getValue()
                    + " will be ignored when Table operation is " + properties.tableOperation.getValue());
        }
    }
    return bigQueryIOPTransform;
}
 
Example 10
/**
 * <p>Creates a dataflow pipeline that creates the following chain:</p>
 * <ol>
 *   <li> Gets the records into the Pipeline
 *   <li> Creates Puts from each of the records
 *   <li> Performs a Bigtable Put on the records
 * </ol>
 *
 * @param args Arguments to use to configure the Dataflow Pipeline.  The first three are required
 *   when running via managed resource in Google Cloud Platform.  Those options should be omitted
 *   for LOCAL runs.  The last four arguments are to configure the Bigtable connection.
 *        --runner=BlockingDataflowPipelineRunner \\
 *        --project=[dataflow project] \\
 *        --stagingLocation=gs://[your google storage bucket] \\
 *        --bigtableProject=[bigtable project] \\
 *        --bigtableInstanceId=[bigtable instance id] \\
 *        --bigtableTableId=[bigtable tableName]
 *
 * <p>Note: The HBase-Bigtable client currently supports up to 100K columns in a single {@link Put}.
 *       If your data exceeds 100K columns, please create multiple {@link Put} objects.
 */

public static void main(String[] args) {
  // CloudBigtableOptions is one way to retrieve the options.  It's not required.
  BigQueryBigtableTransferOptions options =
      PipelineOptionsFactory.fromArgs(args).withValidation().as(BigQueryBigtableTransferOptions.class);

  // CloudBigtableTableConfiguration contains the project, instance and table to connect to.
  CloudBigtableTableConfiguration config =
      new CloudBigtableTableConfiguration.Builder()
      .withProjectId(options.getBigtableProjectId())
      .withInstanceId(options.getBigtableInstanceId())
      .withTableId(options.getBigtableTableId())
      .build();

  Pipeline p = Pipeline.create(options);

  p
      .apply("ReadSourceTable", BigQueryIO.read().fromQuery(options.getBqQuery())
          .usingStandardSql())
      .apply(ParDo.of(MUTATION_TRANSFORM))
      .apply(CloudBigtableIO.writeToTable(config));

  p.run().waitUntilFinish();

}
 
Example 11
Source Project: quetzal   Source File: BigQueryLoader.java    License: Eclipse Public License 2.0
public static void write(String table, TableSchema schema, PCollection<JSONObject> data) {
	data.apply("convert to TableRow", ParDo.of(new DoFn<JSONObject,TableRow>() {
		private static final long serialVersionUID = -4204128594221801617L;
		@SuppressWarnings("unchecked")
		@ProcessElement
		public void processElement(ProcessContext c) {
			JSONObject obj = c.element();
			TableRow x = new TableRow();
			obj.keySet().forEach((Object key) -> {
				x.set((String) key, obj.get(key));
			});
			c.output(x);
		}
	})).apply(BigQueryIO.writeTableRows()
			.to(table)
			.withTableDescription(table)
			.withWriteDisposition(BigQueryIO.Write.WriteDisposition.WRITE_TRUNCATE)
			.withCreateDisposition(BigQueryIO.Write.CreateDisposition.CREATE_IF_NEEDED)
			.withSchema(schema));
}
 
Example 12
Source Project: feast   Source File: WriteFailedElementToBigQuery.java    License: Apache License 2.0
@Override
public WriteResult expand(PCollection<FailedElement> failedElements) {
  return failedElements
      .apply("FailedElementToTableRow", ParDo.of(new FailedElementToTableRowFn()))
      .apply(
          "WriteFailedElementsToBigQuery",
          BigQueryIO.writeTableRows()
              .to(getTableSpec())
              .withJsonSchema(getJsonSchema())
              .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED)
              .withWriteDisposition(WriteDisposition.WRITE_APPEND));
}
 
Example 13
Source Project: deployment-examples   Source File: WriteToBigQuery.java    License: MIT License
@Override
public PDone expand(PCollection<InputT> teamAndScore) {
  teamAndScore
      .apply("ConvertToRow", ParDo.of(new BuildRowFn()))
      .apply(
          BigQueryIO.writeTableRows()
              .to(getTable(projectId, datasetId, tableName))
              .withSchema(getSchema())
              .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED)
              .withWriteDisposition(WriteDisposition.WRITE_APPEND));
  return PDone.in(teamAndScore.getPipeline());
}
 
Example 14
Source Project: deployment-examples   Source File: WriteWindowedToBigQuery.java    License: MIT License
@Override
public PDone expand(PCollection<T> teamAndScore) {
  teamAndScore
      .apply("ConvertToRow", ParDo.of(new BuildRowFn()))
      .apply(
          BigQueryIO.writeTableRows()
              .to(getTable(projectId, datasetId, tableName))
              .withSchema(getSchema())
              .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED)
              .withWriteDisposition(WriteDisposition.WRITE_APPEND));
  return PDone.in(teamAndScore.getPipeline());
}
 
Example 15
Source Project: DataflowTemplates   Source File: ErrorConverters.java    License: Apache License 2.0
@Override
public WriteResult expand(PCollection<FailsafeElement<String, String>> failedRecords) {

  return failedRecords
      .apply("FailedRecordToTableRow", ParDo.of(new FailedStringToTableRowFn()))
      .apply(
          "WriteFailedRecordsToBigQuery",
          BigQueryIO.writeTableRows()
              .to(getErrorRecordsTable())
              .withJsonSchema(getErrorRecordsTableSchema())
              .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED)
              .withWriteDisposition(WriteDisposition.WRITE_APPEND));
}
 
Example 16
Source Project: DataflowTemplates   Source File: ErrorConverters.java    License: Apache License 2.0
@Override
public WriteResult expand(
    PCollection<FailsafeElement<KV<String, String>, String>> failedRecords) {

  return failedRecords
      .apply("FailedRecordToTableRow", ParDo.of(new FailedMessageToTableRowFn()))
      .apply(
          "WriteFailedRecordsToBigQuery",
          BigQueryIO.writeTableRows()
              .to(getErrorRecordsTable())
              .withJsonSchema(getErrorRecordsTableSchema())
              .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED)
              .withWriteDisposition(WriteDisposition.WRITE_APPEND));
}
 
Example 17
Source Project: DataflowTemplates   Source File: ErrorConverters.java    License: Apache License 2.0
@Override
public WriteResult expand(
    PCollection<FailsafeElement<PubsubMessage, String>> failedRecords) {

  return failedRecords
      .apply("FailedRecordToTableRow", ParDo.of(new FailedPubsubMessageToTableRowFn()))
      .apply(
          "WriteFailedRecordsToBigQuery",
          BigQueryIO.writeTableRows()
              .to(getErrorRecordsTable())
              .withJsonSchema(getErrorRecordsTableSchema())
              .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED)
              .withWriteDisposition(WriteDisposition.WRITE_APPEND));
}
 
Example 18
Source Project: DataflowTemplates   Source File: BigQueryConverters.java    License: Apache License 2.0
@Override
public PCollection<TableRow> expand(PBegin pipeline) {

  if (options().getQuery() == null) {
    LOG.info("No query provided, reading directly from: " + options().getInputTableSpec());
    return pipeline.apply(
        "ReadFromBigQuery",
        BigQueryIO.readTableRows()
            .from(options().getInputTableSpec())
            .withTemplateCompatibility()
            .withMethod(Method.DIRECT_READ)
            .withCoder(TableRowJsonCoder.of()));

  } else {
    LOG.info("Using query: " + options().getQuery());

    if (!options().getUseLegacySql()) {

      LOG.info("Using Standard SQL");
      return pipeline.apply(
          "ReadFromBigQueryWithQuery",
          BigQueryIO.readTableRows()
              .fromQuery(options().getQuery())
              .withTemplateCompatibility()
              .usingStandardSql()
              .withCoder(TableRowJsonCoder.of()));
    } else {

      LOG.info("Using Legacy SQL");
      return pipeline.apply(
          "ReadFromBigQueryWithQuery",
          BigQueryIO.readTableRows()
              .fromQuery(options().getQuery())
              .withTemplateCompatibility()
              .withCoder(TableRowJsonCoder.of()));
    }
  }
}
 
Example 19
Source Project: DataflowTemplates   Source File: KafkaToBigQuery.java    License: Apache License 2.0
@Override
public WriteResult expand(
    PCollection<FailsafeElement<KV<String, String>, String>> failedRecords) {

  return failedRecords
      .apply("FailedRecordToTableRow", ParDo.of(new FailedMessageToTableRowFn()))
      .apply(
          "WriteFailedRecordsToBigQuery",
          BigQueryIO.writeTableRows()
              .to(getErrorRecordsTable())
              .withJsonSchema(getErrorRecordsTableSchema())
              .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED)
              .withWriteDisposition(WriteDisposition.WRITE_APPEND));
}
 
Example 20
Source Project: DataflowTemplates   Source File: ErrorConverters.java    License: Apache License 2.0
@Override
public WriteResult expand(PCollection<FailsafeElement<PubsubMessage, String>> failedRecords) {

  return failedRecords
      .apply("FailedRecordToTableRow", ParDo.of(new FailedPubsubMessageToTableRowFn()))
      .apply(
          "WriteFailedRecordsToBigQuery",
          BigQueryIO.writeTableRows()
              .to(getErrorRecordsTable())
              .withJsonSchema(getErrorRecordsTableSchema())
              .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED)
              .withWriteDisposition(WriteDisposition.WRITE_APPEND));
}
 
Example 21
Source Project: DataflowTemplates   Source File: ErrorConverters.java    License: Apache License 2.0
@Override
public WriteResult expand(PCollection<FailsafeElement<String, String>> failedRecords) {

  return failedRecords
      .apply("FailedRecordToTableRow", ParDo.of(new FailedStringToTableRowFn()))
      .apply(
          "WriteFailedRecordsToBigQuery",
          BigQueryIO.writeTableRows()
              .to(getErrorRecordsTable())
              .withJsonSchema(getErrorRecordsTableSchema())
              .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED)
              .withWriteDisposition(WriteDisposition.WRITE_APPEND));
}
 
Example 22
Source Project: DataflowTemplates   Source File: DatastoreToBigQuery.java    License: Apache License 2.0
/**
 * Runs a pipeline which reads in Entities from Datastore, passes in the JSON encoded Entities
 * to a Javascript UDF that returns JSON that conforms to the BigQuery TableRow spec and writes
 * the TableRows to BigQuery.
 *
 * @param args arguments to the pipeline
 */
public static void main(String[] args) {
  DatastoreToBigQueryOptions options = PipelineOptionsFactory.fromArgs(args)
      .withValidation()
      .as(DatastoreToBigQueryOptions.class);

  Pipeline pipeline = Pipeline.create(options);

  pipeline
      .apply(
          ReadJsonEntities.newBuilder()
              .setGqlQuery(options.getDatastoreReadGqlQuery())
              .setProjectId(options.getDatastoreReadProjectId())
              .setNamespace(options.getDatastoreReadNamespace())
              .build())
      .apply(
          TransformTextViaJavascript.newBuilder()
              .setFileSystemPath(options.getJavascriptTextTransformGcsPath())
              .setFunctionName(options.getJavascriptTextTransformFunctionName())
              .build())
      .apply(BigQueryConverters.jsonToTableRow())
      .apply(
          "WriteBigQuery",
          BigQueryIO.writeTableRows()
              .withoutValidation()
              .to(options.getOutputTableSpec())
              .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED)
              .withWriteDisposition(WriteDisposition.WRITE_TRUNCATE)
              .withCustomGcsTempLocation(options.getBigQueryLoadingTemporaryDirectory()));

  pipeline.run();
}
 
Example 23
public static void main(String[] args) {
    PipelineOptionsFactory.register(TemplateOptions.class);
    TemplateOptions options = PipelineOptionsFactory.fromArgs(args).withValidation().as(TemplateOptions.class);
    Pipeline pipeline = Pipeline.create(options);
    pipeline.apply("READ", TextIO.read().from(options.getInputFile()))
            .apply("TRANSFORM", ParDo.of(new WikiParDo()))
            .apply("WRITE", BigQueryIO.writeTableRows()
                    .to(String.format("%s:dotc_2018.wiki_demo", options.getProject()))
                    .withCreateDisposition(CREATE_IF_NEEDED)
                    .withWriteDisposition(WRITE_APPEND)
                    .withSchema(getTableSchema()));
    pipeline.run();
}
 
Example 24
Source Project: dataflow-opinion-analysis   Source File: IndexerPipeline.java    License: Apache License 2.0
/**
 * Splits content into documents to index and documents to skip, filtering
 * out exact duplicates of already processed documents. Unless the pipeline
 * runs in write-truncate mode, the set of processed documents is read from
 * BigQuery and used as a map-valued side input.
 *
 * @param contentToIndexNotSkipped content that is a candidate for indexing
 * @param contentNotToIndexSkipped content already marked as skipped
 * @param pipeline the pipeline on which to run the BigQuery read
 * @param options pipeline options used to build the processed-documents query
 * @return the split of content to index or not
 */
private static ContentToIndexOrNot filterAlreadyProcessedDocuments(
		PCollection<InputContent> contentToIndexNotSkipped, PCollection<InputContent> contentNotToIndexSkipped,
		Pipeline pipeline, IndexerPipelineOptions options) {
	PCollection<KV<String,Long>> alreadyProcessedDocs = null;
	
	if (!options.getWriteTruncate()) {
		String query = IndexerPipelineUtils.buildBigQueryProcessedDocsQuery(options);
		alreadyProcessedDocs = pipeline
			.apply("Get already processed Documents", BigQueryIO.read().fromQuery(query))
			.apply(ParDo.of(new GetDocumentHashFn()));

	} else {
		Map<String, Long> map = new HashMap<>();
		alreadyProcessedDocs = pipeline
			.apply("Create empty side input of Docs",
				Create.of(map).withCoder(KvCoder.of(StringUtf8Coder.of(), VarLongCoder.of())));
	}

	final PCollectionView<Map<String, Long>> alreadyProcessedDocsSideInput =
		alreadyProcessedDocs.apply(View.<String, Long>asMap());
	
	PCollectionTuple indexOrNotBasedOnExactDupes = contentToIndexNotSkipped
		.apply("Extract DocumentHash key", ParDo.of(new GetInputContentDocumentHashFn()))
		.apply("Group by DocumentHash key", GroupByKey.<String, InputContent>create())
		.apply("Eliminate InputContent Dupes", ParDo.of(new EliminateInputContentDupes(alreadyProcessedDocsSideInput))
			.withSideInputs(alreadyProcessedDocsSideInput)
			.withOutputTags(PipelineTags.contentToIndexNotExactDupesTag, // main output collection
				TupleTagList.of(PipelineTags.contentNotToIndexExactDupesTag))); // side output collection	
	
	PCollection<InputContent> contentToIndexNotExactDupes = indexOrNotBasedOnExactDupes.get(PipelineTags.contentToIndexNotExactDupesTag);
	PCollection<InputContent> contentNotToIndexExactDupes = indexOrNotBasedOnExactDupes.get(PipelineTags.contentNotToIndexExactDupesTag);
	
	// Merge the sets of items that are dupes or skipped
	PCollectionList<InputContent> contentNotToIndexList = PCollectionList.of(contentNotToIndexExactDupes).and(contentNotToIndexSkipped);
	
	ContentToIndexOrNot content = new ContentToIndexOrNot(contentToIndexNotExactDupes, contentNotToIndexList.apply(Flatten.<InputContent>pCollections()));
	return content;
}
 
Example 25
Source Project: beam   Source File: WriteToBigQuery.java    License: Apache License 2.0
@Override
public PDone expand(PCollection<InputT> teamAndScore) {
  teamAndScore
      .apply("ConvertToRow", ParDo.of(new BuildRowFn()))
      .apply(
          BigQueryIO.writeTableRows()
              .to(getTable(projectId, datasetId, tableName))
              .withSchema(getSchema())
              .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED)
              .withWriteDisposition(WriteDisposition.WRITE_APPEND));
  return PDone.in(teamAndScore.getPipeline());
}
 
Example 26
Source Project: beam   Source File: WriteWindowedToBigQuery.java    License: Apache License 2.0
@Override
public PDone expand(PCollection<T> teamAndScore) {
  teamAndScore
      .apply("ConvertToRow", ParDo.of(new BuildRowFn()))
      .apply(
          BigQueryIO.writeTableRows()
              .to(getTable(projectId, datasetId, tableName))
              .withSchema(getSchema())
              .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED)
              .withWriteDisposition(WriteDisposition.WRITE_APPEND));
  return PDone.in(teamAndScore.getPipeline());
}
 
Example 27
Source Project: beam   Source File: StreamingWordExtract.java    License: Apache License 2.0
/**
 * Sets up and starts streaming pipeline.
 *
 * @throws IOException if there is a problem setting up resources
 */
public static void main(String[] args) throws IOException {
  StreamingWordExtractOptions options =
      PipelineOptionsFactory.fromArgs(args)
          .withValidation()
          .as(StreamingWordExtractOptions.class);
  options.setStreaming(true);

  options.setBigQuerySchema(StringToRowConverter.getSchema());
  ExampleUtils exampleUtils = new ExampleUtils(options);
  exampleUtils.setup();

  Pipeline pipeline = Pipeline.create(options);

  String tableSpec =
      new StringBuilder()
          .append(options.getProject())
          .append(":")
          .append(options.getBigQueryDataset())
          .append(".")
          .append(options.getBigQueryTable())
          .toString();
  pipeline
      .apply("ReadLines", TextIO.read().from(options.getInputFile()))
      .apply(ParDo.of(new ExtractWords()))
      .apply(ParDo.of(new Uppercase()))
      .apply(ParDo.of(new StringToRowConverter()))
      .apply(
          BigQueryIO.writeTableRows().to(tableSpec).withSchema(StringToRowConverter.getSchema()));

  PipelineResult result = pipeline.run();

  // ExampleUtils will try to cancel the pipeline before the program exits.
  exampleUtils.waitToFinish(result);
}
 
Example 28
Source Project: beam   Source File: TrafficRoutes.java    License: Apache License 2.0
public static void runTrafficRoutes(TrafficRoutesOptions options) throws IOException {
  // Using ExampleUtils to set up required resources.
  ExampleUtils exampleUtils = new ExampleUtils(options);
  exampleUtils.setup();

  Pipeline pipeline = Pipeline.create(options);
  TableReference tableRef = new TableReference();
  tableRef.setProjectId(options.getProject());
  tableRef.setDatasetId(options.getBigQueryDataset());
  tableRef.setTableId(options.getBigQueryTable());

  pipeline
      .apply("ReadLines", new ReadFileAndExtractTimestamps(options.getInputFile()))
      // row... => <station route, station speed> ...
      .apply(ParDo.of(new ExtractStationSpeedFn()))
      // map the incoming data stream into sliding windows.
      .apply(
          Window.into(
              SlidingWindows.of(Duration.standardMinutes(options.getWindowDuration()))
                  .every(Duration.standardMinutes(options.getWindowSlideEvery()))))
      .apply(new TrackSpeed())
      .apply(BigQueryIO.writeTableRows().to(tableRef).withSchema(FormatStatsFn.getSchema()));

  // Run the pipeline.
  PipelineResult result = pipeline.run();

  // ExampleUtils will try to cancel the pipeline and the injector before the program exits.
  exampleUtils.waitToFinish(result);
}
 
Example 29
Source Project: beam   Source File: TrafficMaxLaneFlow.java    License: Apache License 2.0
public static void runTrafficMaxLaneFlow(TrafficMaxLaneFlowOptions options) throws IOException {
  // Using ExampleUtils to set up required resources.
  ExampleUtils exampleUtils = new ExampleUtils(options);
  exampleUtils.setup();

  Pipeline pipeline = Pipeline.create(options);
  TableReference tableRef = new TableReference();
  tableRef.setProjectId(options.getProject());
  tableRef.setDatasetId(options.getBigQueryDataset());
  tableRef.setTableId(options.getBigQueryTable());

  pipeline
      .apply("ReadLines", new ReadFileAndExtractTimestamps(options.getInputFile()))
      // row... => <station route, station speed> ...
      .apply(ParDo.of(new ExtractFlowInfoFn()))
      // map the incoming data stream into sliding windows.
      .apply(
          Window.into(
              SlidingWindows.of(Duration.standardMinutes(options.getWindowDuration()))
                  .every(Duration.standardMinutes(options.getWindowSlideEvery()))))
      .apply(new MaxLaneFlow())
      .apply(BigQueryIO.writeTableRows().to(tableRef).withSchema(FormatMaxesFn.getSchema()));

  // Run the pipeline.
  PipelineResult result = pipeline.run();

  // ExampleUtils will try to cancel the pipeline and the injector before the program exits.
  exampleUtils.waitToFinish(result);
}
 
Example 30
Source Project: beam   Source File: TriggerExample.java    License: Apache License 2.0
public static void main(String[] args) throws Exception {
  TrafficFlowOptions options =
      PipelineOptionsFactory.fromArgs(args).withValidation().as(TrafficFlowOptions.class);
  options.setStreaming(true);

  options.setBigQuerySchema(getSchema());

  ExampleUtils exampleUtils = new ExampleUtils(options);
  exampleUtils.setup();

  Pipeline pipeline = Pipeline.create(options);

  TableReference tableRef =
      getTableReference(
          options.getProject(), options.getBigQueryDataset(), options.getBigQueryTable());

  PCollectionList<TableRow> resultList =
      pipeline
          .apply("ReadMyFile", TextIO.read().from(options.getInput()))
          .apply("InsertRandomDelays", ParDo.of(new InsertDelays()))
          .apply(ParDo.of(new ExtractFlowInfo()))
          .apply(new CalculateTotalFlow(options.getWindowDuration()));

  for (int i = 0; i < resultList.size(); i++) {
    resultList.get(i).apply(BigQueryIO.writeTableRows().to(tableRef).withSchema(getSchema()));
  }

  PipelineResult result = pipeline.run();

  // ExampleUtils will try to cancel the pipeline and the injector before the program exits.
  exampleUtils.waitToFinish(result);
}