org.apache.beam.sdk.io.gcp.pubsub.PubsubIO Java Examples

The following examples show how to use org.apache.beam.sdk.io.gcp.pubsub.PubsubIO. Each example is taken from an open source project; the source file, project, and license are noted above the example code.
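Before the individual examples, here is a minimal, self-contained sketch of the most common PubsubIO pattern: reading strings from a Pub/Sub subscription and writing them back out to a topic in a streaming pipeline. The subscription and topic names are placeholders, and the pipeline simply forwards messages unchanged.

import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.io.gcp.pubsub.PubsubIO;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.options.StreamingOptions;

public class MinimalPubsubPassthrough {
  public static void main(String[] args) {
    // Pub/Sub is an unbounded source, so the pipeline must run in streaming mode.
    StreamingOptions options =
        PipelineOptionsFactory.fromArgs(args).withValidation().as(StreamingOptions.class);
    options.setStreaming(true);

    Pipeline pipeline = Pipeline.create(options);
    pipeline
        // Placeholder subscription; replace with your own.
        .apply("Read from Pub/Sub", PubsubIO.readStrings()
            .fromSubscription("projects/my-project/subscriptions/my-subscription"))
        // Placeholder topic; replace with your own.
        .apply("Write to Pub/Sub", PubsubIO.writeStrings()
            .to("projects/my-project/topics/my-topic"));

    pipeline.run();
  }
}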
Example #1
Source File: DatastoreToPubsub.java    From DataflowTemplates with Apache License 2.0
/**
 * Runs a pipeline which reads Entities from Datastore, passes the JSON-encoded Entities
 * to a Javascript UDF, and sends the resulting JSON to a Pubsub topic.
 *
 * @param args arguments to the pipeline
 */
public static void main(String[] args) {
  DatastoreToPubsubOptions options = PipelineOptionsFactory.fromArgs(args)
      .withValidation()
      .as(DatastoreToPubsubOptions.class);

  Pipeline pipeline = Pipeline.create(options);

  pipeline
      .apply(ReadJsonEntities.newBuilder()
          .setGqlQuery(options.getDatastoreReadGqlQuery())
          .setProjectId(options.getDatastoreReadProjectId())
          .setNamespace(options.getDatastoreReadNamespace())
          .build())
      .apply(TransformTextViaJavascript.newBuilder()
          .setFileSystemPath(options.getJavascriptTextTransformGcsPath())
          .setFunctionName(options.getJavascriptTextTransformFunctionName())
          .build())
      .apply(PubsubIO.writeStrings()
          .to(options.getPubsubWriteTopic()));

  pipeline.run();
}
 
Example #2
Source File: PubSubToGCS.java    From java-docs-samples with Apache License 2.0
public static void main(String[] args) throws IOException {
  // The maximum number of shards when writing output.
  int numShards = 1;

  PubSubToGCSOptions options = PipelineOptionsFactory
    .fromArgs(args)
    .withValidation()
    .as(PubSubToGCSOptions.class);

  options.setStreaming(true);

  Pipeline pipeline = Pipeline.create(options);

  pipeline
    // 1) Read string messages from a Pub/Sub topic.
    .apply("Read PubSub Messages", PubsubIO.readStrings().fromTopic(options.getInputTopic()))
    // 2) Group the messages into fixed-size windows of the configured number of minutes.
    .apply(Window.into(FixedWindows.of(Duration.standardMinutes(options.getWindowSize()))))
    // 3) Write one file to GCS for every window of messages.
    .apply("Write Files to GCS", new WriteOneFilePerWindow(options.getOutput(), numShards));

  // Execute the pipeline and wait until it finishes running.
  pipeline.run().waitUntilFinish();
}
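
The PubSubToGCSOptions interface used above is defined elsewhere in the sample. A plausible sketch of what it declares, following the standard Beam options pattern, is shown below; the descriptions, defaults, and validation annotations here are assumptions rather than the sample's exact definition.

import org.apache.beam.sdk.options.Default;
import org.apache.beam.sdk.options.Description;
import org.apache.beam.sdk.options.StreamingOptions;
import org.apache.beam.sdk.options.Validation;

/** Hypothetical sketch of the options interface the example reads from. */
public interface PubSubToGCSOptions extends StreamingOptions {
  @Description("The Cloud Pub/Sub topic to read from.")
  @Validation.Required
  String getInputTopic();
  void setInputTopic(String value);

  @Description("Output file's window size, in minutes.")
  @Default.Integer(1)
  int getWindowSize();
  void setWindowSize(int value);

  @Description("Path of the output file, including the filename prefix.")
  @Validation.Required
  String getOutput();
  void setOutput(String value);
}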
 
Example #3
Source File: TextToPubsubStream.java    From DataflowTemplates with Apache License 2.0
/**
 * Executes the pipeline with the provided execution
 * parameters.
 *
 * @param options The execution parameters.
 */
public static PipelineResult run(Options options) {
  // Create the pipeline.
  Pipeline pipeline = Pipeline.create(options);

  /*
   * Steps:
   *  1) Read from the text source.
   *  2) Write each text record to Pub/Sub
   */
  pipeline
    .apply(
      "Read Text Data",
      TextIO.read()
        .from(options.getInputFilePattern())
        .watchForNewFiles(DEFAULT_POLL_INTERVAL, Watch.Growth.never()))
    .apply("Write to PubSub", PubsubIO.writeStrings().to(options.getOutputTopic()));

  return pipeline.run();
}
 
Example #4
Source File: TextToPubsub.java    From DataflowTemplates with Apache License 2.0
/**
 * Executes the pipeline with the provided execution
 * parameters.
 *
 * @param options The execution parameters.
 */
public static PipelineResult run(Options options) {
  // Create the pipeline.
  Pipeline pipeline = Pipeline.create(options);

  /*
   * Steps:
   *  1) Read from the text source.
   *  2) Write each text record to Pub/Sub
   */
  pipeline
      .apply("Read Text Data", TextIO.read().from(options.getInputFilePattern()))
      .apply("Write to PubSub", PubsubIO.writeStrings().to(options.getOutputTopic()));

  return pipeline.run();
}
 
Example #5
Source File: PubsubJsonIT.java    From beam with Apache License 2.0
@Test
public void testWritesJsonRowsToPubsub() throws Exception {
  Schema personSchema =
      Schema.builder()
          .addStringField("name")
          .addInt32Field("height")
          .addBooleanField("knowsJavascript")
          .build();
  PCollection<Row> rows =
      pipeline
          .apply(
              Create.of(
                  row(personSchema, "person1", 80, true),
                  row(personSchema, "person2", 70, false),
                  row(personSchema, "person3", 60, true),
                  row(personSchema, "person4", 50, false),
                  row(personSchema, "person5", 40, true)))
          .setRowSchema(personSchema)
          .apply(
              SqlTransform.query(
                  "SELECT name FROM PCOLLECTION AS person WHERE person.knowsJavascript"));

  // Convert rows to JSON and write to pubsub
  rows.apply(ToJson.of()).apply(PubsubIO.writeStrings().to(eventsTopic.topicPath().getPath()));

  pipeline.run().waitUntilFinish(Duration.standardMinutes(5));

  eventsTopic
      .assertThatTopicEventuallyReceives(
          messageLike("{\"name\":\"person1\"}"),
          messageLike("{\"name\":\"person3\"}"),
          messageLike("{\"name\":\"person5\"}"))
      .waitForUpTo(Duration.standardSeconds(20));
}
 
Example #6
Source File: PubsubIOJsonTable.java    From beam with Apache License 2.0
private PubsubIO.Write<PubsubMessage> createPubsubMessageWrite() {
  PubsubIO.Write<PubsubMessage> write = PubsubIO.writeMessages().to(config.getTopic());
  if (config.useTimestampAttribute()) {
    write = write.withTimestampAttribute(config.getTimestampAttribute());
  }
  return write;
}
 
Example #7
Source File: PubsubIOJsonTable.java    From beam with Apache License 2.0
private PubsubIO.Write<PubsubMessage> writeMessagesToDlq() {
  PubsubIO.Write<PubsubMessage> write = PubsubIO.writeMessages().to(config.getDeadLetterQueue());

  return config.useTimestampAttribute()
      ? write.withTimestampAttribute(config.getTimestampAttribute())
      : write;
}
 
Example #8
Source File: PubsubIOJsonTable.java    From beam with Apache License 2.0
private PubsubIO.Read<PubsubMessage> readMessagesWithAttributes() {
  PubsubIO.Read<PubsubMessage> read =
      PubsubIO.readMessagesWithAttributes().fromTopic(config.getTopic());

  return config.useTimestampAttribute()
      ? read.withTimestampAttribute(config.getTimestampAttribute())
      : read;
}
 
Example #9
Source File: NexmarkLauncher.java    From beam with Apache License 2.0
/** Return source of events from Pubsub. */
private PCollection<Event> sourceEventsFromPubsub(Pipeline p) {
  NexmarkUtils.console("Reading events from Pubsub %s", pubsubSubscription);

  PubsubIO.Read<PubsubMessage> io =
      PubsubIO.readMessagesWithAttributes()
          .fromSubscription(pubsubSubscription)
          .withIdAttribute(NexmarkUtils.PUBSUB_ID);
  if (!configuration.usePubsubPublishTime) {
    io = io.withTimestampAttribute(NexmarkUtils.PUBSUB_TIMESTAMP);
  }

  return p.apply(queryName + ".ReadPubsubEvents", io)
      .apply(queryName + ".PubsubMessageToEvent", ParDo.of(new PubsubMessageEventDoFn()));
}
 
Example #10
Source File: StatefulTeamScore.java    From beam with Apache License 2.0
public static void main(String[] args) throws Exception {

    Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
    // Enforce that this pipeline is always run in streaming mode.
    options.setStreaming(true);
    ExampleUtils exampleUtils = new ExampleUtils(options);
    Pipeline pipeline = Pipeline.create(options);

    pipeline
        // Read game events from Pub/Sub using custom timestamps, which are extracted from the
        // pubsub data elements, and parse the data.
        .apply(
            PubsubIO.readStrings()
                .withTimestampAttribute(GameConstants.TIMESTAMP_ATTRIBUTE)
                .fromTopic(options.getTopic()))
        .apply("ParseGameEvent", ParDo.of(new ParseEventFn()))
        // Create <team, GameActionInfo> mapping. UpdateTeamScore uses team name as key.
        .apply(
            "MapTeamAsKey",
            MapElements.into(
                    TypeDescriptors.kvs(
                        TypeDescriptors.strings(), TypeDescriptor.of(GameActionInfo.class)))
                .via((GameActionInfo gInfo) -> KV.of(gInfo.team, gInfo)))
        // Outputs a team's score every time it passes a new multiple of the threshold.
        .apply("UpdateTeamScore", ParDo.of(new UpdateTeamScoreFn(options.getThresholdScore())))
        // Write the results to BigQuery.
        .apply(
            "WriteTeamLeaders",
            new WriteWindowedToBigQuery<>(
                options.as(GcpOptions.class).getProject(),
                options.getDataset(),
                options.getLeaderBoardTableName() + "_team_leader",
                configureCompleteWindowedTableWrite()));

    // Run the pipeline and wait for the pipeline to finish; capture cancellation requests from the
    // command line.
    PipelineResult result = pipeline.run();
    exampleUtils.waitToFinish(result);
  }
 
Example #11
Source File: NexmarkLauncher.java    From beam with Apache License 2.0
/** Send {@code events} to Pubsub. */
private void sinkEventsToPubsub(PCollection<Event> events) {
  checkState(pubsubTopic != null, "Pubsub topic needs to be set up before initializing sink");
  NexmarkUtils.console("Writing events to Pubsub %s", pubsubTopic);

  PubsubIO.Write<PubsubMessage> io =
      PubsubIO.writeMessages().to(pubsubTopic).withIdAttribute(NexmarkUtils.PUBSUB_ID);
  if (!configuration.usePubsubPublishTime) {
    io = io.withTimestampAttribute(NexmarkUtils.PUBSUB_TIMESTAMP);
  }

  events
      .apply(queryName + ".EventToPubsubMessage", ParDo.of(new EventPubsubMessageDoFn()))
      .apply(queryName + ".WritePubsubEvents", io);
}
 
Example #12
Source File: NexmarkLauncher.java    From beam with Apache License 2.0
/** Send {@code formattedResults} to Pubsub. */
private void sinkResultsToPubsub(PCollection<String> formattedResults, long now) {
  String shortTopic = shortTopic(now);
  NexmarkUtils.console("Writing results to Pubsub %s", shortTopic);
  PubsubIO.Write<String> io =
      PubsubIO.writeStrings().to(shortTopic).withIdAttribute(NexmarkUtils.PUBSUB_ID);
  if (!configuration.usePubsubPublishTime) {
    io = io.withTimestampAttribute(NexmarkUtils.PUBSUB_TIMESTAMP);
  }
  formattedResults.apply(queryName + ".WritePubsubResults", io);
}
 
Example #13
Source File: ErrorConverters.java    From DataflowTemplates with Apache License 2.0
@Override
public PDone expand(PCollection<FailsafeElement<String, String>> failedRecords) {

  return failedRecords
      .apply("FailedRecordToPubSubMessage", ParDo.of(new FailedStringToPubsubMessageFn()))
      .apply("WriteFailedRecordsToPubSub", PubsubIO.writeMessages().to(errorRecordsTopic()));
}
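
The FailedStringToPubsubMessageFn referenced above is defined elsewhere in the template and is not shown here. As a rough illustration of the general shape such a conversion takes (a hypothetical sketch, not the template's actual implementation), a DoFn that wraps a string payload into a PubsubMessage could look like this:

import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.Map;
import org.apache.beam.sdk.io.gcp.pubsub.PubsubMessage;
import org.apache.beam.sdk.transforms.DoFn;

/** Hypothetical sketch: wraps a failed record into a PubsubMessage with an error attribute. */
class StringToPubsubMessageFn extends DoFn<String, PubsubMessage> {
  @ProcessElement
  public void processElement(@Element String record, OutputReceiver<PubsubMessage> out) {
    Map<String, String> attributes = new HashMap<>();
    // Illustrative attribute only; the real template attaches its own metadata.
    attributes.put("severity", "ERROR");
    out.output(new PubsubMessage(record.getBytes(StandardCharsets.UTF_8), attributes));
  }
}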
 
Example #14
Source File: PubsubToDatastore.java    From DataflowTemplates with Apache License 2.0
/**
 * Runs a pipeline which reads JSON from Pubsub, feeds the JSON to a Javascript UDF,
 * and writes the JSON-encoded Entities to Datastore.
 *
 * @param args arguments to the pipeline
 */
public static void main(String[] args) {
  PubsubToDatastoreOptions options = PipelineOptionsFactory.fromArgs(args)
      .withValidation()
      .as(PubsubToDatastoreOptions.class);

  Pipeline pipeline = Pipeline.create(options);

  pipeline
      .apply(PubsubIO.readStrings()
          .fromTopic(options.getPubsubReadTopic()))
      .apply(TransformTextViaJavascript.newBuilder()
          .setFileSystemPath(options.getJavascriptTextTransformGcsPath())
          .setFunctionName(options.getJavascriptTextTransformFunctionName())
          .build())
      .apply(WriteJsonEntities.newBuilder()
          .setProjectId(options.getDatastoreWriteProjectId())
          .build());

  pipeline.run();
}
 
Example #15
Source File: PubsubToPubsub.java    From DataflowTemplates with Apache License 2.0
/**
 * Runs the pipeline with the supplied options.
 *
 * @param options The execution parameters to the pipeline.
 * @return The result of the pipeline execution.
 */
public static PipelineResult run(Options options) {
  // Create the pipeline
  Pipeline pipeline = Pipeline.create(options);

  /**
   * Steps:
   *      1) Read PubSubMessage with attributes from input PubSub subscription.
   *      2) Apply any filters if an attribute=value pair is provided.
   *      3) Write each PubSubMessage to output PubSub topic.
   */
  pipeline
      .apply(
          "Read PubSub Events",
          PubsubIO.readMessagesWithAttributes().fromSubscription(options.getInputSubscription()))
      .apply(
          "Filter Events If Enabled",
          ParDo.of(
              ExtractAndFilterEventsFn.newBuilder()
                  .withFilterKey(options.getFilterKey())
                  .withFilterValue(options.getFilterValue())
                  .build()))
      .apply("Write PubSub Events", PubsubIO.writeMessages().to(options.getOutputTopic()));

  // Execute the pipeline and return the result.
  return pipeline.run();
}
 
Example #16
Source File: FhirIOReadIT.java    From beam with Apache License 2.0
@Test
public void testFhirIORead() throws Exception {
  pipeline.getOptions().as(DirectOptions.class).setBlockOnRun(false);

  FhirIO.Read.Result result =
      pipeline
          .apply(PubsubIO.readStrings().fromSubscription(pubsubSubscription))
          .apply(FhirIO.readResources());

  PCollection<String> resources = result.getResources();
  resources.apply(
      "waitForAnyMessage", signal.signalSuccessWhen(resources.getCoder(), anyResources -> true));
  // wait for any resource

  Supplier<Void> start = signal.waitForStart(Duration.standardMinutes(5));
  pipeline.apply(signal.signalStart());
  PipelineResult job = pipeline.run();
  start.get();
  signal.waitForSuccess(Duration.standardSeconds(30));

  // A runner may not support cancel
  try {
    job.cancel();
  } catch (UnsupportedOperationException exc) {
    // noop
  }
}
 
Example #17
Source File: PubsubToText.java    From DataflowTemplates with Apache License 2.0
/**
 * Runs the pipeline with the supplied options.
 *
 * @param options The execution parameters to the pipeline.
 * @return  The result of the pipeline execution.
 */
public static PipelineResult run(Options options) {
  // Create the pipeline
  Pipeline pipeline = Pipeline.create(options);

  /*
   * Steps:
   *   1) Read string messages from PubSub
   *   2) Window the messages into minute intervals specified by the executor.
   *   3) Output the windowed files to GCS
   */
  pipeline
      .apply("Read PubSub Events", PubsubIO.readStrings().fromTopic(options.getInputTopic()))
      .apply(
          options.getWindowDuration() + " Window",
          Window.into(FixedWindows.of(DurationUtils.parseDuration(options.getWindowDuration()))))

      // Apply windowed file writes. Use a NestedValueProvider because the filename
      // policy requires a resourceId generated from the input value at runtime.
      .apply(
          "Write File(s)",
          TextIO.write()
              .withWindowedWrites()
              .withNumShards(options.getNumShards())
              .to(
                  new WindowedFilenamePolicy(
                      options.getOutputDirectory(),
                      options.getOutputFilenamePrefix(),
                      options.getOutputShardTemplate(),
                      options.getOutputFilenameSuffix()))
              .withTempDirectory(NestedValueProvider.of(
                  maybeUseUserTempLocation(
                      options.getUserTempLocation(),
                      options.getOutputDirectory()),
                  (SerializableFunction<String, ResourceId>) input ->
                      FileBasedSink.convertToFileResourceIfPossible(input))));

  // Execute the pipeline and return the result.
  return pipeline.run();
}
 
Example #18
Source File: CdcPCollectionsFetchers.java    From DataflowTemplates with Apache License 2.0
public Map<String, PCollection<Row>> changelogPcollections(Pipeline p) {
  Map<String, PCollection<Row>> result = new HashMap<>();

  List<TopicSubscriptionSchema> readSourceSchemas = buildTopicSubscriptionSchemas(
      options.as(GcpOptions.class).getProject(),
      options.getInputTopics(),
      options.getInputSubscriptions());

  for (TopicSubscriptionSchema rss: readSourceSchemas) {
    String transformTopicPrefix = rss.topic;

    PCollection<PubsubMessage> pubsubData;
    if (rss.subscription == null) {
      pubsubData = p.apply(
          String.format("%s/Read Updates from PubSub", transformTopicPrefix),
          PubsubIO.readMessagesWithAttributes()
              .fromTopic(String.format(
                  "projects/%s/topics/%s",
                  options.as(GcpOptions.class).getProject(), rss.topic)));
    } else {
      pubsubData = p.apply(
          String.format("%s/Read Updates from PubSub", transformTopicPrefix),
          PubsubIO.readMessagesWithAttributes().fromSubscription(String.format(
              "projects/%s/subscriptions/%s",
              options.as(GcpOptions.class).getProject(), rss.subscription)));
    }

    PCollection<Row> collectionOfRows = pubsubData
        .apply(String.format("%s/Extract payload", transformTopicPrefix),
            MapElements.into(TypeDescriptor.of(byte[].class))
                .via(PubsubMessage::getPayload))
        .apply(
            String.format("%s/Decode", transformTopicPrefix),
            DecodeRows.withSchema(rss.schema));

    result.put(transformTopicPrefix, collectionOfRows);
  }
  return result;
}
 
Example #19
Source File: PubSubInputRuntime.java    From components with Apache License 2.0
@Override
public PCollection<IndexedRecord> expand(PBegin in) {
    PCollection<PubsubMessage> pubsubMessages = null;
    if (properties.useMaxNumRecords.getValue() || properties.useMaxReadTime.getValue()) {
        pubsubMessages = in.apply(Create.of(dataset.subscription.getValue())).apply(
                ParDo.of(new BoundedReaderFn(properties, runOnDataflow)));
    } else {// normal
        PubsubIO.Read<PubsubMessage> pubsubRead =
                PubsubIO.readMessages().fromSubscription(String.format("projects/%s/subscriptions/%s",
                        datastore.projectName.getValue(), dataset.subscription.getValue()));
        // PubsubIO.Read is immutable, so the configured transform must be reassigned.
        if (properties.idLabel.getValue() != null && !"".equals(properties.idLabel.getValue())) {
            pubsubRead = pubsubRead.withIdAttribute(properties.idLabel.getValue());
        }
        if (properties.timestampLabel.getValue() != null && !"".equals(properties.timestampLabel.getValue())) {
            pubsubRead = pubsubRead.withTimestampAttribute(properties.timestampLabel.getValue());
        }

        pubsubMessages = in.apply(pubsubRead);
    }

    switch (dataset.valueFormat.getValue()) {
    case AVRO: {
        Schema schema = new Schema.Parser().parse(dataset.avroSchema.getValue());
        return pubsubMessages.apply(ParDo.of(new ConvertToAvro(schema.toString()))).setCoder(
                getDefaultOutputCoder());
    }
    case CSV: {
        return (PCollection<IndexedRecord>) pubsubMessages
                .apply(ParDo.of(new ExtractCsvSplit(dataset.fieldDelimiter.getValue())))
                .apply((PTransform) ConvertToIndexedRecord.of());
    }
    default:
        throw new RuntimeException("To be implemented: " + dataset.valueFormat.getValue());

    }
}
 
Example #20
Source File: PubSubOutputRuntime.java    From components with Apache License 2.0
@Override
public PDone expand(PCollection<IndexedRecord> in) {
    PubSubDatasetProperties dataset = properties.getDatasetProperties();
    PubSubDatastoreProperties datastore = dataset.getDatastoreProperties();

    prepareTopicSubscription(properties);

    PubsubIO.Write<PubsubMessage> pubsubWrite = PubsubIO.writeMessages().to(
            String.format("projects/%s/topics/%s", datastore.projectName.getValue(), dataset.topic.getValue()));

    // PubsubIO.Write is immutable, so the configured transform must be reassigned.
    if (properties.idLabel.getValue() != null && !"".equals(properties.idLabel.getValue())) {
        pubsubWrite = pubsubWrite.withIdAttribute(properties.idLabel.getValue());
    }
    if (properties.timestampLabel.getValue() != null && !"".equals(properties.timestampLabel.getValue())) {
        pubsubWrite = pubsubWrite.withTimestampAttribute(properties.timestampLabel.getValue());
    }

    switch (dataset.valueFormat.getValue()) {
    case CSV: {
        return in.apply(MapElements.via(new FormatCsv(dataset.fieldDelimiter.getValue()))).apply(pubsubWrite);
    }
    case AVRO: {
        return in.apply(MapElements.via(new FormatAvro())).apply(pubsubWrite);
    }
    default:
        throw new RuntimeException("To be implemented: " + dataset.valueFormat.getValue());
    }

}
 
Example #21
Source File: StreamingDataGenerator.java    From DataflowTemplates with Apache License 2.0
/**
 * Runs the pipeline with the specified options. This method does not wait until the
 * pipeline is finished before returning. Invoke {@code result.waitUntilFinish()} on the result
 * object to block until the pipeline is finished running if blocking programmatic execution is
 * required.
 *
 * @param options The execution options.
 * @return The pipeline result.
 */
public static PipelineResult run(StreamingDataGeneratorOptions options) {

  // Create the pipeline
  Pipeline pipeline = Pipeline.create(options);

  /*
   * Steps:
   *  1) Trigger at the supplied QPS
   *  2) Generate messages containing fake data
   *  3) Write messages to Pub/Sub
   */
  pipeline
      .apply(
          "Trigger",
          GenerateSequence.from(0L).withRate(options.getQps(), Duration.standardSeconds(1L)))
      .apply("GenerateMessages", ParDo.of(new MessageGeneratorFn(options.getSchemaLocation())))
      .apply("WriteToPubsub", PubsubIO.writeMessages().to(options.getTopic()));

  return pipeline.run();
}
 
Example #22
Source File: PubsubWordCount.java    From cloud-bigtable-examples with Apache License 2.0
/**
 * <p>Creates a Dataflow pipeline with the following chain:</p>
 * <ol>
 *   <li> Reads from a Cloud Pubsub topic
 *   <li> Windows into fixed windows of 1 minute
 *   <li> Applies the word count transform
 *   <li> Creates Puts from each of the word counts in the array
 *   <li> Performs a Bigtable Put on the items
 * </ol>
 *
 * @param args Arguments used to configure the Dataflow pipeline. The first three are required
 *   when running on managed resources in Google Cloud Platform and should be omitted for LOCAL
 *   runs. The next three arguments configure the Bigtable connection, and the last two are for
 *   Cloud Pubsub.
 *        --runner=DataflowRunner
 *        --project=[dataflow project] \\
 *        --stagingLocation=gs://[your google storage bucket] \\
 *        --bigtableProjectId=[bigtable project] \\
 *        --bigtableInstanceId=[bigtable instance id] \\
 *        --bigtableTableId=[bigtable tableName]
 *        --inputFile=[file path on GCS]
 *        --pubsubTopic=projects/[project name]/topics/[topic name]
 */

public static void main(String[] args) throws Exception {
  // CloudBigtableOptions is one way to retrieve the options.  It's not required.
  BigtablePubsubOptions options =
      PipelineOptionsFactory.fromArgs(args).withValidation().as(BigtablePubsubOptions.class);

  // CloudBigtableTableConfiguration contains the project, instance and table to connect to.
  CloudBigtableTableConfiguration config =
      new CloudBigtableTableConfiguration.Builder()
      .withProjectId(options.getBigtableProjectId())
      .withInstanceId(options.getBigtableInstanceId())
      .withTableId(options.getBigtableTableId())
      .build();

  // DataflowRunner (non-blocking) is forced here so that the pipelines can be cancelled
  // automatically and so that the two jobs can run at the same time.
  options.setRunner(DataflowRunner.class);

  options.as(DataflowPipelineOptions.class).setStreaming(true);
  Pipeline p = Pipeline.create(options);

  FixedWindows window = FixedWindows.of(Duration.standardMinutes(options.getWindowSize()));

  p
      .apply(PubsubIO.readStrings().fromTopic(options.getPubsubTopic()))
      .apply(Window.<String> into(window))
      .apply(ParDo.of(new ExtractWordsFn()))
      .apply(Count.<String> perElement())
      .apply(ParDo.of(MUTATION_TRANSFORM))
      .apply(CloudBigtableIO.writeToTable(config));

  p.run().waitUntilFinish();
  // Start a second job to inject messages into a Cloud Pubsub topic
  injectMessages(options);
}
 
Example #23
Source File: StatefulTeamScore.java    From deployment-examples with MIT License
public static void main(String[] args) throws Exception {

    Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
    // Enforce that this pipeline is always run in streaming mode.
    options.setStreaming(true);
    ExampleUtils exampleUtils = new ExampleUtils(options);
    Pipeline pipeline = Pipeline.create(options);

    pipeline
        // Read game events from Pub/Sub using custom timestamps, which are extracted from the
        // pubsub data elements, and parse the data.
        .apply(
            PubsubIO.readStrings()
                .withTimestampAttribute(GameConstants.TIMESTAMP_ATTRIBUTE)
                .fromTopic(options.getTopic()))
        .apply("ParseGameEvent", ParDo.of(new ParseEventFn()))
        // Create <team, GameActionInfo> mapping. UpdateTeamScore uses team name as key.
        .apply(
            "MapTeamAsKey",
            MapElements.into(
                    TypeDescriptors.kvs(
                        TypeDescriptors.strings(), TypeDescriptor.of(GameActionInfo.class)))
                .via((GameActionInfo gInfo) -> KV.of(gInfo.team, gInfo)))
        // Outputs a team's score every time it passes a new multiple of the threshold.
        .apply("UpdateTeamScore", ParDo.of(new UpdateTeamScoreFn(options.getThresholdScore())))
        // Write the results to BigQuery.
        .apply(
            "WriteTeamLeaders",
            new WriteWindowedToBigQuery<>(
                options.as(GcpOptions.class).getProject(),
                options.getDataset(),
                options.getLeaderBoardTableName() + "_team_leader",
                configureCompleteWindowedTableWrite()));

    // Run the pipeline and wait for the pipeline to finish; capture cancellation requests from the
    // command line.
    PipelineResult result = pipeline.run();
    exampleUtils.waitToFinish(result);
  }
 
Example #24
Source File: Write.java    From gcp-ingestion with Mozilla Public License 2.0
@Override
public WithFailures.Result<PDone, PubsubMessage> expand(PCollection<PubsubMessage> input) {
  PDone done = input //
      .apply(CompressPayload.of(compression).withMaxCompressedBytes(maxCompressedBytes)) //
      .apply(PubsubConstraints.truncateAttributes()) //
      .apply(PubsubIO.writeMessages().to(topic));
  return WithFailures.Result.of(done, EmptyErrors.in(input.getPipeline()));
}
 
Example #25
Source File: Read.java    From gcp-ingestion with Mozilla Public License 2.0
@Override
public PCollection<PubsubMessage> expand(PBegin input) {
  return input //
      .apply(PubsubIO.readMessagesWithAttributesAndMessageId().fromSubscription(subscription))
      .apply(MapElements.into(TypeDescriptor.of(PubsubMessage.class)).via(message -> {
        Map<String, String> attributesWithMessageId = new HashMap<>(message.getAttributeMap());
        attributesWithMessageId.put(Attribute.MESSAGE_ID, message.getMessageId());
        return new PubsubMessage(message.getPayload(), attributesWithMessageId);
      }));
}
 
Example #26
Source File: PubsubWordCount.java    From cloud-bigtable-examples with Apache License 2.0
private static void injectMessages(BigtablePubsubOptions options) {
  String inputFile = options.getInputFile();
  String topic = options.getPubsubTopic();
  DataflowPipelineOptions copiedOptions = options.as(DataflowPipelineOptions.class);
  copiedOptions.setStreaming(false);
  copiedOptions.setNumWorkers(INJECTORNUMWORKERS);
  copiedOptions.setJobName(copiedOptions.getJobName() + "-injector");
  Pipeline injectorPipeline = Pipeline.create(copiedOptions);
  injectorPipeline.apply(TextIO.read().from(inputFile))
      .apply(ParDo.of(new FilterEmptyStringsFn()))
      .apply(PubsubIO.writeStrings().to(topic));
  injectorPipeline.run().waitUntilFinish();
}
 
Example #27
Source File: StreamingBeamSQL.java    From java-docs-samples with Apache License 2.0
public static void main(final String[] args) {
  Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
  options.setStreaming(true);

  var project = options.as(GcpOptions.class).getProject();
  var subscription = ProjectSubscriptionName.of(project, options.getInputSubscription()).toString();

  var schema = Schema.builder()
      .addStringField("url")
      .addDoubleField("page_score")
      .addDateTimeField("processing_time")
      .build();

  var pipeline = Pipeline.create(options);
  pipeline
      // Read, parse, and validate messages from Pub/Sub.
      .apply("Read messages from Pub/Sub", PubsubIO.readStrings().fromSubscription(subscription))
      .apply("Parse JSON into SQL rows", MapElements.into(TypeDescriptor.of(Row.class)).via(message -> {
        // This is a good place to add error handling.
        // The first transform should act as a validation layer to make sure
        // that any data coming to the processing pipeline must be valid.
        // See `MapElements.MapWithFailures` for more details.
        LOG.info("message: {}", message);
        var msg = GSON.fromJson(message, PageReviewMessage.class);
        return Row.withSchema(schema).addValues(
            msg.url,                                    // row url
            msg.review.equals("positive") ? 1.0 : 0.0,  // row page_score
            new Instant()                               // row processing_time
        ).build();
      })).setRowSchema(schema) // make sure to set the row schema for the PCollection

      // Add timestamps and bundle elements into windows.
      .apply("Add processing time", WithTimestamps.of((row) -> row.getDateTime("processing_time").toInstant()))
      .apply("Fixed-size windows", Window.into(FixedWindows.of(Duration.standardMinutes(1))))

      // Apply a SQL query for every window of elements.
      .apply("Run Beam SQL query", SqlTransform.query(
          "SELECT " +
          "  url, " +
          "  COUNT(page_score) AS num_reviews, " +
          "  AVG(page_score) AS score, " +
          "  MIN(processing_time) AS first_date, " +
          "  MAX(processing_time) AS last_date " +
          "FROM PCOLLECTION " +
          "GROUP BY url"
      ))

      // Convert the SQL Rows into BigQuery TableRows and write them to BigQuery.
      .apply("Convert to BigQuery TableRow", MapElements.into(TypeDescriptor.of(TableRow.class)).via(row -> {
        LOG.info("rating summary: {} {} ({} reviews)", row.getDouble("score"), row.getString("url"),
            row.getInt64("num_reviews"));
        return new TableRow()
            .set("url", row.getString("url"))
            .set("num_reviews", row.getInt64("num_reviews"))
            .set("score", row.getDouble("score"))
            .set("first_date", row.getDateTime("first_date").toInstant().toString())
            .set("last_date", row.getDateTime("last_date").toInstant().toString());
      }))
      .apply("Write to BigQuery", BigQueryIO.writeTableRows()
          .to(options.getOutputTable())
          .withSchema(new TableSchema().setFields(Arrays.asList(
              // To learn more about the valid BigQuery types:
              //   https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types
              new TableFieldSchema().setName("url").setType("STRING"),
              new TableFieldSchema().setName("num_reviews").setType("INTEGER"),
              new TableFieldSchema().setName("score").setType("FLOAT64"),
              new TableFieldSchema().setName("first_date").setType("TIMESTAMP"),
              new TableFieldSchema().setName("last_date").setType("TIMESTAMP"))))
          .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED)
          .withWriteDisposition(WriteDisposition.WRITE_APPEND));

  // For a Dataflow Flex Template, do NOT waitUntilFinish().
  pipeline.run();
}
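
The comment in the parsing step above points to MapElements.MapWithFailures for error handling. A hedged sketch of how that could be applied here, routing malformed JSON into a separate failure collection instead of failing the bundle, follows; it reuses the sample's GSON and PageReviewMessage, and the method name and failure format are illustrative assumptions.

import org.apache.beam.sdk.transforms.MapElements;
import org.apache.beam.sdk.transforms.WithFailures;
import org.apache.beam.sdk.values.KV;
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.values.TypeDescriptors;

/** Hypothetical sketch: extract the url field, keeping parse failures for a dead-letter sink. */
static WithFailures.Result<PCollection<String>, KV<String, String>> parseWithFailures(
    PCollection<String> messages) {
  return messages.apply(
      "Parse JSON with failures",
      MapElements.into(TypeDescriptors.strings())
          .via((String message) -> GSON.fromJson(message, PageReviewMessage.class).url)
          .exceptionsInto(
              TypeDescriptors.kvs(TypeDescriptors.strings(), TypeDescriptors.strings()))
          // Keep the raw message together with the exception text.
          .exceptionsVia(ee -> KV.of(ee.element(), ee.exception().getMessage())));
}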
 
Example #28
Source File: SyntheticDataPublisher.java    From beam with Apache License 2.0
private static void writeToPubSub(PCollection<KV<byte[], byte[]>> collection) {
  collection
      .apply("Map to PubSub messages", MapElements.via(new MapBytesToPubSubMessage()))
      .apply("Write to PubSub", PubsubIO.writeMessages().to(options.getPubSubTopic()));
}
 
Example #29
Source File: ControlPipeline.java    From dataflow-opinion-analysis with Apache License 2.0
public static void main(String[] args) throws IOException {
	
	ControlPipelineOptions options = PipelineOptionsFactory.fromArgs(args).withValidation().as(ControlPipelineOptions.class);
	PipelineOptionsFactory.register(ControlPipelineOptions.class);
	
	Pipeline pipeline = Pipeline.create(options);
	
	if (options.isControlGCS()) {
		
		// Read commands from GCS file(s)

		final Bounded<String> read = org.apache.beam.sdk.io.Read.from(
			new RecordFileSource<String>(ValueProvider.StaticValueProvider.of(options.getControlGCSPath()), 
				StringUtf8Coder.of(), RecordFileSource.DEFAULT_RECORD_SEPARATOR));
		pipeline
			.apply("Read", read)
			.apply("Process Commands",ParDo.of(new ProcessCommand()));	

		
	} else if (options.isControlPubsub()){

		options.setStreaming(true);
		
		// Accept commands from a Control Pub/Sub topic
		pipeline
			.apply("Read from control topic",
				PubsubIO.readStrings().fromTopic(options.getControlPubsubTopic()))
			.apply("Process Commands",ParDo.of(new ProcessCommand()));		
		
		
		/* This section will eventually work with the 0.19.0-alpha
		 * and later versions of the idiomatic Java client google-cloud,
		 * but for now this check is removed.

		String subscriptionId = "indexercommands_controller";
		String topicId = options.getControlPubsubTopic();
		String projectId = options.getProject();
		String subscriptionPath = "projects/"+projectId+"/subscriptions/"+subscriptionId;

		// Support legacy way of passing the Control Topic that included the whole path
		if (topicId.startsWith("projects/")) {
			String[] tokens = topicId.split("/");
			topicId = tokens[tokens.length-1];
		}
		
		SubscriptionAdminClient subscriptionAdminClient = SubscriptionAdminClient.create();
		SubscriptionName subscriptionName = SubscriptionName.create(projectId, subscriptionId);
		TopicName topicName = TopicName.create(projectId, topicId);
		
		Subscription subscription;
	    try {
	        subscription = subscriptionAdminClient.getSubscription(subscriptionName);
	    } catch (Exception e) {
	    	subscription = null;
	    }
	    
	    if (subscription == null) {
	    	try {
	    		// create a pull subscription
	    		subscription = subscriptionAdminClient.createSubscription(
	    				subscriptionName, topicName, PushConfig.getDefaultInstance(), 60);
	    	} catch (Exception e) {
	    		LOG.error(e.getMessage());
	    		System.exit(1);
	    	}
	    }
		*/
		/*
		PubsubClient pubsubClient = PubsubJsonClient.FACTORY.newClient(null, null, options.as(DataflowPipelineOptions.class));
		PubsubClient.ProjectPath pp = PubsubClient.projectPathFromPath("projects/"+options.getProject());
		PubsubClient.TopicPath tp = PubsubClient.topicPathFromPath(options.getControlPubsubTopic());
		PubsubClient.SubscriptionPath sp = PubsubClient.subscriptionPathFromPath(subscriptionPath);
		
		List<PubsubClient.SubscriptionPath> l = pubsubClient.listSubscriptions(pp, tp);
		if (!l.contains(sp))
			pubsubClient.createSubscription(tp, sp, 60);
		*/
		
	}
	
	pipeline.run();

}
 
Example #30
Source File: LeaderBoard.java    From beam with Apache License 2.0 4 votes vote down vote up
public static void main(String[] args) throws Exception {

    Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
    // Enforce that this pipeline is always run in streaming mode.
    options.setStreaming(true);
    ExampleUtils exampleUtils = new ExampleUtils(options);
    Pipeline pipeline = Pipeline.create(options);

    // Read game events from Pub/Sub using custom timestamps, which are extracted from the pubsub
    // data elements, and parse the data.
    PCollection<GameActionInfo> gameEvents =
        pipeline
            .apply(
                PubsubIO.readStrings()
                    .withTimestampAttribute(GameConstants.TIMESTAMP_ATTRIBUTE)
                    .fromTopic(options.getTopic()))
            .apply("ParseGameEvent", ParDo.of(new ParseEventFn()));

    gameEvents
        .apply(
            "CalculateTeamScores",
            new CalculateTeamScores(
                Duration.standardMinutes(options.getTeamWindowDuration()),
                Duration.standardMinutes(options.getAllowedLateness())))
        // Write the results to BigQuery.
        .apply(
            "WriteTeamScoreSums",
            new WriteWindowedToBigQuery<>(
                options.as(GcpOptions.class).getProject(),
                options.getDataset(),
                options.getLeaderBoardTableName() + "_team",
                configureWindowedTableWrite()));
    gameEvents
        .apply(
            "CalculateUserScores",
            new CalculateUserScores(Duration.standardMinutes(options.getAllowedLateness())))
        // Write the results to BigQuery.
        .apply(
            "WriteUserScoreSums",
            new WriteToBigQuery<>(
                options.as(GcpOptions.class).getProject(),
                options.getDataset(),
                options.getLeaderBoardTableName() + "_user",
                configureGlobalWindowBigQueryWrite()));

    // Run the pipeline and wait for the pipeline to finish; capture cancellation requests from the
    // command line.
    PipelineResult result = pipeline.run();
    exampleUtils.waitToFinish(result);
  }