Java Code Examples for org.apache.beam.sdk.Pipeline#create()

The following examples show how to use org.apache.beam.sdk.Pipeline#create(). Each example is taken from an open-source project; the source file and license are noted above the code.
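
For orientation, Pipeline#create() has two overloads: a no-argument form that builds a pipeline from default options, and a form that accepts a PipelineOptions instance. The minimal sketch below illustrates both; the class name and sample transforms are placeholders rather than code from any example on this page.

// Minimal sketch of both Pipeline#create() overloads.
import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.options.PipelineOptions;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.transforms.Create;

public class PipelineCreateSketch {
  public static void main(String[] args) {
    // Overload 1: build a pipeline from default options.
    Pipeline defaultPipeline = Pipeline.create();
    defaultPipeline.apply("DefaultOptionsInput", Create.of("a", "b", "c"));

    // Overload 2: build a pipeline from explicitly constructed options, e.g. parsed from args.
    PipelineOptions options = PipelineOptionsFactory.fromArgs(args).withValidation().create();
    Pipeline optionsPipeline = Pipeline.create(options);
    optionsPipeline.apply("ExplicitOptionsInput", Create.of(1, 2, 3));

    // Run the pipeline and block until it finishes.
    optionsPipeline.run().waitUntilFinish();
  }
}
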
Example 1
Source File: DataflowPipelineTranslatorTest.java    From beam with Apache License 2.0
@Test
public void testMultiGraphPipelineSerialization() throws Exception {
  DataflowPipelineOptions options = buildPipelineOptions();
  Pipeline p = Pipeline.create(options);

  PCollection<Integer> input = p.begin().apply(Create.of(1, 2, 3));

  input.apply(new UnrelatedOutputCreator());
  input.apply(new UnboundOutputCreator());

  DataflowPipelineTranslator t =
      DataflowPipelineTranslator.fromOptions(
          PipelineOptionsFactory.as(DataflowPipelineOptions.class));

  // Check that translation doesn't fail.
  SdkComponents sdkComponents = createSdkComponents(options);
  RunnerApi.Pipeline pipelineProto = PipelineTranslation.toProto(p, sdkComponents, true);
  JobSpecification jobSpecification =
      t.translate(
          p,
          pipelineProto,
          sdkComponents,
          DataflowRunner.fromOptions(options),
          Collections.emptyList());
  assertAllStepOutputsHaveUniqueIds(jobSpecification.getJob());
}
 
Example 2
Source File: DisplayDataEvaluator.java    From beam with Apache License 2.0
/**
 * Traverse the specified {@link PTransform}, collecting {@link DisplayData} registered on the
 * inner primitive {@link PTransform PTransforms}.
 *
 * @param root The root {@link PTransform} to traverse
 * @param inputCoder The coder to set for the {@link PTransform} input, or null to infer the
 *     default coder.
 * @return the set of {@link DisplayData} for primitive {@link PTransform PTransforms}.
 */
public <InputT> Set<DisplayData> displayDataForPrimitiveTransforms(
    final PTransform<? super PCollection<InputT>, ? extends POutput> root,
    Coder<InputT> inputCoder) {

  Create.Values<InputT> input;
  if (inputCoder != null) {
    input = Create.empty(inputCoder);
  } else {
    // These types don't actually work, but the pipeline will never be run
    input = (Create.Values<InputT>) Create.empty(VoidCoder.of());
  }

  Pipeline pipeline = Pipeline.create(options);
  pipeline.apply("Input", input).apply("Transform", root);

  return displayDataForPipeline(pipeline, root);
}
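
As a usage note, this evaluator is typically obtained from its static factory and pointed at the composite transform under test. The sketch below shows one plausible call site; the sample ParDo, the class name, and the exact import paths are assumptions for illustration, not code from this file.

// Hedged usage sketch for displayDataForPrimitiveTransforms (illustrative only).
import java.util.Set;
import org.apache.beam.sdk.coders.StringUtf8Coder;
import org.apache.beam.sdk.transforms.DoFn;
import org.apache.beam.sdk.transforms.ParDo;
import org.apache.beam.sdk.transforms.display.DisplayData;
import org.apache.beam.sdk.transforms.display.DisplayDataEvaluator;

public class DisplayDataEvaluatorSketch {
  public static void main(String[] args) {
    DisplayDataEvaluator evaluator = DisplayDataEvaluator.create();

    // Collect display data registered on the primitive transforms inside this ParDo.
    Set<DisplayData> displayData =
        evaluator.displayDataForPrimitiveTransforms(
            ParDo.of(
                new DoFn<String, String>() {
                  @ProcessElement
                  public void processElement(ProcessContext c) {
                    c.output(c.element());
                  }
                }),
            StringUtf8Coder.of());

    System.out.println("Display data items collected: " + displayData.size());
  }
}
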
 
Example 3
Source File: V1WriteIT.java    From beam with Apache License 2.0
/**
 * An end-to-end test for {@link DatastoreV1.Write}.
 *
 * <p>Write some test entities to Cloud Datastore. Read and count all the entities. Verify that
 * the count matches the number of entities written.
 */
@Test
public void testE2EV1Write() throws Exception {
  Pipeline p = Pipeline.create(options);

  // Write to datastore
  p.apply(GenerateSequence.from(0).to(numEntities))
      .apply(ParDo.of(new CreateEntityFn(options.getKind(), options.getNamespace(), ancestor, 0)))
      .apply(DatastoreIO.v1().write().withProjectId(project));

  p.run();

  // Count number of entities written to datastore.
  long numEntitiesWritten = countEntities(options, project, ancestor);

  assertEquals(numEntities, numEntitiesWritten);
}
 
Example 4
Source File: DataflowRunnerTest.java    From beam with Apache License 2.0
/**
 * Tests that the {@link DataflowRunner} with {@code --templateLocation} throws the appropriate
 * exception when an output file is not writable.
 */
@Test
public void testTemplateRunnerLoggedErrorForFile() throws Exception {
  DataflowPipelineOptions options = PipelineOptionsFactory.as(DataflowPipelineOptions.class);
  options.setJobName("TestJobName");
  options.setRunner(DataflowRunner.class);
  options.setTemplateLocation("//bad/path");
  options.setProject("test-project");
  options.setRegion(REGION_ID);
  options.setTempLocation(tmpFolder.getRoot().getPath());
  options.setGcpCredential(new TestCredential());
  options.setPathValidatorClass(NoopPathValidator.class);
  Pipeline p = Pipeline.create(options);

  thrown.expectMessage("Cannot create output file at");
  thrown.expect(RuntimeException.class);
  p.run();
}
 
Example 5
Source File: Task.java    From beam with Apache License 2.0
public static void main(String[] args) {
  PipelineOptions options = PipelineOptionsFactory.fromArgs(args).create();
  Pipeline pipeline = Pipeline.create(options);

  PCollection<String> wordsStartingWithA =
      pipeline.apply("Words starting with A",
          Create.of("apple", "ant", "arrow")
      );

  PCollection<String> wordsStartingWithB =
      pipeline.apply("Words starting with B",
          Create.of("ball", "book", "bow")
      );

  PCollection<String> output = applyTransform(wordsStartingWithA, wordsStartingWithB);

  output.apply(Log.ofElements());

  pipeline.run();
}
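
Note that applyTransform and Log are helpers defined elsewhere in this kata. Given the two word lists, a plausible applyTransform simply merges its inputs; the sketch below is an assumption about that helper, not the task's actual solution.

// Hypothetical applyTransform: merge the two PCollections into one.
// Requires org.apache.beam.sdk.transforms.Flatten and org.apache.beam.sdk.values.PCollectionList.
static PCollection<String> applyTransform(
    PCollection<String> words1, PCollection<String> words2) {
  return PCollectionList.of(words1).and(words2).apply(Flatten.pCollections());
}
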
 
Example 6
Source File: DatastoreToPubsub.java    From DataflowTemplates with Apache License 2.0
/**
 * Runs a pipeline that reads Entities from Datastore, passes the JSON-encoded Entities to a
 * JavaScript UDF, and publishes the resulting JSON to a Pub/Sub topic.
 *
 * @param args arguments to the pipeline
 */
public static void main(String[] args) {
  DatastoreToPubsubOptions options = PipelineOptionsFactory.fromArgs(args)
      .withValidation()
      .as(DatastoreToPubsubOptions.class);

  Pipeline pipeline = Pipeline.create(options);

  pipeline
      .apply(ReadJsonEntities.newBuilder()
          .setGqlQuery(options.getDatastoreReadGqlQuery())
          .setProjectId(options.getDatastoreReadProjectId())
          .setNamespace(options.getDatastoreReadNamespace())
          .build())
      .apply(TransformTextViaJavascript.newBuilder()
          .setFileSystemPath(options.getJavascriptTextTransformGcsPath())
          .setFunctionName(options.getJavascriptTextTransformFunctionName())
          .build())
      .apply(PubsubIO.writeStrings()
          .to(options.getPubsubWriteTopic()));

  pipeline.run();
}
 
Example 7
Source File: TrafficRoutes.java    From beam with Apache License 2.0
public static void runTrafficRoutes(TrafficRoutesOptions options) throws IOException {
  // Using ExampleUtils to set up required resources.
  ExampleUtils exampleUtils = new ExampleUtils(options);
  exampleUtils.setup();

  Pipeline pipeline = Pipeline.create(options);
  TableReference tableRef = new TableReference();
  tableRef.setProjectId(options.getProject());
  tableRef.setDatasetId(options.getBigQueryDataset());
  tableRef.setTableId(options.getBigQueryTable());

  pipeline
      .apply("ReadLines", new ReadFileAndExtractTimestamps(options.getInputFile()))
      // row... => <station route, station speed> ...
      .apply(ParDo.of(new ExtractStationSpeedFn()))
      // map the incoming data stream into sliding windows.
      .apply(
          Window.into(
              SlidingWindows.of(Duration.standardMinutes(options.getWindowDuration()))
                  .every(Duration.standardMinutes(options.getWindowSlideEvery()))))
      .apply(new TrackSpeed())
      .apply(BigQueryIO.writeTableRows().to(tableRef).withSchema(FormatStatsFn.getSchema()));

  // Run the pipeline.
  PipelineResult result = pipeline.run();

  // ExampleUtils will try to cancel the pipeline and the injector before the program exits.
  exampleUtils.waitToFinish(result);
}
 
Example 8
Source File: TimestampExtractTransformTest.java    From beam with Apache License 2.0
@SuppressWarnings("unchecked")
@Test(timeout = 10000)
public void testTransform() {
  Pipeline p = Pipeline.create();
  PCollection<Integer> input = p.apply(Create.of(1, 2, 3));
  PCollection<KV<Integer, Long>> result =
      input.apply(
          TimestampExtractTransform.of(
              in -> CountByKey.of(in).keyBy(KV::getValue, TypeDescriptors.integers()).output()));
  PAssert.that(result).containsInAnyOrder(KV.of(1, 1L), KV.of(2, 1L), KV.of(3, 1L));
  p.run().waitUntilFinish();
}
 
Example 9
Source File: PipelineTranslationTest.java    From beam with Apache License 2.0
@Test
public void testRequirements() {
  Pipeline pipeline = Pipeline.create();
  pipeline.apply(Create.of(1, 2, 3)).apply(ParDo.of(new DoFnRequiringStableInput()));
  RunnerApi.Pipeline pipelineProto = PipelineTranslation.toProto(pipeline, false);
  assertThat(
      pipelineProto.getRequirementsList(), hasItem(ParDoTranslation.REQUIRES_STABLE_INPUT_URN));
}
 
Example 10
Source File: GcsKmsKeyIT.java    From beam with Apache License 2.0
/**
 * Tests writing to tempLocation with --dataflowKmsKey set on the command line. Verifies that the
 * resulting output uses the specified key and is readable. Does not verify any temporary files.
 *
 * <p>This test verifies that GCS file copies work with CMEK-enabled files.
 */
@Test
public void testGcsWriteWithKmsKey() {
  TestPipelineOptions options =
      TestPipeline.testingPipelineOptions().as(TestPipelineOptions.class);
  assertNotNull(options.getTempRoot());
  options.setTempLocation(options.getTempRoot() + "/testGcsWriteWithKmsKey");
  GcsOptions gcsOptions = options.as(GcsOptions.class);

  ResourceId filenamePrefix =
      FileSystems.matchNewResource(gcsOptions.getGcpTempLocation(), true)
          .resolve(
              String.format("GcsKmsKeyIT-%tF-%<tH-%<tM-%<tS-%<tL.output", new Date()),
              StandardResolveOptions.RESOLVE_FILE);

  Pipeline p = Pipeline.create(options);
  p.apply("ReadLines", TextIO.read().from(INPUT_FILE))
      .apply("WriteLines", TextIO.write().to(filenamePrefix));

  PipelineResult result = p.run();
  State state = result.waitUntilFinish();
  assertThat(state, equalTo(State.DONE));

  String filePattern = filenamePrefix + "*-of-*";
  assertThat(new NumberedShardedFile(filePattern), fileContentsHaveChecksum(EXPECTED_CHECKSUM));

  // Verify objects have KMS key set.
  try {
    MatchResult matchResult =
        Iterables.getOnlyElement(FileSystems.match(Collections.singletonList(filePattern)));
    GcsUtil gcsUtil = gcsOptions.getGcsUtil();
    for (Metadata metadata : matchResult.metadata()) {
      String kmsKey =
          gcsUtil.getObject(GcsPath.fromUri(metadata.resourceId().toString())).getKmsKeyName();
      assertNotNull(kmsKey);
    }
  } catch (IOException e) {
    throw new AssertionError(e);
  }
}
 
Example 11
Source File: FlinkSubmissionTest.java    From beam with Apache License 2.0
/** The Flink program which is executed by the CliFrontend. */
public static void main(String[] args) {
  FlinkPipelineOptions options = PipelineOptionsFactory.as(FlinkPipelineOptions.class);
  options.setRunner(FlinkRunner.class);
  options.setStreaming(streaming);
  options.setParallelism(1);
  Pipeline p = Pipeline.create(options);
  p.apply(GenerateSequence.from(0).to(1));
  p.run();
}
 
Example 12
Source File: CsvImport.java    From cloud-bigtable-examples with Apache License 2.0
/**
 * <p>Creates a Dataflow pipeline that reads a CSV file and builds the following chain:</p>
 * <ol>
 * <li> Puts each row of the CSV into the pipeline.
 * <li> Creates a Put object for each row.
 * <li> Writes the Put object to Bigtable.
 * </ol>
 *
 * @param args Arguments used to configure the Dataflow pipeline. The first three are required
 * when running as a managed resource on Google Cloud Platform and should be omitted for LOCAL
 * runs; the next two configure the CSV file; the remaining arguments configure the Bigtable
 * connection:
 * <pre>
 * --runner=BlockingDataflowPipelineRunner
 * --project=[dataflow project]
 * --stagingLocation=gs://[your google storage bucket]
 * --headers=[comma separated list of headers]
 * --inputFile=gs://[your google storage object]
 * --bigtableProject=[bigtable project]
 * --bigtableInstanceId=[bigtable instance id]
 * --bigtableTableId=[bigtable tableName]
 * </pre>
 */

public static void main(String[] args) throws IllegalArgumentException {
  BigtableCsvOptions options =
      PipelineOptionsFactory.fromArgs(args).withValidation().as(BigtableCsvOptions.class);

  if (options.getInputFile().equals("")) {
    throw new IllegalArgumentException("Please provide value for inputFile.");
  }
  if (options.getHeaders().equals("")) {
    throw new IllegalArgumentException("Please provide value for headers.");
  }

  CloudBigtableTableConfiguration config =
      new CloudBigtableTableConfiguration.Builder()
          .withProjectId(options.getBigtableProjectId())
          .withInstanceId(options.getBigtableInstanceId())
          .withTableId(options.getBigtableTableId())
          .build();

  Pipeline p = Pipeline.create(options);

  p.apply("ReadMyFile", TextIO.read().from(options.getInputFile()))
      .apply("TransformParsingsToBigtable", ParDo.of(MUTATION_TRANSFORM))
      .apply("WriteToBigtable", CloudBigtableIO.writeToTable(config));

  p.run().waitUntilFinish();
}
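
MUTATION_TRANSFORM is defined elsewhere in CsvImport; conceptually it is a DoFn that splits each CSV line and emits an HBase Put that CloudBigtableIO can write. The sketch below captures that idea under stated assumptions: the column family, the use of the first column as the row key, and the generated qualifiers are illustrative, while the real sample presumably derives qualifiers from the --headers option.

// Hypothetical stand-in for MUTATION_TRANSFORM (illustrative only, not the sample's actual mapping).
// Requires org.apache.hadoop.hbase.client.Mutation, org.apache.hadoop.hbase.client.Put,
// and org.apache.hadoop.hbase.util.Bytes.
static final DoFn<String, Mutation> MUTATION_TRANSFORM_SKETCH =
    new DoFn<String, Mutation>() {
      @ProcessElement
      public void processElement(ProcessContext c) {
        byte[] family = Bytes.toBytes("csv"); // assumed column family
        String[] values = c.element().split(",");
        Put row = new Put(Bytes.toBytes(values[0])); // assume the first column is the row key
        for (int i = 1; i < values.length; i++) {
          row.addColumn(family, Bytes.toBytes("field" + i), Bytes.toBytes(values[i]));
        }
        c.output(row);
      }
    };
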
 
Example 13
Source File: DatastoreToBigQuery.java    From DataflowTemplates with Apache License 2.0
/**
 * Runs a pipeline that reads Entities from Datastore, passes the JSON-encoded Entities to a
 * JavaScript UDF that returns JSON conforming to the BigQuery TableRow spec, and writes the
 * resulting TableRows to BigQuery.
 *
 * @param args arguments to the pipeline
 */
public static void main(String[] args) {
  DatastoreToBigQueryOptions options = PipelineOptionsFactory.fromArgs(args)
      .withValidation()
      .as(DatastoreToBigQueryOptions.class);

  Pipeline pipeline = Pipeline.create(options);

  pipeline
      .apply(
          ReadJsonEntities.newBuilder()
              .setGqlQuery(options.getDatastoreReadGqlQuery())
              .setProjectId(options.getDatastoreReadProjectId())
              .setNamespace(options.getDatastoreReadNamespace())
              .build())
      .apply(
          TransformTextViaJavascript.newBuilder()
              .setFileSystemPath(options.getJavascriptTextTransformGcsPath())
              .setFunctionName(options.getJavascriptTextTransformFunctionName())
              .build())
      .apply(BigQueryConverters.jsonToTableRow())
      .apply(
          "WriteBigQuery",
          BigQueryIO.writeTableRows()
              .withoutValidation()
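              // Note: the CREATE_NEVER disposition on the next line is later overridden by the
              // withCreateDisposition(CREATE_IF_NEEDED) call below; the last setting wins.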
              .withCreateDisposition(BigQueryIO.Write.CreateDisposition.CREATE_NEVER)
              .to(options.getOutputTableSpec())
              .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED)
              .withWriteDisposition(WriteDisposition.WRITE_TRUNCATE)
              .withCustomGcsTempLocation(options.getBigQueryLoadingTemporaryDirectory()));

  pipeline.run();
}
 
Example 14
Source File: ParDoTest.java    From beam with Apache License 2.0
@BeforeClass
public static void beforeClass() {
  SparkStructuredStreamingPipelineOptions options =
      PipelineOptionsFactory.create().as(SparkStructuredStreamingPipelineOptions.class);
  options.setRunner(SparkStructuredStreamingRunner.class);
  options.setTestMode(true);
  pipeline = Pipeline.create(options);
}
 
Example 15
Source File: TextStreamingPipeline.java    From dlp-dataflow-deidentification with Apache License 2.0
public static void main(String[] args) throws IOException, GeneralSecurityException {

    TokenizePipelineOptions options =
        PipelineOptionsFactory.fromArgs(args).withValidation().as(TokenizePipelineOptions.class);

    Pipeline p = Pipeline.create(options);
    p.apply(
            FileIO.match()
                .filepattern(options.getInputFile())
                .continuously(
                    Duration.standardSeconds(options.getPollingInterval()), Watch.Growth.never()))
        .apply(FileIO.readMatches().withCompression(Compression.UNCOMPRESSED))
        .apply(
            "Text File Reader",
            ParDo.of(
                new TextFileReader(
                    options.as(GcpOptions.class).getProject(),
                    options.getFileDecryptKeyName(),
                    options.getFileDecryptKey(),
                    options.getBatchSize(),
                    options.getCsek(),
                    options.getCsekhash())))
        .apply(
            "Tokenize Data",
            ParDo.of(
                new TokenizeData(
                    options.as(GcpOptions.class).getProject(),
                    options.getDeidentifyTemplateName(),
                    options.getInspectTemplateName())))
        .apply(
            Window.<String>into(FixedWindows.of(Duration.standardMinutes(options.getInterval()))))
        .apply(new WriteOneFilePerWindow(options.getOutputFile(), 1));

    p.run();
  }
 
Example 16
Source File: ResumeFromCheckpointStreamingTest.java    From beam with Apache License 2.0
private SparkPipelineResult run(Optional<Instant> stopWatermarkOption, int expectedAssertions) {
  KafkaIO.Read<String, Instant> read =
      KafkaIO.<String, Instant>read()
          .withBootstrapServers(EMBEDDED_KAFKA_CLUSTER.getBrokerList())
          .withTopics(Collections.singletonList(TOPIC))
          .withKeyDeserializer(StringDeserializer.class)
          .withValueDeserializer(InstantDeserializer.class)
          .withConsumerConfigUpdates(ImmutableMap.of("auto.offset.reset", "earliest"))
          .withTimestampFn(KV::getValue)
          .withWatermarkFn(
              kv -> {
                // at EOF move WM to infinity.
                String key = kv.getKey();
                Instant instant = kv.getValue();
                return "EOF".equals(key) ? BoundedWindow.TIMESTAMP_MAX_VALUE : instant;
              });

  TestSparkPipelineOptions options =
      PipelineOptionsFactory.create().as(TestSparkPipelineOptions.class);
  options.setSparkMaster("local[*]");
  options.setCheckpointDurationMillis(options.getBatchIntervalMillis());
  options.setExpectedAssertions(expectedAssertions);
  options.setRunner(TestSparkRunner.class);
  options.setEnableSparkMetricSinks(false);
  options.setForceStreaming(true);
  options.setCheckpointDir(temporaryFolder.getRoot().getPath());
  // timeout is per execution so it can be injected by the caller.
  if (stopWatermarkOption.isPresent()) {
    options.setStopPipelineWatermark(stopWatermarkOption.get().getMillis());
  }

  Pipeline p = Pipeline.create(options);

  PCollection<String> expectedCol =
      p.apply(Create.of(ImmutableList.of("side1", "side2")).withCoder(StringUtf8Coder.of()));
  PCollectionView<List<String>> view = expectedCol.apply(View.asList());

  PCollection<KV<String, Instant>> kafkaStream = p.apply(read.withoutMetadata());

  PCollection<Iterable<String>> grouped =
      kafkaStream
          .apply(Keys.create())
          .apply("EOFShallNotPassFn", ParDo.of(new EOFShallNotPassFn(view)).withSideInputs(view))
          .apply(
              Window.<String>into(FixedWindows.of(Duration.millis(500)))
                  .triggering(AfterWatermark.pastEndOfWindow())
                  .accumulatingFiredPanes()
                  .withAllowedLateness(Duration.ZERO))
          .apply(WithKeys.of(1))
          .apply(GroupByKey.create())
          .apply(Values.create());

  grouped.apply(new PAssertWithoutFlatten<>("k1", "k2", "k3", "k4", "k5"));

  return (SparkPipelineResult) p.run();
}
 
Example 17
Source File: CassandraToBigtable.java    From DataflowTemplates with Apache License 2.0
/**
 * Runs a pipeline to copy one Cassandra table to Cloud Bigtable.
 *
 * @param args arguments to the pipeline
 */
public static void main(String[] args) {

  Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);

  // Split the Cassandra Hosts value provider into a list value provider.
  ValueProvider.NestedValueProvider<List<String>, String> hosts =
      ValueProvider.NestedValueProvider.of(
          options.getCassandraHosts(),
          (SerializableFunction<String, List<String>>) value -> Arrays.asList(value.split(",")));

  Pipeline p = Pipeline.create(options);

  // Create a mapper factory that injects the CassandraRowMapperFn to allow custom type mapping.
  SerializableFunction<Session, Mapper> cassandraObjectMapperFactory =
      new CassandraRowMapperFactory(options.getCassandraTable(), options.getCassandraKeyspace());

  CassandraIO.Read<Row> source =
      CassandraIO.<Row>read()
          .withHosts(hosts)
          .withPort(options.getCassandraPort())
          .withKeyspace(options.getCassandraKeyspace())
          .withTable(options.getCassandraTable())
          .withMapperFactoryFn(cassandraObjectMapperFactory)
          .withEntity(Row.class)
          .withCoder(SerializableCoder.of(Row.class));

  BigtableIO.Write sink =
      BigtableIO.write()
          .withProjectId(options.getBigtableProjectId())
          .withInstanceId(options.getBigtableInstanceId())
          .withTableId(options.getBigtableTableId());

  p.apply("Read from Cassandra", source)
      .apply(
          "Convert Row",
          ParDo.of(
              BeamRowToBigtableFn.createWithSplitLargeRows(
                  options.getRowKeySeparator(),
                  options.getDefaultColumnFamily(),
                  options.getSplitLargeRows(),
                  BeamRowToBigtableFn.MAX_MUTATION_PER_REQUEST)))
      .apply("Write to Bigtable", sink);
  p.run();
}
 
Example 18
Source File: BigQueryNestedRecordsIT.java    From beam with Apache License 2.0
private static void runPipeline(Options options) throws Exception {
  // Create flattened and unflattened collections via Dataflow, via normal and side input
  // paths.
  Pipeline p = Pipeline.create(options);
  BigQueryOptions bigQueryOptions = options.as(BigQueryOptions.class);

  PCollection<TableRow> flattenedCollection =
      p.apply("ReadFlattened", BigQueryIO.readTableRows().fromQuery(options.getInput()));
  PCollection<TableRow> nonFlattenedCollection =
      p.apply(
          "ReadNonFlattened",
          BigQueryIO.readTableRows().fromQuery(options.getInput()).withoutResultFlattening());
  PCollection<TableRow> unflattenableCollection =
      p.apply(
          "ReadUnflattenable",
          BigQueryIO.readTableRows()
              .fromQuery(options.getUnflattenableInput())
              .withoutResultFlattening());

  // Also query BigQuery directly.
  BigqueryClient bigQueryClient = new BigqueryClient(bigQueryOptions.getAppName());

  TableRow queryFlattenedTyped =
      bigQueryClient
          .queryWithRetries(options.getInput(), bigQueryOptions.getProject(), true)
          .getRows()
          .get(0);

  TableRow queryUnflattened =
      bigQueryClient
          .queryUnflattened(options.getInput(), bigQueryOptions.getProject(), true)
          .get(0);

  TableRow queryUnflattenable =
      bigQueryClient
          .queryUnflattened(options.getUnflattenableInput(), bigQueryOptions.getProject(), true)
          .get(0);

  // Verify that the results are the same.
  PAssert.thatSingleton(flattenedCollection).isEqualTo(queryFlattenedTyped);
  PAssert.thatSingleton(nonFlattenedCollection).isEqualTo(queryUnflattened);
  PAssert.thatSingleton(unflattenableCollection).isEqualTo(queryUnflattenable);

  PAssert.thatSingleton(flattenedCollection).notEqualTo(queryUnflattened);
  p.run().waitUntilFinish();
}
 
Example 19
Source File: WindowRuntimeTest.java    From components with Apache License 2.0
@Test
public void testSessionWindow() {
    PipelineOptions options = PipelineOptionsFactory.create();
    options.setRunner(DirectRunner.class);
    final Pipeline p = Pipeline.create(options);

    // Create a PCollection<IndexedRecord> whose elements carry different timestamps.
    List<TimestampedValue<IndexedRecord>> data = Arrays.asList( //
            TimestampedValue.of(irA, new Instant(0L)), //
            TimestampedValue.of(irB, new Instant(0L)), //
            TimestampedValue.of(irC, new Instant(1L)), //
            TimestampedValue.of(irA, new Instant(2L)), //
            TimestampedValue.of(irA, new Instant(2L)), //
            TimestampedValue.of(irB, new Instant(2L)), //
            TimestampedValue.of(irB, new Instant(30L)), //
            TimestampedValue.of(irA, new Instant(30L)), //
            TimestampedValue.of(irA, new Instant(50L)), //
            TimestampedValue.of(irC, new Instant(55L)), //
            TimestampedValue.of(irA, new Instant(59L)));

    Create.TimestampedValues<IndexedRecord> pt = Create.timestamped(data);
    pt = (Create.TimestampedValues<IndexedRecord>) pt.withCoder(LazyAvroCoder.of());
    PCollection<IndexedRecord> input = p.apply(pt);

    WindowProperties windowProperties = new WindowProperties("window");
    windowProperties.setValue("windowLength", 10);
    windowProperties.setValue("windowSlideLength", -1);
    windowProperties.setValue("windowSession", true);

    WindowRuntime windowRun = new WindowRuntime();
    windowRun.initialize(null, windowProperties);

    PCollection<IndexedRecord> test = windowRun.expand(input);

    PCollection<KV<IndexedRecord, Long>> windowed_counts = test.apply(Count.<IndexedRecord> perElement());

    // Expected element counts per session window (gap duration: 10).
    PAssert.that(windowed_counts).containsInAnyOrder( //
            KV.of(irA, 3L), //
            KV.of(irB, 2L), //
            KV.of(irC, 1L), //

            KV.of(irB, 1L), //
            KV.of(irA, 1L), //

            KV.of(irA, 2L), //
            KV.of(irC, 1L));

    p.run();
}
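
For reference, with windowSession set to true and windowLength set to 10, this component's expansion behaves like Beam's built-in session windowing. The sketch below is a rough plain-Beam counterpart, treating the gap as 10 milliseconds to match the Instant timestamps above; it is an assumption about WindowRuntime's internals, not its actual implementation.

// Approximate plain-Beam equivalent of the session-window configuration above (an assumption).
// Requires org.apache.beam.sdk.transforms.windowing.Window, org.apache.beam.sdk.transforms.windowing.Sessions,
// and org.joda.time.Duration.
PCollection<IndexedRecord> sessionWindowed =
    input.apply(Window.<IndexedRecord>into(Sessions.withGapDuration(Duration.millis(10))));
PCollection<KV<IndexedRecord, Long>> counts =
    sessionWindowed.apply(Count.<IndexedRecord>perElement());
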
 
Example 20
Source File: KafkaToBigQuery.java    From java-docs-samples with Apache License 2.0
public static void main(final String[] args) {
  Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
  options.setStreaming(true);

  var pipeline = Pipeline.create(options);
  pipeline
      .apply("Read messages from Kafka",
          KafkaIO.<String, String>read()
              .withBootstrapServers(options.getBootstrapServer())
              .withTopic(options.getInputTopic())
              .withKeyDeserializer(StringDeserializer.class)
              .withValueDeserializer(StringDeserializer.class)
              .withoutMetadata())
      .apply("Get message contents", Values.<String>create())
      .apply("Log messages", MapElements.into(TypeDescriptor.of(String.class))
          .via(message -> {
            LOG.info("Received: {}", message);
            return message;
          }))
      .apply("Parse JSON", MapElements.into(TypeDescriptor.of(PageRating.class))
          .via(message -> GSON.fromJson(message, PageRating.class)))

      .apply("Add processing time", WithTimestamps.of((pageRating) -> new Instant(pageRating.processingTime)))
      .apply("Fixed-size windows", Window.into(FixedWindows.of(Duration.standardMinutes(1))))

      .apply("Convert to BigQuery TableRow", MapElements.into(TypeDescriptor.of(TableRow.class))
          .via(pageRating -> new TableRow()
              .set("processing_time", pageRating.processingTime.toString())
              .set("url", pageRating.url)
              .set("rating", pageRating.rating)))
      .apply("Write to BigQuery", BigQueryIO.writeTableRows()
          .to(options.getOutputTable())
          .withSchema(new TableSchema().setFields(Arrays.asList(
              new TableFieldSchema().setName("processing_time").setType("TIMESTAMP"),
              new TableFieldSchema().setName("url").setType("STRING"),
              new TableFieldSchema().setName("rating").setType("STRING"))))
          .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED)
          .withWriteDisposition(WriteDisposition.WRITE_APPEND));

  // For a Dataflow Flex Template, do NOT waitUntilFinish().
  pipeline.run();
}