Java Code Examples for org.apache.beam.sdk.Pipeline#run()

The following examples show how to use org.apache.beam.sdk.Pipeline#run(). The project and source file each example was taken from are noted above it.
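
All of these examples follow the same basic pattern: build PipelineOptions, create a Pipeline, apply one or more transforms, and finally call run(). For reference, here is a minimal sketch of that pattern (imports are omitted, as in the examples below, and the Create transform is only an illustrative placeholder):

public static void main(String[] args) {
  // Parse any command-line arguments into PipelineOptions.
  PipelineOptions options = PipelineOptionsFactory.fromArgs(args).create();
  Pipeline pipeline = Pipeline.create(options);

  // Apply whatever transforms the job needs before running.
  pipeline.apply(Create.of("a", "b", "c"));

  // Depending on the runner, run() may return before the pipeline finishes;
  // waitUntilFinish() blocks until it reaches a terminal state.
  PipelineResult result = pipeline.run();
  result.waitUntilFinish();
}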
Example 1
Source File: Task.java    From beam with Apache License 2.0
public static void main(String[] args) {
  PipelineOptions options = PipelineOptionsFactory.fromArgs(args).create();
  Pipeline pipeline = Pipeline.create(options);

  PCollection<Event> events =
      pipeline.apply(
          Create.of(
              new Event("1", "book-order", DateTime.parse("2019-06-01T00:00:00+00:00")),
              new Event("2", "pencil-order", DateTime.parse("2019-06-02T00:00:00+00:00")),
              new Event("3", "paper-order", DateTime.parse("2019-06-03T00:00:00+00:00")),
              new Event("4", "pencil-order", DateTime.parse("2019-06-04T00:00:00+00:00")),
              new Event("5", "book-order", DateTime.parse("2019-06-05T00:00:00+00:00"))
          )
      );

  PCollection<Event> output = applyTransform(events);

  output.apply(Log.ofElements());

  pipeline.run();
}
 
Example 2
Source File: DataflowRunnerTest.java    From beam with Apache License 2.0
@Test
public void testUploadGraph() throws IOException {
  DataflowPipelineOptions options = buildPipelineOptions();
  options.setExperiments(Arrays.asList("upload_graph"));
  Pipeline p = buildDataflowPipeline(options);
  DataflowPipelineJob job = (DataflowPipelineJob) p.run();

  ArgumentCaptor<Job> jobCaptor = ArgumentCaptor.forClass(Job.class);
  Mockito.verify(mockJobs).create(eq(PROJECT_ID), eq(REGION_ID), jobCaptor.capture());
  assertValidJob(jobCaptor.getValue());
  assertTrue(jobCaptor.getValue().getSteps().isEmpty());
  assertTrue(
      jobCaptor
          .getValue()
          .getStepsLocation()
          .startsWith("gs://valid-bucket/temp/staging/dataflow_graph"));
}
 
Example 3
Source File: ReadSourceStreamingTest.java    From beam with Apache License 2.0
private static void runProgram(String resultPath) {

    Pipeline p = FlinkTestPipeline.createForStreaming();

    p.apply(GenerateSequence.from(0).to(10))
        .apply(
            ParDo.of(
                new DoFn<Long, String>() {
                  @ProcessElement
                  public void processElement(ProcessContext c) throws Exception {
                    c.output(c.element().toString());
                  }
                }))
        .apply(TextIO.write().to(resultPath));

    p.run();
  }
 
Example 4
Source File: Task.java    From beam with Apache License 2.0
public static void main(String[] args) {
  PipelineOptions options = PipelineOptionsFactory.fromArgs(args).create();
  Pipeline pipeline = Pipeline.create(options);

  PCollection<BigInteger> numbers =
      pipeline.apply(
          Create.of(
              BigInteger.valueOf(10), BigInteger.valueOf(20), BigInteger.valueOf(30),
              BigInteger.valueOf(40), BigInteger.valueOf(50)
          ));

  PCollection<BigInteger> output = applyTransform(numbers);

  output.apply(Log.ofElements());

  pipeline.run();
}
 
Example 5
Source File: DataflowRunnerTest.java    From beam with Apache License 2.0
/**
 * Tests that the {@link DataflowRunner} with {@code --templateLocation} throws the appropriate
 * exception when an output file is not writable.
 */
@Test
public void testTemplateRunnerLoggedErrorForFile() throws Exception {
  DataflowPipelineOptions options = PipelineOptionsFactory.as(DataflowPipelineOptions.class);
  options.setJobName("TestJobName");
  options.setRunner(DataflowRunner.class);
  options.setTemplateLocation("//bad/path");
  options.setProject("test-project");
  options.setRegion(REGION_ID);
  options.setTempLocation(tmpFolder.getRoot().getPath());
  options.setGcpCredential(new TestCredential());
  options.setPathValidatorClass(NoopPathValidator.class);
  Pipeline p = Pipeline.create(options);

  thrown.expectMessage("Cannot create output file at");
  thrown.expect(RuntimeException.class);
  p.run();
}
 
Example 6
Source File: DataflowRunnerTest.java    From beam with Apache License 2.0
/**
 * Tests that the {@link DataflowRunner} with {@code --templateLocation} returns normally when the
 * runner is successfully run with the upload_graph experiment turned on. The resulting template
 * should not contain raw steps, and the stepsLocation field should be set.
 */
@Test
public void testTemplateRunnerWithUploadGraph() throws Exception {
  File existingFile = tmpFolder.newFile();
  DataflowPipelineOptions options = PipelineOptionsFactory.as(DataflowPipelineOptions.class);
  options.setExperiments(Arrays.asList("upload_graph"));
  options.setJobName("TestJobName");
  options.setGcpCredential(new TestCredential());
  options.setPathValidatorClass(NoopPathValidator.class);
  options.setProject("test-project");
  options.setRegion(REGION_ID);
  options.setRunner(DataflowRunner.class);
  options.setTemplateLocation(existingFile.getPath());
  options.setTempLocation(tmpFolder.getRoot().getPath());
  Pipeline p = Pipeline.create(options);
  p.apply(Create.of(ImmutableList.of(1)));
  p.run();
  expectedLogs.verifyInfo("Template successfully created");
  ObjectMapper objectMapper = new ObjectMapper();
  JsonNode node = objectMapper.readTree(existingFile);
  assertEquals(0, node.get("steps").size());
  assertNotNull(node.get("stepsLocation"));
}
 
Example 7
Source File: TextToPubsubStream.java    From DataflowTemplates with Apache License 2.0
/**
 * Executes the pipeline with the provided execution
 * parameters.
 *
 * @param options The execution parameters.
 */
public static PipelineResult run(Options options) {
  // Create the pipeline.
  Pipeline pipeline = Pipeline.create(options);

  /*
   * Steps:
   *  1) Read from the text source.
   *  2) Write each text record to Pub/Sub
   */
  pipeline
    .apply(
      "Read Text Data",
      TextIO.read()
        .from(options.getInputFilePattern())
        .watchForNewFiles(DEFAULT_POLL_INTERVAL, Watch.Growth.never()))
    .apply("Write to PubSub", PubsubIO.writeStrings().to(options.getOutputTopic()));

  return pipeline.run();
}
 
Example 8
Source File: TrafficRoutes.java    From beam with Apache License 2.0
public static void runTrafficRoutes(TrafficRoutesOptions options) throws IOException {
  // Using ExampleUtils to set up required resources.
  ExampleUtils exampleUtils = new ExampleUtils(options);
  exampleUtils.setup();

  Pipeline pipeline = Pipeline.create(options);
  TableReference tableRef = new TableReference();
  tableRef.setProjectId(options.getProject());
  tableRef.setDatasetId(options.getBigQueryDataset());
  tableRef.setTableId(options.getBigQueryTable());

  pipeline
      .apply("ReadLines", new ReadFileAndExtractTimestamps(options.getInputFile()))
      // row... => <station route, station speed> ...
      .apply(ParDo.of(new ExtractStationSpeedFn()))
      // map the incoming data stream into sliding windows.
      .apply(
          Window.into(
              SlidingWindows.of(Duration.standardMinutes(options.getWindowDuration()))
                  .every(Duration.standardMinutes(options.getWindowSlideEvery()))))
      .apply(new TrackSpeed())
      .apply(BigQueryIO.writeTableRows().to(tableRef).withSchema(FormatStatsFn.getSchema()));

  // Run the pipeline.
  PipelineResult result = pipeline.run();

  // ExampleUtils will try to cancel the pipeline and the injector before the program exits.
  exampleUtils.waitToFinish(result);
}
 
Example 9
Source File: BeamEnumerableConverter.java    From beam with Apache License 2.0
private static PipelineResult limitRun(
    PipelineOptions options,
    BeamRelNode node,
    DoFn<Row, Void> doFn,
    Queue<Row> values,
    int limitCount) {
  options.as(DirectOptions.class).setBlockOnRun(false);
  Pipeline pipeline = Pipeline.create(options);
  PCollection<Row> resultCollection = BeamSqlRelUtils.toPCollection(pipeline, node);
  resultCollection.apply(ParDo.of(doFn));

  PipelineResult result = pipeline.run();

  State state;
  while (true) {
    // Check the pipeline state every second.
    state = result.waitUntilFinish(Duration.standardSeconds(1));
    if (state != null && state.isTerminal()) {
      if (PipelineResult.State.FAILED.equals(state)) {
        throw new RuntimeException("Pipeline failed for unknown reason");
      }
      break;
    }

    try {
      if (values.size() >= limitCount) {
        result.cancel();
        break;
      }
    } catch (IOException e) {
      LOG.warn(e.toString());
      break;
    }
  }

  return result;
}
 
Example 10
Source File: TrafficMaxLaneFlow.java    From beam with Apache License 2.0
public static void runTrafficMaxLaneFlow(TrafficMaxLaneFlowOptions options) throws IOException {
  // Using ExampleUtils to set up required resources.
  ExampleUtils exampleUtils = new ExampleUtils(options);
  exampleUtils.setup();

  Pipeline pipeline = Pipeline.create(options);
  TableReference tableRef = new TableReference();
  tableRef.setProjectId(options.getProject());
  tableRef.setDatasetId(options.getBigQueryDataset());
  tableRef.setTableId(options.getBigQueryTable());

  pipeline
      .apply("ReadLines", new ReadFileAndExtractTimestamps(options.getInputFile()))
      // row... => <station route, station speed> ...
      .apply(ParDo.of(new ExtractFlowInfoFn()))
      // map the incoming data stream into sliding windows.
      .apply(
          Window.into(
              SlidingWindows.of(Duration.standardMinutes(options.getWindowDuration()))
                  .every(Duration.standardMinutes(options.getWindowSlideEvery()))))
      .apply(new MaxLaneFlow())
      .apply(BigQueryIO.writeTableRows().to(tableRef).withSchema(FormatMaxesFn.getSchema()));

  // Run the pipeline.
  PipelineResult result = pipeline.run();

  // ExampleUtils will try to cancel the pipeline and the injector before the program exits.
  exampleUtils.waitToFinish(result);
}
 
Example 11
Source File: Task.java    From beam with Apache License 2.0
public static void main(String[] args) {
  PipelineOptions options = PipelineOptionsFactory.fromArgs(args).create();
  Pipeline pipeline = Pipeline.create(options);

  pipeline
      .apply(Create.of("1,2,3,4,5", "6,7,8,9,10"))
      .apply(new ExtractAndMultiplyNumbers())
      .apply(Log.ofElements());

  pipeline.run();
}
 
Example 12
Source File: BigtableToParquet.java    From DataflowTemplates with Apache License 2.0
/**
 * Runs a pipeline to export data from a Cloud Bigtable table to Parquet file(s) in GCS.
 *
 * @param options arguments to the pipeline
 */
public static PipelineResult run(Options options) {
  Pipeline pipeline = Pipeline.create(options);
  BigtableIO.Read read =
      BigtableIO.read()
          .withProjectId(options.getBigtableProjectId())
          .withInstanceId(options.getBigtableInstanceId())
          .withTableId(options.getBigtableTableId());

  // Do not validate input fields if it is running as a template.
  if (options.as(DataflowPipelineOptions.class).getTemplateLocation() != null) {
    read = read.withoutValidation();
  }

  /**
   * Steps:
   * 1) Read records from Bigtable.
   * 2) Convert a Bigtable Row to a GenericRecord.
   * 3) Write GenericRecord(s) to GCS in parquet format.
   */
  pipeline
      .apply("Read from Bigtable", read)
      .apply("Transform to Parquet", MapElements.via(new BigtableToParquetFn()))
      .setCoder(AvroCoder.of(GenericRecord.class, BigtableRow.getClassSchema()))
      .apply(
          "Write to Parquet in GCS",
          FileIO.<GenericRecord>write()
              .via(ParquetIO.sink(BigtableRow.getClassSchema()))
              .to(options.getOutputDirectory())
              .withPrefix(options.getFilenamePrefix())
              .withSuffix(".parquet")
              .withNumShards(options.getNumShards()));

  return pipeline.run();
}
 
Example 13
Source File: MapReduce.java    From nemo with Apache License 2.0
/**
 * Main function for the MR BEAM program.
 * @param args arguments.
 */
public static void main(final String[] args) {
  final String inputFilePath = args[0];
  final String outputFilePath = args[1];
  final PipelineOptions options = PipelineOptionsFactory.create().as(NemoPipelineOptions.class);
  options.setRunner(NemoPipelineRunner.class);
  options.setJobName("MapReduce");

  final Pipeline p = Pipeline.create(options);
  final PCollection<String> result = GenericSourceSink.read(p, inputFilePath)
      .apply(MapElements.<String, KV<String, Long>>via(new SimpleFunction<String, KV<String, Long>>() {
        @Override
        public KV<String, Long> apply(final String line) {
          final String[] words = line.split(" +");
          final String documentId = words[0] + "#" + words[1];
          final Long count = Long.parseLong(words[2]);
          return KV.of(documentId, count);
        }
      }))
      .apply(GroupByKey.<String, Long>create())
      .apply(Combine.<String, Long, Long>groupedValues(Sum.ofLongs()))
      .apply(MapElements.<KV<String, Long>, String>via(new SimpleFunction<KV<String, Long>, String>() {
        @Override
        public String apply(final KV<String, Long> kv) {
          return kv.getKey() + ": " + kv.getValue();
        }
      }));
  GenericSourceSink.write(result, outputFilePath);
  p.run();
}
 
Example 14
Source File: IdentifyPrivateVariants.java    From dataflow-java with Apache License 2.0
public static void main(String[] args) throws IOException, GeneralSecurityException {
  // Register the options so that they show up via --help
  PipelineOptionsFactory.register(Options.class);
  Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
  // Option validation is not yet automatic, so we make an explicit call here.
  Options.Methods.validateOptions(options);

  // Set up the prototype request and auth.
  StreamVariantsRequest prototype = StreamVariantsRequest.newBuilder(
      CallSetNamesOptions.Methods.getRequestPrototype(options))
      // In this case, we do not want responses containing a subset of calls; we want all of them.
      .clearCallSetIds()
      .build();
  OfflineAuth auth = GenomicsOptions.Methods.getGenomicsAuth(options);

  ImmutableSet<String> callSetIds = ImmutableSet.<String>builder()
      .addAll(CallSetNamesOptions.Methods.getCallSetIds(options))
      .build();
  LOG.info("The pipeline will identify and write to Cloud Storage variants "
      + "private to " + callSetIds.size() + " genomes with callSetIds: " + callSetIds);
  if (options.getIdentifyVariantsWithoutCalls()) {
    LOG.info("* The pipeline will also identify variants with no callsets. *");
  }

  List<StreamVariantsRequest> shardRequests =
      options.isAllReferences() ? ShardUtils.getVariantRequests(prototype,
          ShardUtils.SexChromosomeFilter.INCLUDE_XY, options.getBasesPerShard(), auth)
          : ShardUtils.getVariantRequests(prototype, options.getBasesPerShard(),
              options.getReferences());

  Pipeline p = Pipeline.create(options);
  PCollection<Variant> variants = p.begin()
      .apply(Create.of(shardRequests))
      .apply(new VariantStreamer(auth, ShardBoundary.Requirement.STRICT, VARIANT_FIELDS))
      .apply(ParDo.of(new PrivateVariantsFilterFn(callSetIds,
          options.getIdentifyVariantsWithoutCalls())));

  variants.apply("FormatResults", ParDo.of(new DoFn<Variant, String>() {
    @ProcessElement
    public void processElement(ProcessContext c) {
      Variant v = c.element();
      c.output(Joiner.on("\t").join(v.getId(),
          v.getReferenceName(),
          v.getStart(),
          v.getEnd(),
          v.getReferenceBases(),
          Joiner.on(",").join(v.getAlternateBasesList())
          ));
    }
  }))
  .apply(TextIO.write().to(options.getOutput()));

  p.run();
}
 
Example 15
Source File: CsvToElasticsearch.java    From DataflowTemplates with Apache License 2.0
/**
 * Runs the pipeline to completion with the specified options.
 *
 * @param options The execution options.
 * @return The pipeline result.
 */
private static PipelineResult run(CsvToElasticsearchOptions options) {
  // Create the pipeline
  Pipeline pipeline = Pipeline.create(options);

  // Register the coder for pipeline
  CoderRegistry coderRegistry = pipeline.getCoderRegistry();
  coderRegistry.registerCoderForType(
      FAILSAFE_ELEMENT_CODER.getEncodedTypeDescriptor(), FAILSAFE_ELEMENT_CODER);

  // Throw error if containsHeaders is true and a schema or Udf is also set.
  if (options.getContainsHeaders()) {
    checkArgument(
        options.getJavascriptTextTransformGcsPath() == null
            && options.getJsonSchemaPath() == null,
        "Cannot parse file containing headers with UDF or Json schema.");
  }

  // Throw error if only one retry configuration parameter is set.
  if (options.getMaxRetryAttempts() != null || options.getMaxRetryDuration() != null) {
    checkArgument(
        options.getMaxRetryAttempts() != null && options.getMaxRetryDuration() != null,
        "To specify retry configuration both max attempts and max duration must be set.");
  }

  /*
   * Steps: 1) Read records from CSV(s) via {@link CsvConverters.ReadCsv}.
   *        2) Convert lines to JSON strings via {@link CsvConverters.LineToFailsafeJson}.
   *        3a) Write JSON strings as documents to Elasticsearch via {@link ElasticsearchIO}.
   *        3b) Write elements that failed processing to {@link org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO}.
   */
  PCollectionTuple convertedCsvLines =
      pipeline
          /*
           * Step 1: Read CSV file(s) from Cloud Storage using {@link CsvConverters.ReadCsv}.
           */
          .apply(
              "ReadCsv",
              CsvConverters.ReadCsv.newBuilder()
                  .setCsvFormat(options.getCsvFormat())
                  .setDelimiter(options.getDelimiter())
                  .setHasHeaders(options.getContainsHeaders())
                  .setInputFileSpec(options.getInputFileSpec())
                  .setHeaderTag(CSV_HEADERS)
                  .setLineTag(CSV_LINES)
                  .build())
          /*
           * Step 2: Convert lines to Elasticsearch document.
           */
          .apply(
              "ConvertLine",
              CsvConverters.LineToFailsafeJson.newBuilder()
                  .setDelimiter(options.getDelimiter())
                  .setUdfFileSystemPath(options.getJavascriptTextTransformGcsPath())
                  .setUdfFunctionName(options.getJavascriptTextTransformFunctionName())
                  .setJsonSchemaPath(options.getJsonSchemaPath())
                  .setHeaderTag(CSV_HEADERS)
                  .setLineTag(CSV_LINES)
                  .setUdfOutputTag(PROCESSING_OUT)
                  .setUdfDeadletterTag(PROCESSING_DEADLETTER_OUT)
                  .build());
  /*
   * Step 3a: Write elements that were successfully processed to Elasticsearch using {@link WriteToElasticsearch}.
   */
  convertedCsvLines
      .get(PROCESSING_OUT)
      .apply(
          "GetJsonDocuments",
          MapElements.into(TypeDescriptors.strings()).via(FailsafeElement::getPayload))
      .apply(
          "WriteToElasticsearch",
          WriteToElasticsearch.newBuilder()
              .setOptions(options.as(WriteToElasticsearchOptions.class))
              .build());

  /*
   * Step 3b: Write elements that failed processing to deadletter table via {@link BigQueryIO}.
   */
  convertedCsvLines
      .get(PROCESSING_DEADLETTER_OUT)
      .apply(
          "AddTimestamps",
          WithTimestamps.of((FailsafeElement<String, String> failures) -> new Instant()))
      .apply(
          "WriteFailedElementsToBigQuery",
          WriteStringMessageErrors.newBuilder()
              .setErrorRecordsTable(options.getDeadletterTable())
              .setErrorRecordsTableSchema(SchemaUtils.DEADLETTER_SCHEMA)
              .build());

  return pipeline.run();
}
 
Example 16
Source File: ResumeFromCheckpointStreamingTest.java    From beam with Apache License 2.0
private SparkPipelineResult run(Optional<Instant> stopWatermarkOption, int expectedAssertions) {
  KafkaIO.Read<String, Instant> read =
      KafkaIO.<String, Instant>read()
          .withBootstrapServers(EMBEDDED_KAFKA_CLUSTER.getBrokerList())
          .withTopics(Collections.singletonList(TOPIC))
          .withKeyDeserializer(StringDeserializer.class)
          .withValueDeserializer(InstantDeserializer.class)
          .withConsumerConfigUpdates(ImmutableMap.of("auto.offset.reset", "earliest"))
          .withTimestampFn(KV::getValue)
          .withWatermarkFn(
              kv -> {
                // at EOF move WM to infinity.
                String key = kv.getKey();
                Instant instant = kv.getValue();
                return "EOF".equals(key) ? BoundedWindow.TIMESTAMP_MAX_VALUE : instant;
              });

  TestSparkPipelineOptions options =
      PipelineOptionsFactory.create().as(TestSparkPipelineOptions.class);
  options.setSparkMaster("local[*]");
  options.setCheckpointDurationMillis(options.getBatchIntervalMillis());
  options.setExpectedAssertions(expectedAssertions);
  options.setRunner(TestSparkRunner.class);
  options.setEnableSparkMetricSinks(false);
  options.setForceStreaming(true);
  options.setCheckpointDir(temporaryFolder.getRoot().getPath());
  // timeout is per execution so it can be injected by the caller.
  if (stopWatermarkOption.isPresent()) {
    options.setStopPipelineWatermark(stopWatermarkOption.get().getMillis());
  }

  Pipeline p = Pipeline.create(options);

  PCollection<String> expectedCol =
      p.apply(Create.of(ImmutableList.of("side1", "side2")).withCoder(StringUtf8Coder.of()));
  PCollectionView<List<String>> view = expectedCol.apply(View.asList());

  PCollection<KV<String, Instant>> kafkaStream = p.apply(read.withoutMetadata());

  PCollection<Iterable<String>> grouped =
      kafkaStream
          .apply(Keys.create())
          .apply("EOFShallNotPassFn", ParDo.of(new EOFShallNotPassFn(view)).withSideInputs(view))
          .apply(
              Window.<String>into(FixedWindows.of(Duration.millis(500)))
                  .triggering(AfterWatermark.pastEndOfWindow())
                  .accumulatingFiredPanes()
                  .withAllowedLateness(Duration.ZERO))
          .apply(WithKeys.of(1))
          .apply(GroupByKey.create())
          .apply(Values.create());

  grouped.apply(new PAssertWithoutFlatten<>("k1", "k2", "k3", "k4", "k5"));

  return (SparkPipelineResult) p.run();
}
 
Example 17
Source File: Task.java    From beam with Apache License 2.0
public static void main(String[] args) {
  PipelineOptions options = PipelineOptionsFactory.fromArgs(args).create();
  Pipeline pipeline = Pipeline.create(options);

  PCollection<Integer> numbers = pipeline.apply(Create.of(10, 50, 120, 20, 200, 0));

  TupleTag<Integer> numBelow100Tag = new TupleTag<Integer>() {};
  TupleTag<Integer> numAbove100Tag = new TupleTag<Integer>() {};

  PCollectionTuple outputTuple = applyTransform(numbers, numBelow100Tag, numAbove100Tag);

  outputTuple.get(numBelow100Tag).apply(Log.ofElements("Number <= 100: "));
  outputTuple.get(numAbove100Tag).apply(Log.ofElements("Number > 100: "));

  pipeline.run();
}
 
Example 18
Source File: WindowRuntimeTest.java    From components with Apache License 2.0
@Test
public void testSlidingWindow() {

    PipelineOptions options = PipelineOptionsFactory.create();
    options.setRunner(DirectRunner.class);
    final Pipeline p = Pipeline.create(options);

    // Create a PCollection<IndexedRecord> whose elements carry different timestamps.
    List<TimestampedValue<IndexedRecord>> data = Arrays.asList( //
            TimestampedValue.of(irA, new Instant(0L)), //
            TimestampedValue.of(irB, new Instant(0L)), //
            TimestampedValue.of(irC, new Instant(1L)), //
            TimestampedValue.of(irA, new Instant(2L)), //
            TimestampedValue.of(irA, new Instant(2L)), //
            TimestampedValue.of(irB, new Instant(2L)), //
            TimestampedValue.of(irB, new Instant(3L)), //
            TimestampedValue.of(irC, new Instant(3L)), //
            TimestampedValue.of(irA, new Instant(4L)));

    Create.TimestampedValues<IndexedRecord> pt = Create.timestamped(data);
    pt = (Create.TimestampedValues<IndexedRecord>) pt.withCoder(LazyAvroCoder.of());
    PCollection<IndexedRecord> input = p.apply(pt);

    WindowProperties windowProperties = new WindowProperties("window");
    windowProperties.setValue("windowLength", 4);
    windowProperties.setValue("windowSlideLength", 2);
    windowProperties.setValue("windowSession", false);

    WindowRuntime windowRun = new WindowRuntime();
    windowRun.initialize(null, windowProperties);

    PCollection<IndexedRecord> test = windowRun.expand(input);

    PCollection<KV<IndexedRecord, Long>> windowed_counts = test.apply(Count.<IndexedRecord> perElement());

    // window duration: 4 - sliding: 2
    PAssert.that(windowed_counts).containsInAnyOrder( //
            KV.of(irA, 1L), //
            KV.of(irA, 1L), //
            KV.of(irA, 3L), //
            KV.of(irA, 3L), //
            KV.of(irB, 1L), //
            KV.of(irB, 3L), //
            KV.of(irB, 2L), //
            KV.of(irC, 1L), //
            KV.of(irC, 1L), //
            KV.of(irC, 2L));
    p.run();
}
 
Example 19
Source File: PubSubToMongoDB.java    From DataflowTemplates with Apache License 2.0
/**
 * Runs the pipeline with the supplied options.
 *
 * @param options The execution parameters to the pipeline.
 * @return The result of the pipeline execution.
 */
public static PipelineResult run(Options options) {

  // Create the pipeline
  Pipeline pipeline = Pipeline.create(options);

  // Register the coders for pipeline
  CoderRegistry coderRegistry = pipeline.getCoderRegistry();

  coderRegistry.registerCoderForType(
          FAILSAFE_ELEMENT_CODER.getEncodedTypeDescriptor(), FAILSAFE_ELEMENT_CODER);

  coderRegistry.registerCoderForType(CODER.getEncodedTypeDescriptor(), CODER);

  /*
   * Steps: 1) Read PubSubMessage with attributes from input PubSub subscription.
   *        2) Apply Javascript UDF if provided.
   *        3) Write to MongoDB
   *
   */

  LOG.info("Reading from subscription: " + options.getInputSubscription());

  PCollectionTuple convertedPubsubMessages =
      pipeline
          /*
           * Step #1: Read from a PubSub subscription.
           */
          .apply(
              "Read PubSub Subscription",
              PubsubIO.readMessagesWithAttributes()
                  .fromSubscription(options.getInputSubscription()))
          /*
           * Step #2: Apply the Javascript UDF, if provided, and transform
           *          the PubsubMessages into Json documents.
           */
          .apply(
              "Apply Javascript UDF",
              PubSubMessageToJsonDocument.newBuilder()
                  .setJavascriptTextTransformFunctionName(
                      options.getJavascriptTextTransformFunctionName())
                  .setJavascriptTextTransformGcsPath(options.getJavascriptTextTransformGcsPath())
                  .build());

  /*
   * Step #3a: Write Json documents into MongoDB using {@link MongoDbIO.write}.
   */
  convertedPubsubMessages
      .get(TRANSFORM_OUT)
      .apply(
          "Get Json Documents",
          MapElements.into(TypeDescriptors.strings()).via(FailsafeElement::getPayload))
      .apply("Parse as BSON Document", ParDo.of(new ParseAsDocumentsFn()))
      .apply(
          "Put to MongoDB",
          MongoDbIO.write()
              .withBatchSize(options.getBatchSize())
              .withUri(String.format("mongodb://%s", options.getMongoDBUri()))
              .withDatabase(options.getDatabase())
              .withCollection(options.getCollection())
              .withIgnoreSSLCertificate(options.getIgnoreSSLCertificate())
              .withMaxConnectionIdleTime(options.getMaxConnectionIdleTime())
              .withOrdered(options.getWithOrdered())
              .withSSLEnabled(options.getSslEnabled())
              .withSSLInvalidHostNameAllowed(options.getWithSSLInvalidHostNameAllowed()));

  /*
   * Step 3b: Write elements that failed processing to deadletter table via {@link BigQueryIO}.
   */
  convertedPubsubMessages
      .get(TRANSFORM_DEADLETTER_OUT)
      .apply(
          "Write Transform Failures To BigQuery",
          ErrorConverters.WritePubsubMessageErrors.newBuilder()
              .setErrorRecordsTable(options.getDeadletterTable())
              .setErrorRecordsTableSchema(SchemaUtils.DEADLETTER_SCHEMA)
              .build());

  // Execute the pipeline and return the result.
  return pipeline.run();
}
 
Example 20
Source File: Task.java    From beam with Apache License 2.0
public static void main(String[] args) {
  PipelineOptions options = PipelineOptionsFactory.fromArgs(args).create();
  Pipeline pipeline = Pipeline.create(options);

  PCollection<String> sentences =
      pipeline.apply(Create.of("Apache Beam", "Unified Batch and Streaming"));

  PCollection<String> output = applyTransform(sentences);

  output.apply(Log.ofElements());

  pipeline.run();
}