Java Code Examples for org.apache.beam.sdk.Pipeline

The following examples show how to use org.apache.beam.sdk.Pipeline. They are extracted from open source projects; the source project and source file for each example are listed above its code.
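Before the extracted examples, here is a minimal, self-contained sketch of the typical Pipeline lifecycle (build options, construct the graph with apply, then run). The class name and the elements passed to Create are illustrative only.

import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.options.PipelineOptions;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.transforms.Create;

public class MinimalPipelineExample {
  public static void main(String[] args) {
    // Parse command-line arguments into PipelineOptions (defaults to the direct runner).
    PipelineOptions options = PipelineOptionsFactory.fromArgs(args).create();

    // Build the pipeline graph: Pipeline.create is the entry point, apply adds transforms.
    Pipeline pipeline = Pipeline.create(options);
    pipeline.apply("CreateElements", Create.of("a", "b", "c"));

    // Execute the graph and block until the run finishes.
    pipeline.run().waitUntilFinish();
  }
}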
Example 1
Source Project: beam   Source File: DirectRunnerTest.java    License: Apache License 2.0
/**
 * Tests that a {@link DoFn} that mutates an output with a good equals() fails in the {@link
 * DirectRunner}.
 */
@Test
public void testMutatingOutputThenTerminateDoFnError() throws Exception {
  Pipeline pipeline = getPipeline();

  pipeline
      .apply(Create.of(42))
      .apply(
          ParDo.of(
              new DoFn<Integer, List<Integer>>() {
                @ProcessElement
                public void processElement(ProcessContext c) {
                  List<Integer> outputList = Arrays.asList(1, 2, 3, 4);
                  c.output(outputList);
                  outputList.set(0, 37);
                }
              }));

  thrown.expect(IllegalMutationException.class);
  thrown.expectMessage("output");
  thrown.expectMessage("must not be mutated");
  pipeline.run();
}
 
Example 2
Source Project: beam   Source File: DataflowPTransformMatchersTest.java    License: Apache License 2.0
/** Traverse the pipeline and return the first {@link Combine.GroupedValues} found. */
private static AppliedPTransform<?, ?, ?> getCombineGroupedValuesFrom(TestPipeline pipeline) {
  final AppliedPTransform<?, ?, ?>[] transform = new AppliedPTransform<?, ?, ?>[1];
  pipeline.traverseTopologically(
      new Pipeline.PipelineVisitor.Defaults() {
        @Override
        public CompositeBehavior enterCompositeTransform(TransformHierarchy.Node node) {
          if (!node.isRootNode()
              && node.toAppliedPTransform(getPipeline())
                  .getTransform()
                  .getClass()
                  .equals(Combine.GroupedValues.class)) {
            transform[0] = node.toAppliedPTransform(getPipeline());
            return CompositeBehavior.DO_NOT_ENTER_TRANSFORM;
          }
          return CompositeBehavior.ENTER_TRANSFORM;
        }
      });
  return transform[0];
}
 
Example 3
Source Project: beam   Source File: Task.java    License: Apache License 2.0
public static void main(String[] args) {
  PipelineOptions options = PipelineOptionsFactory.fromArgs(args).create();
  Pipeline pipeline = Pipeline.create(options);

  PCollection<String> fruits =
      pipeline.apply("Fruits",
          Create.of("apple", "banana", "cherry")
      );

  PCollection<String> countries =
      pipeline.apply("Countries",
          Create.of("australia", "brazil", "canada")
      );

  PCollection<String> output = applyTransform(fruits, countries);

  output.apply(Log.ofElements());

  pipeline.run();
}
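The applyTransform method called above is defined elsewhere in the project (this snippet is from a Beam katas-style Task.java). A plausible implementation, assuming the task is to merge the two inputs, would flatten them with Flatten.pCollections(); the sketch below is an assumption, not the project's actual code.

// Hypothetical helper; requires org.apache.beam.sdk.transforms.Flatten
// and org.apache.beam.sdk.values.PCollectionList.
static PCollection<String> applyTransform(PCollection<String> fruits, PCollection<String> countries) {
  // Merge the two PCollections into one by flattening a PCollectionList.
  return PCollectionList.of(fruits).and(countries).apply(Flatten.pCollections());
}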
 
Example 4
Source Project: beam   Source File: HadoopFormatIOWriteTest.java    License: Apache License 2.0
@Test
public void testWritingDataFailInvalidKeyType() {

  conf.set(HadoopFormatIO.OUTPUT_DIR, tmpFolder.getRoot().getAbsolutePath());
  List<KV<String, Employee>> data = new ArrayList<>();
  data.add(KV.of("key", new Employee("name", "address")));
  PCollection<KV<String, Employee>> input =
      p.apply("CreateData", Create.of(data))
          .setTypeDescriptor(
              TypeDescriptors.kvs(
                  new TypeDescriptor<String>() {}, new TypeDescriptor<Employee>() {}));

  thrown.expect(Pipeline.PipelineExecutionException.class);
  thrown.expectMessage(String.class.getName());

  input.apply(
      "Write",
      HadoopFormatIO.<String, Employee>write()
          .withConfiguration(conf)
          .withPartitioning()
          .withExternalSynchronization(new HDFSSynchronization(getLocksDirPath())));
  p.run().waitUntilFinish();
}
 
Example 5
Source Project: beam   Source File: BeamEnumerableConverter.java    License: Apache License 2.0
private static boolean containsUnboundedPCollection(Pipeline p) {
  class BoundednessVisitor extends PipelineVisitor.Defaults {
    IsBounded boundedness = IsBounded.BOUNDED;

    @Override
    public void visitValue(PValue value, Node producer) {
      if (value instanceof PCollection) {
        boundedness = boundedness.and(((PCollection) value).isBounded());
      }
    }
  }

  BoundednessVisitor visitor = new BoundednessVisitor();
  p.traverseTopologically(visitor);
  return visitor.boundedness == IsBounded.UNBOUNDED;
}
 
Example 6
Source Project: beam   Source File: CacheTest.java    License: Apache License 2.0
@Test
public void shouldCacheTest() {
  SparkPipelineOptions options = createOptions();
  options.setCacheDisabled(true);
  Pipeline pipeline = Pipeline.create(options);

  Values<String> valuesTransform = Create.of("foo", "bar");
  PCollection pCollection = mock(PCollection.class);

  JavaSparkContext jsc = SparkContextFactory.getSparkContext(options);
  EvaluationContext ctxt = new EvaluationContext(jsc, pipeline, options);
  ctxt.getCacheCandidates().put(pCollection, 2L);

  assertFalse(ctxt.shouldCache(valuesTransform, pCollection));

  options.setCacheDisabled(false);
  assertTrue(ctxt.shouldCache(valuesTransform, pCollection));

  GroupByKey<String, String> gbkTransform = GroupByKey.create();
  assertFalse(ctxt.shouldCache(gbkTransform, pCollection));
}
 
Example 7
Source Project: beam   Source File: DataflowPipelineTranslatorTest.java    License: Apache License 2.0
@Test
public void testNetworkConfigMissing() throws IOException {
  DataflowPipelineOptions options = buildPipelineOptions();

  Pipeline p = buildPipeline(options);
  p.traverseTopologically(new RecordingPipelineVisitor());
  SdkComponents sdkComponents = createSdkComponents(options);
  RunnerApi.Pipeline pipelineProto = PipelineTranslation.toProto(p, sdkComponents, true);
  Job job =
      DataflowPipelineTranslator.fromOptions(options)
          .translate(
              p,
              pipelineProto,
              sdkComponents,
              DataflowRunner.fromOptions(options),
              Collections.emptyList())
          .getJob();

  assertEquals(1, job.getEnvironment().getWorkerPools().size());
  assertNull(job.getEnvironment().getWorkerPools().get(0).getNetwork());
}
 
Example 8
Source Project: components   Source File: BigQueryDatasetRuntime.java    License: Apache License 2.0
public void getSampleDeprecated(int limit, Consumer<IndexedRecord> consumer) {
    // Create a pipeline using the input component to get records.
    DirectOptions options = BeamLocalRunnerOption.getOptions();
    final Pipeline p = Pipeline.create(options);

    // Create an input runtime based on the properties.
    BigQueryInputRuntime inputRuntime = new BigQueryInputRuntime();
    BigQueryInputProperties inputProperties = new BigQueryInputProperties(null);
    inputProperties.init();
    inputProperties.setDatasetProperties(properties);
    inputRuntime.initialize(new BeamJobRuntimeContainer(options), inputProperties);

    try (DirectConsumerCollector<IndexedRecord> collector = DirectConsumerCollector.of(consumer)) {
        // Collect a sample of the input records.
        p
                .apply(inputRuntime) //
                .apply(Sample.<IndexedRecord> any(limit))
                .apply(collector);
        PipelineResult pr = p.run();
        pr.waitUntilFinish();
    }
}
 
Example 9
Source Project: beam   Source File: BigQueryTimePartitioningClusteringIT.java    License: Apache License 2.0
@Test
public void testE2EBigQueryTimePartitioning() throws Exception {
  String tableName = "weather_stations_time_partitioned_" + System.currentTimeMillis();

  Pipeline p = Pipeline.create(options);

  p.apply(BigQueryIO.readTableRows().from(options.getBqcInput()))
      .apply(ParDo.of(new KeepStationNumberAndConvertDate()))
      .apply(
          BigQueryIO.writeTableRows()
              .to(String.format("%s.%s", DATASET_NAME, tableName))
              .withTimePartitioning(TIME_PARTITIONING)
              .withSchema(SCHEMA)
              .withCreateDisposition(BigQueryIO.Write.CreateDisposition.CREATE_IF_NEEDED)
              .withWriteDisposition(BigQueryIO.Write.WriteDisposition.WRITE_TRUNCATE));

  p.run().waitUntilFinish();

  bqClient = BigqueryClient.getNewBigquerryClient(options.getAppName());
  Table table = bqClient.tables().get(options.getProject(), DATASET_NAME, tableName).execute();

  Assert.assertEquals(table.getTimePartitioning(), TIME_PARTITIONING);
}
 
Example 10
Source Project: DataflowTemplates   Source File: DatastoreToText.java    License: Apache License 2.0
/**
 * Runs a pipeline which reads in Entities from Datastore, passes in the JSON encoded Entities
 * to a Javascript UDF, and writes the JSON to TextIO sink.
 *
 * @param args arguments to the pipeline
 */
public static void main(String[] args) {
  DatastoreToTextOptions options = PipelineOptionsFactory.fromArgs(args)
      .withValidation()
      .as(DatastoreToTextOptions.class);

  Pipeline pipeline = Pipeline.create(options);

  pipeline
      .apply(ReadJsonEntities.newBuilder()
          .setGqlQuery(options.getDatastoreReadGqlQuery())
          .setProjectId(options.getDatastoreReadProjectId())
          .setNamespace(options.getDatastoreReadNamespace())
          .build())
      .apply(TransformTextViaJavascript.newBuilder()
          .setFileSystemPath(options.getJavascriptTextTransformGcsPath())
          .setFunctionName(options.getJavascriptTextTransformFunctionName())
          .build())
      .apply(TextIO.write()
          .to(options.getTextWritePrefix())
          .withSuffix(".json"));

  pipeline.run();
}
 
Example 11
Source Project: beam   Source File: BatchLoads.java    License: Apache License 2.0
private PCollectionView<String> createLoadJobIdPrefixView(Pipeline p) {
  // Create a singleton job ID token at execution time. This will be used as the base for all
  // load jobs issued from this instance of the transform.
  return p.apply("JobIdCreationRoot", Create.of((Void) null))
      .apply(
          "CreateJobId",
          ParDo.of(
              new DoFn<Void, String>() {
                @ProcessElement
                public void process(ProcessContext c) {
                  c.output(
                      String.format(
                          "beam_load_%s_%s",
                          c.getPipelineOptions().getJobName().replaceAll("-", ""),
                          BigQueryHelpers.randomUUIDString()));
                }
              }))
      .apply(View.asSingleton());
}
 
Example 12
Source Project: beam   Source File: Task.java    License: Apache License 2.0
public static void main(String[] args) {
  PipelineOptions options = PipelineOptionsFactory.fromArgs(args).create();
  Pipeline pipeline = Pipeline.create(options);

  PCollection<String> wordsStartingWithA =
      pipeline.apply("Words starting with A",
          Create.of("apple", "ant", "arrow")
      );

  PCollection<String> wordsStartingWithB =
      pipeline.apply("Words starting with B",
          Create.of("ball", "book", "bow")
      );

  PCollection<String> output = applyTransform(wordsStartingWithA, wordsStartingWithB);

  output.apply(Log.ofElements());

  pipeline.run();
}
 
Example 13
Source Project: components   Source File: ElasticsearchDatasetRuntime.java    License: Apache License 2.0
@Override
public void getSample(int limit, Consumer<IndexedRecord> consumer) {
    // Create an input runtime based on the properties: ensure to read only the first batch of documents
    // from the index since we're computing a sample
    ElasticsearchInputRuntime inputRuntime = new ElasticsearchInputRuntime(true);
    ElasticsearchInputProperties inputProperties = new ElasticsearchInputProperties(null);
    inputProperties.init();
    inputProperties.setDatasetProperties(properties);
    inputRuntime.initialize(null, inputProperties);

    DirectOptions options = BeamLocalRunnerOption.getOptions();
    final Pipeline p = Pipeline.create(options);

    try (DirectConsumerCollector<IndexedRecord> collector = DirectConsumerCollector.of(consumer)) {
        // Collect a sample of the input records.
        p.apply(inputRuntime) //
                .apply(Sample.<IndexedRecord> any(limit)).apply(collector);
        p.run().waitUntilFinish();
    }
}
 
Example 14
Source Project: beam   Source File: QueryablePipelineTest.java    License: Apache License 2.0
@Test
public void getEnvironmentWithEnvironment() {
  Pipeline p = Pipeline.create();
  PCollection<Long> longs = p.apply("BoundedRead", Read.from(CountingSource.upTo(100L)));
  longs.apply(WithKeys.of("a")).apply("groupByKey", GroupByKey.create());

  Components components = PipelineTranslation.toProto(p).getComponents();
  QueryablePipeline qp = QueryablePipeline.forPrimitivesIn(components);

  PTransformNode environmentalRead =
      PipelineNode.pTransform("BoundedRead", components.getTransformsOrThrow("BoundedRead"));
  PTransformNode nonEnvironmentalTransform =
      PipelineNode.pTransform("groupByKey", components.getTransformsOrThrow("groupByKey"));

  assertThat(qp.getEnvironment(environmentalRead).isPresent(), is(true));
  assertThat(
      qp.getEnvironment(environmentalRead).get().getUrn(),
      equalTo(Environments.JAVA_SDK_HARNESS_ENVIRONMENT.getUrn()));
  assertThat(
      qp.getEnvironment(environmentalRead).get().getPayload(),
      equalTo(Environments.JAVA_SDK_HARNESS_ENVIRONMENT.getPayload()));
  assertThat(qp.getEnvironment(nonEnvironmentalTransform).isPresent(), is(false));
}
 
Example 15
Source Project: beam   Source File: DataflowRunner.java    License: Apache License 2.0
private boolean containsUnboundedPCollection(Pipeline p) {
  class BoundednessVisitor extends PipelineVisitor.Defaults {
    IsBounded boundedness = IsBounded.BOUNDED;

    @Override
    public void visitValue(PValue value, Node producer) {
      if (value instanceof PCollection) {
        boundedness = boundedness.and(((PCollection) value).isBounded());
      }
    }
  }

  BoundednessVisitor visitor = new BoundednessVisitor();
  p.traverseTopologically(visitor);
  return visitor.boundedness == IsBounded.UNBOUNDED;
}
 
Example 16
@Test
public void shouldUseTransformOverrides() {
  boolean[] testParameters = {true, false};
  for (boolean streaming : testParameters) {
    FlinkPipelineOptions options = PipelineOptionsFactory.as(FlinkPipelineOptions.class);
    options.setStreaming(streaming);
    options.setRunner(FlinkRunner.class);
    FlinkPipelineExecutionEnvironment flinkEnv = new FlinkPipelineExecutionEnvironment(options);
    Pipeline p = Mockito.spy(Pipeline.create(options));

    flinkEnv.translate(p);

    ArgumentCaptor<ImmutableList> captor = ArgumentCaptor.forClass(ImmutableList.class);
    Mockito.verify(p).replaceAll(captor.capture());
    ImmutableList<PTransformOverride> overridesList = captor.getValue();

    assertThat(overridesList.isEmpty(), is(false));
    assertThat(
        overridesList.size(), is(FlinkTransformOverrides.getDefaultOverrides(options).size()));
  }
}
 
Example 17
Source Project: beam   Source File: DirectRunnerTest.java    License: Apache License 2.0
/**
 * Tests that a {@link DoFn} that mutates its input with a good equals() fails in the {@link
 * DirectRunner}.
 */
@Test
public void testMutatingInputDoFnError() throws Exception {
  Pipeline pipeline = getPipeline();

  pipeline
      .apply(
          Create.of(Arrays.asList(1, 2, 3), Arrays.asList(4, 5, 6))
              .withCoder(ListCoder.of(VarIntCoder.of())))
      .apply(
          ParDo.of(
              new DoFn<List<Integer>, Integer>() {
                @ProcessElement
                public void processElement(ProcessContext c) {
                  List<Integer> inputList = c.element();
                  inputList.set(0, 37);
                  c.output(12);
                }
              }));

  thrown.expect(IllegalMutationException.class);
  thrown.expectMessage("Input");
  thrown.expectMessage("must not be mutated");
  pipeline.run();
}
 
Example 18
Source Project: beam   Source File: ImpulseEvaluatorFactoryTest.java    License: Apache License 2.0
@Test
public void testRootProvider() {
  Pipeline p = Pipeline.create();
  PCollection<byte[]> impulseOut = p.apply(Impulse.create());
  // Add a second impulse to demonstrate no crosstalk between applications
  @SuppressWarnings("unused")
  PCollection<byte[]> impulseOutTwo = p.apply(Impulse.create());
  AppliedPTransform<?, ?, ?> impulseApplication = DirectGraphs.getProducer(impulseOut);

  ImpulseRootProvider rootProvider = new ImpulseRootProvider(context);
  when(context.createRootBundle()).thenReturn(bundleFactory.createRootBundle());

  Collection<CommittedBundle<?>> inputs =
      rootProvider.getInitialInputs((AppliedPTransform) impulseApplication, 100);

  assertThat("Only one impulse bundle per application", inputs, hasSize(1));
  assertThat(
      "Only one impulse shard per bundle",
      Iterables.size(inputs.iterator().next().getElements()),
      equalTo(1));
}
 
Example 19
Source Project: beam   Source File: TestDataflowRunnerTest.java    License: Apache License 2.0
/**
 * Tests that when a streaming pipeline terminates and doesn't fail due to {@link PAssert} that
 * the {@link TestPipelineOptions#setOnSuccessMatcher(SerializableMatcher) on success matcher} is
 * invoked.
 */
@Test
public void testStreamingOnSuccessMatcherWhenPipelineSucceeds() throws Exception {
  options.setStreaming(true);
  Pipeline p = TestPipeline.create(options);
  PCollection<Integer> pc = p.apply(Create.of(1, 2, 3));
  PAssert.that(pc).containsInAnyOrder(1, 2, 3);

  final DataflowPipelineJob mockJob = Mockito.mock(DataflowPipelineJob.class);
  when(mockJob.getState()).thenReturn(State.DONE);
  when(mockJob.getProjectId()).thenReturn("test-project");
  when(mockJob.getJobId()).thenReturn("test-job");

  DataflowRunner mockRunner = Mockito.mock(DataflowRunner.class);
  when(mockRunner.run(any(Pipeline.class))).thenReturn(mockJob);

  TestDataflowRunner runner = TestDataflowRunner.fromOptionsAndClient(options, mockClient);
  options.as(TestPipelineOptions.class).setOnSuccessMatcher(new TestSuccessMatcher(mockJob, 1));

  when(mockJob.waitUntilFinish(any(Duration.class), any(JobMessagesHandler.class)))
      .thenReturn(State.DONE);

  when(mockClient.getJobMetrics(anyString()))
      .thenReturn(generateMockMetricResponse(true /* success */, true /* tentative */));
  runner.run(p, mockRunner);
}
 
Example 20
Source Project: beam   Source File: UnionTest.java    License: Apache License 2.0
@Test
public void testUnion_threeDataSets() {
  execute(
      new TestCase<Integer>() {

        @Override
        public PCollection<Integer> getOutput(Pipeline pipeline) {
          final PCollection<Integer> first = createDataset(pipeline, 1, 2, 3, 4, 5, 6);
          final PCollection<Integer> second = createDataset(pipeline, 7, 8, 9, 10, 11, 12);
          final PCollection<Integer> third = createDataset(pipeline, 13, 14, 15, 16, 17, 18);
          return Union.of(first, second, third).output();
        }

        @Override
        public List<Integer> getUnorderedOutput() {
          return Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18);
        }
      });
}
 
Example 21
Source Project: DataflowTemplates   Source File: FileFormatConversion.java    License: Apache License 2.0
/**
 * Runs the pipeline to completion with the specified options.
 *
 * @param options The execution options.
 * @return The pipeline result.
 * @throws RuntimeException thrown if incorrect file formats are passed.
 */
public static PipelineResult run(FileFormatConversionOptions options) {
  String inputFileFormat = options.getInputFileFormat().toUpperCase();
  String outputFileFormat = options.getOutputFileFormat().toUpperCase();

  validFileFormats.put(ValidFileFormats.CSV, "CSV");
  validFileFormats.put(ValidFileFormats.AVRO, "AVRO");
  validFileFormats.put(ValidFileFormats.PARQUET, "PARQUET");

  try {
    if (inputFileFormat.equals(outputFileFormat)) {
      LOG.error("Input and output file format cannot be the same.");
      throw new IOException();
    }
    if (!validFileFormats.containsValue(inputFileFormat)
        || !validFileFormats.containsValue(outputFileFormat)) {
      LOG.error("Invalid input or output file format.");
      throw new IOException();
    }
  } catch (IOException e) {
    throw new RuntimeException("Provide correct input/output file format.");
  }

  // Create the pipeline
  Pipeline pipeline = Pipeline.create(options);

  pipeline.apply(
      inputFileFormat + " to " + outputFileFormat,
      FileFormatConversionFactory.FileFormat.newBuilder()
          .setOptions(options)
          .setInputFileFormat(inputFileFormat)
          .setOutputFileFormat(outputFileFormat)
          .build());

  return pipeline.run();
}
 
Example 22
Source Project: hop   Source File: BeamTimestampStepHandler.java    License: Apache License 2.0
@Override public void handleStep( ILogChannel log, TransformMeta transformMeta, Map<String, PCollection<HopRow>> stepCollectionMap,
                                  Pipeline pipeline, IRowMeta rowMeta, List<TransformMeta> previousSteps,
                                  PCollection<HopRow> input ) throws HopException {

  BeamTimestampMeta beamTimestampMeta = (BeamTimestampMeta) transformMeta.getTransform();

  if ( !beamTimestampMeta.isReadingTimestamp() && StringUtils.isNotEmpty( beamTimestampMeta.getFieldName() ) ) {
    if ( rowMeta.searchValueMeta( beamTimestampMeta.getFieldName() ) == null ) {
      throw new HopException( "Please specify a valid field name '" + transformMeta.getName() + "'" );
    }
  }

  PCollection<HopRow> stepPCollection = input.apply( ParDo.of(
    new TimestampFn(
      transformMeta.getName(),
      JsonRowMeta.toJson( rowMeta ),
      pipelineMeta.environmentSubstitute( beamTimestampMeta.getFieldName() ),
      beamTimestampMeta.isReadingTimestamp(),
      transformPluginClasses,
      xpPluginClasses
    ) ) );


  // Save this in the map
  //
  stepCollectionMap.put( transformMeta.getName(), stepPCollection );
  log.logBasic( "Handled transform (TIMESTAMP) : " + transformMeta.getName() + ", gets data from " + previousSteps.size() + " previous transform(s)" );
}
 
Example 23
Source Project: beam   Source File: DatastoreV1Test.java    License: Apache License 2.0
/**
 * Test to ensure that {@link ValueProvider} values are not accessed at pipeline construction time
 * when built with {@link DatastoreV1.Read#withQuery(Query)}.
 */
@Test
public void testRuntimeOptionsNotCalledInApplyQuery() {
  RuntimeTestOptions options = PipelineOptionsFactory.as(RuntimeTestOptions.class);
  Pipeline pipeline = TestPipeline.create(options);
  pipeline
      .apply(
          DatastoreIO.v1()
              .read()
              .withProjectId(options.getDatastoreProject())
              .withQuery(QUERY)
              .withNamespace(options.getNamespace()))
      .apply(DatastoreIO.v1().write().withProjectId(options.getDatastoreProject()));
}
 
Example 24
Source Project: cloud-bigtable-examples   Source File: PubsubWordCount.java    License: Apache License 2.0
/**
 * <p>Creates a dataflow pipeline that creates the following chain:</p>
 * <ol>
 *   <li> Reads from a Cloud Pubsub topic
 *   <li> Window into fixed windows of 1 minute
 *   <li> Applies word count transform
 *   <li> Creates Puts from each of the word counts in the array
 *   <li> Performs a Bigtable Put on the items
 * </ol>
 *
 * @param args Arguments to use to configure the Dataflow Pipeline.  The first three are required
 *   when running via managed resource in Google Cloud Platform.  Those options should be omitted
 *   for LOCAL runs.  The next four arguments are to configure the Bigtable connection. The last
 *   two items are for Cloud Pubsub.
 *        --runner=BlockingDataflowPipelineRunner
 *        --project=[dataflow project] \\
 *        --stagingLocation=gs://[your google storage bucket] \\
 *        --bigtableProjectId=[bigtable project] \\
 *        --bigtableInstanceId=[bigtable instance id] \\
 *        --bigtableTableId=[bigtable tableName]
 *        --inputFile=[file path on GCS]
 *        --pubsubTopic=projects/[project name]/topics/[topic name]
 */

public static void main(String[] args) throws Exception {
  // CloudBigtableOptions is one way to retrieve the options.  It's not required.
  BigtablePubsubOptions options =
      PipelineOptionsFactory.fromArgs(args).withValidation().as(BigtablePubsubOptions.class);

  // CloudBigtableTableConfiguration contains the project, instance and table to connect to.
  CloudBigtableTableConfiguration config =
      new CloudBigtableTableConfiguration.Builder()
      .withProjectId(options.getBigtableProjectId())
      .withInstanceId(options.getBigtableInstanceId())
      .withTableId(options.getBigtableTableId())
      .build();

  // In order to cancel the pipelines automatically,
  // DataflowPipelineRunner is forced to be used.
  // Also enables the 2 jobs to run at the same time.
  options.setRunner(DataflowRunner.class);

  options.as(DataflowPipelineOptions.class).setStreaming(true);
  Pipeline p = Pipeline.create(options);

  FixedWindows window = FixedWindows.of(Duration.standardMinutes(options.getWindowSize()));

  p
      .apply(PubsubIO.readStrings().fromTopic(options.getPubsubTopic()))
      .apply(Window.<String> into(window))
      .apply(ParDo.of(new ExtractWordsFn()))
      .apply(Count.<String> perElement())
      .apply(ParDo.of(MUTATION_TRANSFORM))
      .apply(CloudBigtableIO.writeToTable(config));

  p.run().waitUntilFinish();
  // Start a second job to inject messages into a Cloud Pubsub topic
  injectMessages(options);
}
 
Example 25
Source Project: beam   Source File: JoinExamples.java    License: Apache License 2.0
public static void main(String[] args) throws Exception {
  Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
  Pipeline p = Pipeline.create(options);
  // the following two 'applys' create multiple inputs to our pipeline, one for each
  // of our two input sources.
  PCollection<TableRow> eventsTable =
      p.apply(BigQueryIO.readTableRows().from(GDELT_EVENTS_TABLE));
  PCollection<TableRow> countryCodes = p.apply(BigQueryIO.readTableRows().from(COUNTRY_CODES));
  PCollection<String> formattedResults = joinEvents(eventsTable, countryCodes);
  formattedResults.apply(TextIO.write().to(options.getOutput()));
  p.run().waitUntilFinish();
}
 
Example 26
public static void main(String[] args) throws IOException, GeneralSecurityException {

    TokenizePipelineOptions options =
        PipelineOptionsFactory.fromArgs(args).withValidation().as(TokenizePipelineOptions.class);

    Pipeline p = Pipeline.create(options);
    p.apply(
            FileIO.match()
                .filepattern(options.getInputFile())
                .continuously(
                    Duration.standardSeconds(options.getPollingInterval()), Watch.Growth.never()))
        .apply(FileIO.readMatches().withCompression(Compression.UNCOMPRESSED))
        .apply(
            "Text File Reader",
            ParDo.of(
                new TextFileReader(
                    options.as(GcpOptions.class).getProject(),
                    options.getFileDecryptKeyName(),
                    options.getFileDecryptKey(),
                    options.getBatchSize(),
                    options.getCsek(),
                    options.getCsekhash())))
        .apply(
            "Tokenize Data",
            ParDo.of(
                new TokenizeData(
                    options.as(GcpOptions.class).getProject(),
                    options.getDeidentifyTemplateName(),
                    options.getInspectTemplateName())))
        .apply(
            Window.<String>into(FixedWindows.of(Duration.standardMinutes(options.getInterval()))))
        .apply(new WriteOneFilePerWindow(options.getOutputFile(), 1));

    p.run();
  }
 
Example 27
Source Project: beam   Source File: ParDoTest.java    License: Apache License 2.0
@BeforeClass
public static void beforeClass() {
  SparkStructuredStreamingPipelineOptions options =
      PipelineOptionsFactory.create().as(SparkStructuredStreamingPipelineOptions.class);
  options.setRunner(SparkStructuredStreamingRunner.class);
  options.setTestMode(true);
  pipeline = Pipeline.create(options);
}
 
Example 28
Source Project: beam   Source File: DataflowRunnerTest.java    License: Apache License 2.0
@Test
public void testGcsUploadBufferSizeIsUnsetForBatchWhenDefault() throws IOException {
  DataflowPipelineOptions batchOptions = buildPipelineOptions();
  batchOptions.setRunner(DataflowRunner.class);
  Pipeline.create(batchOptions);
  assertNull(batchOptions.getGcsUploadBufferSizeBytes());
}
 
Example 29
Source Project: beam   Source File: PCollectionListTest.java    License: Apache License 2.0
@Test
public void testExpandWithDuplicates() {
  Pipeline p = TestPipeline.create();
  PCollection<Long> createOne = p.apply("CreateOne", Create.of(1L, 2L, 3L));

  PCollectionList<Long> list = PCollectionList.of(createOne).and(createOne).and(createOne);
  assertThat(list.expand().values(), containsInAnyOrder(createOne, createOne, createOne));
}
 
Example 30
Source Project: kettle-beam   Source File: BeamBigQueryInputStepHandler.java    License: Apache License 2.0
@Override public void handleStep( LogChannelInterface log, StepMeta stepMeta, Map<String, PCollection<KettleRow>> stepCollectionMap,
                                  Pipeline pipeline, RowMetaInterface rowMeta, List<StepMeta> previousSteps,
                                  PCollection<KettleRow> input ) throws KettleException {

  // Input handling
  //
  BeamBQInputMeta beamInputMeta = (BeamBQInputMeta) stepMeta.getStepMetaInterface();

  // Output rows (fields selection)
  //
  RowMetaInterface outputRowMeta = new RowMeta();
  beamInputMeta.getFields( outputRowMeta, stepMeta.getName(), null, null, transMeta, null, null );

  BeamBQInputTransform beamInputTransform = new BeamBQInputTransform(
    stepMeta.getName(),
    stepMeta.getName(),
    transMeta.environmentSubstitute( beamInputMeta.getProjectId() ),
    transMeta.environmentSubstitute( beamInputMeta.getDatasetId() ),
    transMeta.environmentSubstitute( beamInputMeta.getTableId() ),
    transMeta.environmentSubstitute( beamInputMeta.getQuery() ),
    JsonRowMeta.toJson( outputRowMeta ),
    stepPluginClasses,
    xpPluginClasses
  );
  PCollection<KettleRow> afterInput = pipeline.apply( beamInputTransform );
  stepCollectionMap.put( stepMeta.getName(), afterInput );
  log.logBasic( "Handled step (BQ INPUT) : " + stepMeta.getName() );

}