Java Code Examples for org.apache.beam.sdk.Pipeline#apply()

The following examples show how to use org.apache.beam.sdk.Pipeline#apply(). Each example is taken from an open source project; the source file and license are noted above it.
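Before the project examples, here is a minimal sketch of the basic call shape. The class name, step labels, and sample data are illustrative only and not taken from any of the projects below; the sketch assumes the standard Beam SDK classes (Create, Count, PipelineOptionsFactory). Pipeline#apply() attaches a root PTransform to the pipeline and returns its output PCollection, on which further transforms are chained with PCollection#apply().

import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.options.PipelineOptions;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.transforms.Count;
import org.apache.beam.sdk.transforms.Create;
import org.apache.beam.sdk.values.KV;
import org.apache.beam.sdk.values.PCollection;

public class PipelineApplySketch { // illustrative class name, not from the examples below
  public static void main(String[] args) {
    PipelineOptions options = PipelineOptionsFactory.fromArgs(args).create();
    Pipeline pipeline = Pipeline.create(options);

    // Pipeline#apply() adds a root transform (one with no input PCollection)
    // and returns that transform's output.
    PCollection<String> words = pipeline.apply("CreateWords", Create.of("a", "b", "a"));

    // Subsequent transforms are applied to the returned PCollection.
    PCollection<KV<String, Long>> counts = words.apply(Count.perElement());

    pipeline.run().waitUntilFinish();
  }
}

The two-argument overload apply(name, transform), used for "CreateWords" above, gives the step a stable display name; the same pattern appears in Examples 2, 7, and 10 below.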
Example 1
Source File: DataflowRunnerTest.java    From beam with Apache License 2.0
/** Tests that all reads are consumed by at least one {@link PTransform}. */
@Test
public void testUnconsumedReads() throws IOException {
  DataflowPipelineOptions dataflowOptions = buildPipelineOptions();
  RuntimeTestOptions options = dataflowOptions.as(RuntimeTestOptions.class);
  Pipeline p = buildDataflowPipeline(dataflowOptions);
  p.apply(TextIO.read().from(options.getInput()));
  DataflowRunner.fromOptions(dataflowOptions).replaceTransforms(p);
  final AtomicBoolean unconsumedSeenAsInput = new AtomicBoolean();
  p.traverseTopologically(
      new PipelineVisitor.Defaults() {
        @Override
        public void visitPrimitiveTransform(Node node) {
          unconsumedSeenAsInput.set(true);
        }
      });
  assertThat(unconsumedSeenAsInput.get(), is(true));
}
 
Example 2
Source File: QueryablePipelineTest.java    From beam with Apache License 2.0
@Test
public void rootTransforms() {
  Pipeline p = Pipeline.create();
  p.apply("UnboundedRead", Read.from(CountingSource.unbounded()))
      .apply(Window.into(FixedWindows.of(Duration.millis(5L))))
      .apply(Count.perElement());
  p.apply("BoundedRead", Read.from(CountingSource.upTo(100L)));

  Components components = PipelineTranslation.toProto(p).getComponents();
  QueryablePipeline qp = QueryablePipeline.forPrimitivesIn(components);

  assertThat(qp.getRootTransforms(), hasSize(2));
  for (PTransformNode rootTransform : qp.getRootTransforms()) {
    assertThat(
        "Root transforms should have no inputs",
        rootTransform.getTransform().getInputsCount(),
        equalTo(0));
    assertThat(
        "Only added source reads to the pipeline",
        rootTransform.getTransform().getSpec().getUrn(),
        equalTo(PTransformTranslation.READ_TRANSFORM_URN));
  }
}
 
Example 3
Source File: FlinkPipelineExecutionEnvironmentTest.java    From beam with Apache License 2.0
@Test
public void testTranslationModeNoOverrideWithoutUnboundedSources() {
  boolean[] testArgs = new boolean[] {true, false};
  for (boolean streaming : testArgs) {
    FlinkPipelineOptions options = PipelineOptionsFactory.as(FlinkPipelineOptions.class);
    options.setRunner(FlinkRunner.class);
    options.setStreaming(streaming);

    FlinkPipelineExecutionEnvironment flinkEnv = new FlinkPipelineExecutionEnvironment(options);
    Pipeline pipeline = Pipeline.create(options);
    pipeline.apply(GenerateSequence.from(0).to(10));
    flinkEnv.translate(pipeline);

    assertThat(options.isStreaming(), Matchers.is(streaming));
  }
}
 
Example 4
Source File: BeamBigQueryInputStepHandler.java    From hop with Apache License 2.0
@Override public void handleStep( ILogChannel log, TransformMeta transformMeta, Map<String, PCollection<HopRow>> stepCollectionMap,
                                  Pipeline pipeline, IRowMeta rowMeta, List<TransformMeta> previousSteps,
                                  PCollection<HopRow> input ) throws HopException {

  // Input handling
  //
  BeamBQInputMeta beamInputMeta = (BeamBQInputMeta) transformMeta.getTransform();

  // Output rows (fields selection)
  //
  IRowMeta outputRowMeta = new RowMeta();
  beamInputMeta.getFields( outputRowMeta, transformMeta.getName(), null, null, pipelineMeta, null );

  BeamBQInputTransform beamInputTransform = new BeamBQInputTransform(
    transformMeta.getName(),
    transformMeta.getName(),
    pipelineMeta.environmentSubstitute( beamInputMeta.getProjectId() ),
    pipelineMeta.environmentSubstitute( beamInputMeta.getDatasetId() ),
    pipelineMeta.environmentSubstitute( beamInputMeta.getTableId() ),
    pipelineMeta.environmentSubstitute( beamInputMeta.getQuery() ),
    JsonRowMeta.toJson( outputRowMeta ),
    transformPluginClasses,
    xpPluginClasses
  );
  PCollection<HopRow> afterInput = pipeline.apply( beamInputTransform );
  stepCollectionMap.put( transformMeta.getName(), afterInput );
  log.logBasic( "Handled transform (BQ INPUT) : " + transformMeta.getName() );

}
 
Example 5
Source File: SparkSimpleFileIOOutputRuntimeTestIT.java    From components with Apache License 2.0
/**
 * Basic unit test using all default values (except for the path) on an in-memory DFS cluster.
 */
@Category(ValidatesRunner.class)
@Ignore("BEAM-1206")
@Test
public void testBasicDefaults() throws IOException {
    FileSystem fs = FileSystem.get(spark.createHadoopConfiguration());
    String fileSpec = fs.getUri().resolve(new Path(tmp.getRoot().toString(), "basic").toUri()).toString();

    // Configure the component.
    SimpleFileIOOutputProperties props = SimpleFileIOOutputRuntimeTest.createOutputComponentProperties();
    props.getDatasetProperties().path.setValue(fileSpec);
    props.getDatasetProperties().format.setValue(SimpleFileIOFormat.AVRO);

    // Create the runtime.
    SimpleFileIOOutputRuntime runtime = new SimpleFileIOOutputRuntime();
    runtime.initialize(null, props);

    // Use the runtime in a Spark pipeline to test.
    final Pipeline p = spark.createPipeline();
    PCollection<IndexedRecord> input = p.apply( //
            Create.of(ConvertToIndexedRecord.convertToAvro(new String[] { "1", "one" }), //
                    ConvertToIndexedRecord.convertToAvro(new String[] { "2", "two" }))); //
    input.apply(runtime);

    // And run the test.
    p.run().waitUntilFinish();

    // Check the expected values.
    MiniDfsResource.assertReadFile(fs, fileSpec, "1;one", "2;two");
}
 
Example 6
Source File: TestDataflowRunnerTest.java    From beam with Apache License 2.0
/**
 * Tests that if a streaming pipeline crash-loops for a non-assertion reason, the test run
 * throws an {@link AssertionError}.
 *
 * <p>This is a known limitation/bug of the runner: it does not distinguish the two modes of
 * failure.
 */
@Test
public void testStreamingPipelineFailsIfException() throws Exception {
  options.setStreaming(true);
  Pipeline pipeline = TestPipeline.create(options);
  PCollection<Integer> pc = pipeline.apply(Create.of(1, 2, 3));
  PAssert.that(pc).containsInAnyOrder(1, 2, 3);

  DataflowPipelineJob mockJob = Mockito.mock(DataflowPipelineJob.class);
  when(mockJob.getState()).thenReturn(State.RUNNING);
  when(mockJob.getProjectId()).thenReturn("test-project");
  when(mockJob.getJobId()).thenReturn("test-job");
  when(mockJob.waitUntilFinish(any(Duration.class), any(JobMessagesHandler.class)))
      .thenAnswer(
          invocation -> {
            JobMessage message = new JobMessage();
            message.setMessageText("FooException");
            message.setTime(TimeUtil.toCloudTime(Instant.now()));
            message.setMessageImportance("JOB_MESSAGE_ERROR");
            ((JobMessagesHandler) invocation.getArguments()[1]).process(Arrays.asList(message));
            return State.CANCELLED;
          });

  DataflowRunner mockRunner = Mockito.mock(DataflowRunner.class);
  when(mockRunner.run(any(Pipeline.class))).thenReturn(mockJob);

  when(mockClient.getJobMetrics(anyString()))
      .thenReturn(generateMockMetricResponse(false /* success */, true /* tentative */));
  TestDataflowRunner runner = TestDataflowRunner.fromOptionsAndClient(options, mockClient);

  expectedException.expect(RuntimeException.class);
  runner.run(pipeline, mockRunner);
}
 
Example 7
Source File: IndexerPipeline.java    From dataflow-opinion-analysis with Apache License 2.0
/**
 * Filters out content items whose document hashes have already been processed, using a
 * BigQuery lookup (or an empty map when writing with truncate) as a side input.
 *
 * @param contentToIndexNotSkipped candidate content to index that was not skipped upstream
 * @param contentNotToIndexSkipped content already marked upstream as not to be indexed
 * @param pipeline the pipeline to attach the lookup and de-duplication transforms to
 * @param options pipeline options used to decide whether to query previously processed documents
 * @return the content split into items to index and items not to index
 */
private static ContentToIndexOrNot filterAlreadyProcessedDocuments(
		PCollection<InputContent> contentToIndexNotSkipped, PCollection<InputContent> contentNotToIndexSkipped,
		Pipeline pipeline, IndexerPipelineOptions options) {
	PCollection<KV<String,Long>> alreadyProcessedDocs = null;
	
	if (!options.getWriteTruncate()) {
		String query = IndexerPipelineUtils.buildBigQueryProcessedDocsQuery(options);
		alreadyProcessedDocs = pipeline
			.apply("Get already processed Documents",BigQueryIO.read().fromQuery(query))
			.apply(ParDo.of(new GetDocumentHashFn()));

	} else {
		Map<String, Long> map = new HashMap<String,Long>();
		alreadyProcessedDocs = pipeline
			.apply("Create empty side input of Docs",
				Create.of(map).withCoder(KvCoder.of(StringUtf8Coder.of(),VarLongCoder.of())));
	}			
	
	final PCollectionView<Map<String,Long>> alreadyProcessedDocsSideInput =  
		alreadyProcessedDocs.apply(View.<String,Long>asMap());
	
	PCollectionTuple indexOrNotBasedOnExactDupes = contentToIndexNotSkipped
		.apply("Extract DocumentHash key", ParDo.of(new GetInputContentDocumentHashFn()))
		.apply("Group by DocumentHash key", GroupByKey.<String, InputContent>create())
		.apply("Eliminate InputContent Dupes", ParDo.of(new EliminateInputContentDupes(alreadyProcessedDocsSideInput))
			.withSideInputs(alreadyProcessedDocsSideInput)
			.withOutputTags(PipelineTags.contentToIndexNotExactDupesTag, // main output collection
				TupleTagList.of(PipelineTags.contentNotToIndexExactDupesTag))); // side output collection	
	
	PCollection<InputContent> contentToIndexNotExactDupes = indexOrNotBasedOnExactDupes.get(PipelineTags.contentToIndexNotExactDupesTag);
	PCollection<InputContent> contentNotToIndexExactDupes = indexOrNotBasedOnExactDupes.get(PipelineTags.contentNotToIndexExactDupesTag);
	
	// Merge the sets of items that are dupes or skipped
	PCollectionList<InputContent> contentNotToIndexList = PCollectionList.of(contentNotToIndexExactDupes).and(contentNotToIndexSkipped);
	
	ContentToIndexOrNot content = new ContentToIndexOrNot(contentToIndexNotExactDupes, contentNotToIndexList.apply(Flatten.<InputContent>pCollections()));
	return content;
}
 
Example 8
Source File: S3OutputRuntimeTestIT.java    From components with Apache License 2.0
@Test
public void testAvro_merge() throws IOException {
    S3DatasetProperties datasetProps = s3.createS3DatasetProperties();
    datasetProps.format.setValue(SimpleFileIOFormat.AVRO);
    S3OutputProperties outputProperties = new S3OutputProperties("out");
    outputProperties.init();
    outputProperties.setDatasetProperties(datasetProps);
    outputProperties.mergeOutput.setValue(true);

    // Create the runtime.
    S3OutputRuntime runtime = new S3OutputRuntime();
    runtime.initialize(null, outputProperties);

    // Use the runtime in a Spark pipeline to test.
    final Pipeline p = spark.createPipeline();
    PCollection<IndexedRecord> input = p.apply( //
            Create.of(ConvertToIndexedRecord.convertToAvro(new String[] { "1", "one" }), //
                    ConvertToIndexedRecord.convertToAvro(new String[] { "2", "two" }))); //
    input.apply(runtime);

    // And run the test.
    p.run().waitUntilFinish();

    FileSystem s3FileSystem = S3Connection.createFileSystem(datasetProps);
    MiniDfsResource.assertReadAvroFile(s3FileSystem, s3.getS3APath(datasetProps),
            new HashSet<IndexedRecord>(Arrays.asList(ConvertToIndexedRecord.convertToAvro(new String[] { "1", "one" }), //
                    ConvertToIndexedRecord.convertToAvro(new String[] { "2", "two" }))),
            false);
    MiniDfsResource.assertFileNumber(s3FileSystem, s3.getS3APath(datasetProps), 1);

}
 
Example 9
Source File: SparkSimpleFileIOOutputRuntimeTestIT.java    From components with Apache License 2.0
@Test
public void testAvro_merge() throws IOException {
    FileSystem fs = FileSystem.get(spark.createHadoopConfiguration());
    String fileSpec = fs.getUri().resolve(new Path(tmp.getRoot().toString(), "output.avro").toUri()).toString();

    // Configure the component.
    SimpleFileIOOutputProperties props = SimpleFileIOOutputRuntimeTest.createOutputComponentProperties();
    props.getDatasetProperties().path.setValue(fileSpec);
    props.getDatasetProperties().format.setValue(SimpleFileIOFormat.AVRO);
    props.mergeOutput.setValue(true);

    // Create the runtime.
    SimpleFileIOOutputRuntime runtime = new SimpleFileIOOutputRuntime();
    runtime.initialize(null, props);

    // Use the runtime in a Spark pipeline to test.
    final Pipeline p = spark.createPipeline();
    PCollection<IndexedRecord> input = p.apply( //
            Create.of(ConvertToIndexedRecord.convertToAvro(new String[] { "1", "one" }), //
                    ConvertToIndexedRecord.convertToAvro(new String[] { "2", "two" }))); //
    input.apply(runtime);

    // And run the test.
    p.run().waitUntilFinish();

    // Check the expected values.

    MiniDfsResource.assertReadAvroFile(fs, fileSpec,
            new HashSet<IndexedRecord>(Arrays.asList(ConvertToIndexedRecord.convertToAvro(new String[] { "1", "one" }), //
                    ConvertToIndexedRecord.convertToAvro(new String[] { "2", "two" }))),
            false);
    MiniDfsResource.assertFileNumber(fs, fileSpec, 1);
}
 
Example 10
Source File: Task.java    From beam with Apache License 2.0
public static void main(String[] args) {
  PipelineOptions options = PipelineOptionsFactory.fromArgs(args).create();
  Pipeline pipeline = Pipeline.create(options);

  PCollection<KV<String, String>> citiesToCountries =
      pipeline.apply("Cities and Countries",
          Create.of(
              KV.of("Beijing", "China"),
              KV.of("London", "United Kingdom"),
              KV.of("San Francisco", "United States"),
              KV.of("Singapore", "Singapore"),
              KV.of("Sydney", "Australia")
          ));

  PCollectionView<Map<String, String>> citiesToCountriesView =
      createView(citiesToCountries);

  PCollection<Person> persons =
      pipeline.apply("Persons",
          Create.of(
              new Person("Henry", "Singapore"),
              new Person("Jane", "San Francisco"),
              new Person("Lee", "Beijing"),
              new Person("John", "Sydney"),
              new Person("Alfred", "London")
          ));

  PCollection<Person> output = applyTransform(persons, citiesToCountriesView);

  output.apply(Log.ofElements());

  pipeline.run();
}
 
Example 11
Source File: SpannerGroupWrite.java    From java-docs-samples with Apache License 2.0
public static void main(String[] args) {
  Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
  Pipeline p = Pipeline.create(options);

  String instanceId = options.getInstanceId();
  String databaseId = options.getDatabaseId();

  String usersIdFile = options.getSuspiciousUsersFile();

  PCollection<String> suspiciousUserIds = p.apply(TextIO.read().from(usersIdFile));

  final Timestamp timestamp = Timestamp.now();

  // [START spanner_dataflow_writegroup]
  PCollection<MutationGroup> mutations = suspiciousUserIds
      .apply(MapElements.via(new SimpleFunction<String, MutationGroup>() {

        @Override
        public MutationGroup apply(String userId) {
          // Immediately block the user.
          Mutation userMutation = Mutation.newUpdateBuilder("Users")
              .set("id").to(userId)
              .set("state").to("BLOCKED")
              .build();
          long generatedId = Hashing.sha1().newHasher()
              .putString(userId, Charsets.UTF_8)
              .putLong(timestamp.getSeconds())
              .putLong(timestamp.getNanos())
              .hash()
              .asLong();

          // Add an entry to pending review requests.
          Mutation pendingReview = Mutation.newInsertOrUpdateBuilder("PendingReviews")
              .set("id").to(generatedId)  // Must be deterministically generated.
              .set("userId").to(userId)
              .set("action").to("REVIEW ACCOUNT")
              .set("note").to("Suspicious activity detected.")
              .build();

          return MutationGroup.create(userMutation, pendingReview);
        }
      }));

  mutations.apply(SpannerIO.write()
      .withInstanceId(instanceId)
      .withDatabaseId(databaseId)
      .grouped());
  // [END spanner_dataflow_writegroup]

  p.run().waitUntilFinish();

}
 
Example 12
Source File: BigQueryReadWriteIT.java    From beam with Apache License 2.0
private PCollection<Row> createPCollection(Pipeline pipeline, Row... rows) {
  return pipeline.apply(Create.of(Arrays.asList(rows)).withRowSchema(SOURCE_SCHEMA));
}
 
Example 13
Source File: TestUtils.java    From beam with Apache License 2.0
public static <T> PCollection<T> createMockDataset(
    Pipeline pipeline, TypeDescriptor<T> typeDescriptor) {
  return pipeline.apply(Create.empty(typeDescriptor));
}
 
Example 14
Source File: SimpleFileIOOutputErrorTest.java    From components with Apache License 2.0
/**
 * Basic unit test using all default values (except for the path) on an in-memory DFS cluster.
 */
@Test
public void testUnauthorizedOverwrite() throws IOException, URISyntaxException {
    Path parent = new Path(mini.newFolder().toString());
    Path dst = new Path(parent, "output");
    String fileSpec = mini.getLocalFs().getUri().resolve(dst.toUri()).toString();

    // Write something to the file before trying to run.
    try (OutputStream out = mini.getLocalFs().create(new Path(dst, "part-00000"))) {
        out.write(0);
    }

    // Ensure that the destination is unwritable.
    FileUtil.chmod(dst.toUri().toString(), "000", true);

    // Trying to overwrite an unmodifiable destination throws an exception.
    thrown.expect(TalendRuntimeException.class);
    thrown.expect(hasProperty("code", is(SimpleFileIOErrorCode.OUTPUT_NOT_AUTHORIZED)));
    thrown.expectMessage("Can not write to " + fileSpec
            + ". Please check user permissions or existence of base directory.");

    // Now try using the component.
    try {
        // Configure the component.
        SimpleFileIOOutputProperties props = SimpleFileIOOutputRuntimeTest.createOutputComponentProperties();
        props.getDatasetProperties().path.setValue(fileSpec);
        props.overwrite.setValue(true);

        // Create the runtime.
        SimpleFileIOOutputRuntime runtime = new SimpleFileIOOutputRuntime();
        runtime.initialize(null, props);

        // Use the runtime in a direct pipeline to test.
        final Pipeline p = beam.createPipeline();
        PCollection<IndexedRecord> input = p.apply( //
                Create.of(ConvertToIndexedRecord.convertToAvro(new String[] { "1", "one" }), //
                        ConvertToIndexedRecord.convertToAvro(new String[] { "2", "two" }))); //
        input.apply(runtime);

        // And run the test.
        runtime.runAtDriver(null);
        p.run().waitUntilFinish();
    } catch (Pipeline.PipelineExecutionException e) {
        if (e.getCause() instanceof TalendRuntimeException)
            throw (TalendRuntimeException) e.getCause();
        throw e;
    }
}
 
Example 15
Source File: Task.java    From beam with Apache License 2.0
static PCollection<String> setupPipeline(Pipeline pipeline) {
  return pipeline.apply(Create.of("Hello Beam"));
}
 
Example 16
Source File: Task.java    From beam with Apache License 2.0
public static void main(String[] args) {
  PipelineOptions options = PipelineOptionsFactory.fromArgs(args).create();
  Pipeline pipeline = Pipeline.create(options);

  PCollection<Integer> numbers = pipeline.apply(Create.of(1, 2, 3, 4, 5, 6, 7, 8, 9, 10));

  PCollection<Integer> output = applyTransform(numbers);

  output.apply(Log.ofElements());

  pipeline.run();
}
 
Example 17
Source File: Task.java    From beam with Apache License 2.0
public static void main(String[] args) {
  PipelineOptions options = PipelineOptionsFactory.fromArgs(args).create();
  Pipeline pipeline = Pipeline.create(options);

  PCollection<Integer> numbers = pipeline.apply(Create.of(10, 30, 50, 70, 90));

  PCollection<Integer> output = applyTransform(numbers);

  output.apply(Log.ofElements());

  pipeline.run();
}
 
Example 18
Source File: Task.java    From beam with Apache License 2.0
public static void main(String[] args) {
  PipelineOptions options = PipelineOptionsFactory.fromArgs(args).create();
  Pipeline pipeline = Pipeline.create(options);

  PCollection<String> events =
      pipeline.apply(GenerateEvent.everySecond());

  PCollection<Long> output = applyTransform(events);

  output.apply(Log.ofElements());

  pipeline.run();
}
 
Example 19
Source File: Task.java    From beam with Apache License 2.0
public static void main(String[] args) {
  PipelineOptions options = PipelineOptionsFactory.fromArgs(args).create();
  Pipeline pipeline = Pipeline.create(options);

  PCollection<Integer> numbers = pipeline.apply(Create.of(10, 20, 50, 70, 90));

  PCollection<Double> output = applyTransform(numbers);

  output.apply(Log.ofElements());

  pipeline.run();
}
 
Example 20
Source File: Task.java    From beam with Apache License 2.0
public static void main(String[] args) {
  PipelineOptions options = PipelineOptionsFactory.fromArgs(args).create();
  Pipeline pipeline = Pipeline.create(options);

  PCollection<Integer> numbers = pipeline.apply(Create.of(1, 2, 3, 4, 5, 6, 7, 8, 9, 10));

  PCollection<Integer> output = applyTransform(numbers);

  output.apply(Log.ofElements());

  pipeline.run();
}