org.apache.beam.sdk.Pipeline Java Examples

The following examples show how to use org.apache.beam.sdk.Pipeline. Each example is taken from an open source project; the source file and the project it comes from are noted above the code.
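For orientation before the project snippets, here is a minimal sketch of the usual Pipeline lifecycle: create the pipeline from options, apply transforms, then run and wait for completion. The transforms chosen here are illustrative only and are not taken from any of the examples below.

import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.options.PipelineOptions;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.transforms.Count;
import org.apache.beam.sdk.transforms.Create;

public class MinimalPipeline {
  public static void main(String[] args) {
    // Parse the runner and any other options from the command line.
    PipelineOptions options = PipelineOptionsFactory.fromArgs(args).create();
    Pipeline pipeline = Pipeline.create(options);

    // Build the graph: a small in-memory source followed by a global count.
    pipeline
        .apply("CreateWords", Create.of("apple", "banana", "cherry"))
        .apply("CountWords", Count.<String>globally());

    // Execute the pipeline and block until it finishes.
    pipeline.run().waitUntilFinish();
  }
}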
Example #1
Source File: ElasticsearchDatasetRuntime.java    From components with Apache License 2.0
@Override
public void getSample(int limit, Consumer<IndexedRecord> consumer) {
    // Create an input runtime based on the properties, making sure to read only the first batch of documents
    // from the index since we're computing a sample
    ElasticsearchInputRuntime inputRuntime = new ElasticsearchInputRuntime(true);
    ElasticsearchInputProperties inputProperties = new ElasticsearchInputProperties(null);
    inputProperties.init();
    inputProperties.setDatasetProperties(properties);
    inputRuntime.initialize(null, inputProperties);

    DirectOptions options = BeamLocalRunnerOption.getOptions();
    final Pipeline p = Pipeline.create(options);

    try (DirectConsumerCollector<IndexedRecord> collector = DirectConsumerCollector.of(consumer)) {
        // Collect a sample of the input records.
        p.apply(inputRuntime) //
                .apply(Sample.<IndexedRecord> any(limit)).apply(collector);
        p.run().waitUntilFinish();
    }
}
 
Example #2
Source File: HadoopFormatIOWriteTest.java    From beam with Apache License 2.0
@Test
public void testWritingDataFailInvalidKeyType() {

  conf.set(HadoopFormatIO.OUTPUT_DIR, tmpFolder.getRoot().getAbsolutePath());
  List<KV<String, Employee>> data = new ArrayList<>();
  data.add(KV.of("key", new Employee("name", "address")));
  PCollection<KV<String, Employee>> input =
      p.apply("CreateData", Create.of(data))
          .setTypeDescriptor(
              TypeDescriptors.kvs(
                  new TypeDescriptor<String>() {}, new TypeDescriptor<Employee>() {}));

  thrown.expect(Pipeline.PipelineExecutionException.class);
  thrown.expectMessage(String.class.getName());

  input.apply(
      "Write",
      HadoopFormatIO.<String, Employee>write()
          .withConfiguration(conf)
          .withPartitioning()
          .withExternalSynchronization(new HDFSSynchronization(getLocksDirPath())));
  p.run().waitUntilFinish();
}
 
Example #3
Source File: TestDataflowRunnerTest.java    From beam with Apache License 2.0
/**
 * Tests that when a streaming pipeline terminates without failing due to {@link PAssert}, the
 * {@link TestPipelineOptions#setOnSuccessMatcher(SerializableMatcher) on success matcher} is
 * invoked.
 */
@Test
public void testStreamingOnSuccessMatcherWhenPipelineSucceeds() throws Exception {
  options.setStreaming(true);
  Pipeline p = TestPipeline.create(options);
  PCollection<Integer> pc = p.apply(Create.of(1, 2, 3));
  PAssert.that(pc).containsInAnyOrder(1, 2, 3);

  final DataflowPipelineJob mockJob = Mockito.mock(DataflowPipelineJob.class);
  when(mockJob.getState()).thenReturn(State.DONE);
  when(mockJob.getProjectId()).thenReturn("test-project");
  when(mockJob.getJobId()).thenReturn("test-job");

  DataflowRunner mockRunner = Mockito.mock(DataflowRunner.class);
  when(mockRunner.run(any(Pipeline.class))).thenReturn(mockJob);

  TestDataflowRunner runner = TestDataflowRunner.fromOptionsAndClient(options, mockClient);
  options.as(TestPipelineOptions.class).setOnSuccessMatcher(new TestSuccessMatcher(mockJob, 1));

  when(mockJob.waitUntilFinish(any(Duration.class), any(JobMessagesHandler.class)))
      .thenReturn(State.DONE);

  when(mockClient.getJobMetrics(anyString()))
      .thenReturn(generateMockMetricResponse(true /* success */, true /* tentative */));
  runner.run(p, mockRunner);
}
 
Example #4
Source File: Task.java    From beam with Apache License 2.0
public static void main(String[] args) {
  PipelineOptions options = PipelineOptionsFactory.fromArgs(args).create();
  Pipeline pipeline = Pipeline.create(options);

  PCollection<String> fruits =
      pipeline.apply("Fruits",
          Create.of("apple", "banana", "cherry")
      );

  PCollection<String> countries =
      pipeline.apply("Countries",
          Create.of("australia", "brazil", "canada")
      );

  PCollection<String> output = applyTransform(fruits, countries);

  output.apply(Log.ofElements());

  pipeline.run();
}
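The applyTransform helper referenced above is the part of this exercise left to the reader and is not shown in the snippet. A plausible implementation, assuming the goal is simply to merge the two inputs, is a Flatten over a PCollectionList:

import org.apache.beam.sdk.transforms.Flatten;
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.values.PCollectionList;

// Hypothetical applyTransform for the Task above: merge the two input
// PCollections into one. This is an assumption; the original helper is not
// part of the snippet shown here.
static PCollection<String> applyTransform(
    PCollection<String> fruits, PCollection<String> countries) {
  return PCollectionList.of(fruits).and(countries).apply(Flatten.pCollections());
}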
 
Example #5
Source File: DataflowPTransformMatchersTest.java    From beam with Apache License 2.0
/** Traverse the pipeline and return the first {@link Combine.GroupedValues} found. */
private static AppliedPTransform<?, ?, ?> getCombineGroupedValuesFrom(TestPipeline pipeline) {
  final AppliedPTransform<?, ?, ?>[] transform = new AppliedPTransform<?, ?, ?>[1];
  pipeline.traverseTopologically(
      new Pipeline.PipelineVisitor.Defaults() {
        @Override
        public CompositeBehavior enterCompositeTransform(TransformHierarchy.Node node) {
          if (!node.isRootNode()
              && node.toAppliedPTransform(getPipeline())
                  .getTransform()
                  .getClass()
                  .equals(Combine.GroupedValues.class)) {
            transform[0] = node.toAppliedPTransform(getPipeline());
            return CompositeBehavior.DO_NOT_ENTER_TRANSFORM;
          }
          return CompositeBehavior.ENTER_TRANSFORM;
        }
      });
  return transform[0];
}
 
Example #6
Source File: ImpulseEvaluatorFactoryTest.java    From beam with Apache License 2.0
@Test
public void testRootProvider() {
  Pipeline p = Pipeline.create();
  PCollection<byte[]> impulseOut = p.apply(Impulse.create());
  // Add a second impulse to demonstrate no crosstalk between applications
  @SuppressWarnings("unused")
  PCollection<byte[]> impulseOutTwo = p.apply(Impulse.create());
  AppliedPTransform<?, ?, ?> impulseApplication = DirectGraphs.getProducer(impulseOut);

  ImpulseRootProvider rootProvider = new ImpulseRootProvider(context);
  when(context.createRootBundle()).thenReturn(bundleFactory.createRootBundle());

  Collection<CommittedBundle<?>> inputs =
      rootProvider.getInitialInputs((AppliedPTransform) impulseApplication, 100);

  assertThat("Only one impulse bundle per application", inputs, hasSize(1));
  assertThat(
      "Only one impulse shard per bundle",
      Iterables.size(inputs.iterator().next().getElements()),
      equalTo(1));
}
 
Example #7
Source File: UnionTest.java    From beam with Apache License 2.0
@Test
public void testUnion_threeDataSets() {
  execute(
      new TestCase<Integer>() {

        @Override
        public PCollection<Integer> getOutput(Pipeline pipeline) {
          final PCollection<Integer> first = createDataset(pipeline, 1, 2, 3, 4, 5, 6);
          final PCollection<Integer> second = createDataset(pipeline, 7, 8, 9, 10, 11, 12);
          final PCollection<Integer> third = createDataset(pipeline, 13, 14, 15, 16, 17, 18);
          return Union.of(first, second, third).output();
        }

        @Override
        public List<Integer> getUnorderedOutput() {
          return Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18);
        }
      });
}
 
Example #8
Source File: BeamEnumerableConverter.java    From beam with Apache License 2.0
private static boolean containsUnboundedPCollection(Pipeline p) {
  class BoundednessVisitor extends PipelineVisitor.Defaults {
    IsBounded boundedness = IsBounded.BOUNDED;

    @Override
    public void visitValue(PValue value, Node producer) {
      if (value instanceof PCollection) {
        boundedness = boundedness.and(((PCollection) value).isBounded());
      }
    }
  }

  BoundednessVisitor visitor = new BoundednessVisitor();
  p.traverseTopologically(visitor);
  return visitor.boundedness == IsBounded.UNBOUNDED;
}
 
Example #9
Source File: CacheTest.java    From beam with Apache License 2.0
@Test
public void shouldCacheTest() {
  SparkPipelineOptions options = createOptions();
  options.setCacheDisabled(true);
  Pipeline pipeline = Pipeline.create(options);

  Values<String> valuesTransform = Create.of("foo", "bar");
  PCollection pCollection = mock(PCollection.class);

  JavaSparkContext jsc = SparkContextFactory.getSparkContext(options);
  EvaluationContext ctxt = new EvaluationContext(jsc, pipeline, options);
  ctxt.getCacheCandidates().put(pCollection, 2L);

  assertFalse(ctxt.shouldCache(valuesTransform, pCollection));

  options.setCacheDisabled(false);
  assertTrue(ctxt.shouldCache(valuesTransform, pCollection));

  GroupByKey<String, String> gbkTransform = GroupByKey.create();
  assertFalse(ctxt.shouldCache(gbkTransform, pCollection));
}
 
Example #10
Source File: DirectRunnerTest.java    From beam with Apache License 2.0
/**
 * Tests that a {@link DoFn} that mutates its input with a good equals() fails in the {@link
 * DirectRunner}.
 */
@Test
public void testMutatingInputDoFnError() throws Exception {
  Pipeline pipeline = getPipeline();

  pipeline
      .apply(
          Create.of(Arrays.asList(1, 2, 3), Arrays.asList(4, 5, 6))
              .withCoder(ListCoder.of(VarIntCoder.of())))
      .apply(
          ParDo.of(
              new DoFn<List<Integer>, Integer>() {
                @ProcessElement
                public void processElement(ProcessContext c) {
                  List<Integer> inputList = c.element();
                  inputList.set(0, 37);
                  c.output(12);
                }
              }));

  thrown.expect(IllegalMutationException.class);
  thrown.expectMessage("Input");
  thrown.expectMessage("must not be mutated");
  pipeline.run();
}
 
Example #11
Source File: DataflowPipelineTranslatorTest.java    From beam with Apache License 2.0
@Test
public void testNetworkConfigMissing() throws IOException {
  DataflowPipelineOptions options = buildPipelineOptions();

  Pipeline p = buildPipeline(options);
  p.traverseTopologically(new RecordingPipelineVisitor());
  SdkComponents sdkComponents = createSdkComponents(options);
  RunnerApi.Pipeline pipelineProto = PipelineTranslation.toProto(p, sdkComponents, true);
  Job job =
      DataflowPipelineTranslator.fromOptions(options)
          .translate(
              p,
              pipelineProto,
              sdkComponents,
              DataflowRunner.fromOptions(options),
              Collections.emptyList())
          .getJob();

  assertEquals(1, job.getEnvironment().getWorkerPools().size());
  assertNull(job.getEnvironment().getWorkerPools().get(0).getNetwork());
}
 
Example #12
Source File: DataflowRunner.java    From beam with Apache License 2.0
private boolean containsUnboundedPCollection(Pipeline p) {
  class BoundednessVisitor extends PipelineVisitor.Defaults {
    IsBounded boundedness = IsBounded.BOUNDED;

    @Override
    public void visitValue(PValue value, Node producer) {
      if (value instanceof PCollection) {
        boundedness = boundedness.and(((PCollection) value).isBounded());
      }
    }
  }

  BoundednessVisitor visitor = new BoundednessVisitor();
  p.traverseTopologically(visitor);
  return visitor.boundedness == IsBounded.UNBOUNDED;
}
 
Example #13
Source File: FlinkPipelineExecutionEnvironmentTest.java    From beam with Apache License 2.0
@Test
public void shouldUseTransformOverrides() {
  boolean[] testParameters = {true, false};
  for (boolean streaming : testParameters) {
    FlinkPipelineOptions options = PipelineOptionsFactory.as(FlinkPipelineOptions.class);
    options.setStreaming(streaming);
    options.setRunner(FlinkRunner.class);
    FlinkPipelineExecutionEnvironment flinkEnv = new FlinkPipelineExecutionEnvironment(options);
    Pipeline p = Mockito.spy(Pipeline.create(options));

    flinkEnv.translate(p);

    ArgumentCaptor<ImmutableList> captor = ArgumentCaptor.forClass(ImmutableList.class);
    Mockito.verify(p).replaceAll(captor.capture());
    ImmutableList<PTransformOverride> overridesList = captor.getValue();

    assertThat(overridesList.isEmpty(), is(false));
    assertThat(
        overridesList.size(), is(FlinkTransformOverrides.getDefaultOverrides(options).size()));
  }
}
 
Example #14
Source File: BigQueryDatasetRuntime.java    From components with Apache License 2.0
public void getSampleDeprecated(int limit, Consumer<IndexedRecord> consumer) {
    // Create a pipeline using the input component to get records.
    DirectOptions options = BeamLocalRunnerOption.getOptions();
    final Pipeline p = Pipeline.create(options);

    // Create an input runtime based on the properties.
    BigQueryInputRuntime inputRuntime = new BigQueryInputRuntime();
    BigQueryInputProperties inputProperties = new BigQueryInputProperties(null);
    inputProperties.init();
    inputProperties.setDatasetProperties(properties);
    inputRuntime.initialize(new BeamJobRuntimeContainer(options), inputProperties);

    try (DirectConsumerCollector<IndexedRecord> collector = DirectConsumerCollector.of(consumer)) {
        // Collect a sample of the input records.
        p
                .apply(inputRuntime) //
                .apply(Sample.<IndexedRecord> any(limit))
                .apply(collector);
        PipelineResult pr = p.run();
        pr.waitUntilFinish();
    }
}
 
Example #15
Source File: BigQueryTimePartitioningClusteringIT.java    From beam with Apache License 2.0
@Test
public void testE2EBigQueryTimePartitioning() throws Exception {
  String tableName = "weather_stations_time_partitioned_" + System.currentTimeMillis();

  Pipeline p = Pipeline.create(options);

  p.apply(BigQueryIO.readTableRows().from(options.getBqcInput()))
      .apply(ParDo.of(new KeepStationNumberAndConvertDate()))
      .apply(
          BigQueryIO.writeTableRows()
              .to(String.format("%s.%s", DATASET_NAME, tableName))
              .withTimePartitioning(TIME_PARTITIONING)
              .withSchema(SCHEMA)
              .withCreateDisposition(BigQueryIO.Write.CreateDisposition.CREATE_IF_NEEDED)
              .withWriteDisposition(BigQueryIO.Write.WriteDisposition.WRITE_TRUNCATE));

  p.run().waitUntilFinish();

  bqClient = BigqueryClient.getNewBigquerryClient(options.getAppName());
  Table table = bqClient.tables().get(options.getProject(), DATASET_NAME, tableName).execute();

  Assert.assertEquals(table.getTimePartitioning(), TIME_PARTITIONING);
}
 
Example #16
Source File: QueryablePipelineTest.java    From beam with Apache License 2.0
@Test
public void getEnvironmentWithEnvironment() {
  Pipeline p = Pipeline.create();
  PCollection<Long> longs = p.apply("BoundedRead", Read.from(CountingSource.upTo(100L)));
  longs.apply(WithKeys.of("a")).apply("groupByKey", GroupByKey.create());

  Components components = PipelineTranslation.toProto(p).getComponents();
  QueryablePipeline qp = QueryablePipeline.forPrimitivesIn(components);

  PTransformNode environmentalRead =
      PipelineNode.pTransform("BoundedRead", components.getTransformsOrThrow("BoundedRead"));
  PTransformNode nonEnvironmentalTransform =
      PipelineNode.pTransform("groupByKey", components.getTransformsOrThrow("groupByKey"));

  assertThat(qp.getEnvironment(environmentalRead).isPresent(), is(true));
  assertThat(
      qp.getEnvironment(environmentalRead).get().getUrn(),
      equalTo(Environments.JAVA_SDK_HARNESS_ENVIRONMENT.getUrn()));
  assertThat(
      qp.getEnvironment(environmentalRead).get().getPayload(),
      equalTo(Environments.JAVA_SDK_HARNESS_ENVIRONMENT.getPayload()));
  assertThat(qp.getEnvironment(nonEnvironmentalTransform).isPresent(), is(false));
}
 
Example #17
Source File: DirectRunnerTest.java    From beam with Apache License 2.0
/**
 * Tests that a {@link DoFn} that mutates an output with a good equals() fails in the {@link
 * DirectRunner}.
 */
@Test
public void testMutatingOutputThenTerminateDoFnError() throws Exception {
  Pipeline pipeline = getPipeline();

  pipeline
      .apply(Create.of(42))
      .apply(
          ParDo.of(
              new DoFn<Integer, List<Integer>>() {
                @ProcessElement
                public void processElement(ProcessContext c) {
                  List<Integer> outputList = Arrays.asList(1, 2, 3, 4);
                  c.output(outputList);
                  outputList.set(0, 37);
                }
              }));

  thrown.expect(IllegalMutationException.class);
  thrown.expectMessage("output");
  thrown.expectMessage("must not be mutated");
  pipeline.run();
}
 
Example #18
Source File: Task.java    From beam with Apache License 2.0
public static void main(String[] args) {
  PipelineOptions options = PipelineOptionsFactory.fromArgs(args).create();
  Pipeline pipeline = Pipeline.create(options);

  PCollection<String> wordsStartingWithA =
      pipeline.apply("Words starting with A",
          Create.of("apple", "ant", "arrow")
      );

  PCollection<String> wordsStartingWithB =
      pipeline.apply("Words starting with B",
          Create.of("ball", "book", "bow")
      );

  PCollection<String> output = applyTransform(wordsStartingWithA, wordsStartingWithB);

  output.apply(Log.ofElements());

  pipeline.run();
}
 
Example #19
Source File: DatastoreToText.java    From DataflowTemplates with Apache License 2.0
/**
 * Runs a pipeline which reads Entities from Datastore, passes the JSON-encoded Entities
 * to a JavaScript UDF, and writes the resulting JSON to a TextIO sink.
 *
 * @param args arguments to the pipeline
 */
public static void main(String[] args) {
  DatastoreToTextOptions options = PipelineOptionsFactory.fromArgs(args)
      .withValidation()
      .as(DatastoreToTextOptions.class);

  Pipeline pipeline = Pipeline.create(options);

  pipeline
      .apply(ReadJsonEntities.newBuilder()
          .setGqlQuery(options.getDatastoreReadGqlQuery())
          .setProjectId(options.getDatastoreReadProjectId())
          .setNamespace(options.getDatastoreReadNamespace())
          .build())
      .apply(TransformTextViaJavascript.newBuilder()
          .setFileSystemPath(options.getJavascriptTextTransformGcsPath())
          .setFunctionName(options.getJavascriptTextTransformFunctionName())
          .build())
      .apply(TextIO.write()
          .to(options.getTextWritePrefix())
          .withSuffix(".json"));

  pipeline.run();
}
 
Example #20
Source File: BatchLoads.java    From beam with Apache License 2.0
private PCollectionView<String> createLoadJobIdPrefixView(Pipeline p) {
  // Create a singleton job ID token at execution time. This will be used as the base for all
  // load jobs issued from this instance of the transform.
  return p.apply("JobIdCreationRoot", Create.of((Void) null))
      .apply(
          "CreateJobId",
          ParDo.of(
              new DoFn<Void, String>() {
                @ProcessElement
                public void process(ProcessContext c) {
                  c.output(
                      String.format(
                          "beam_load_%s_%s",
                          c.getPipelineOptions().getJobName().replaceAll("-", ""),
                          BigQueryHelpers.randomUUIDString()));
                }
              }))
      .apply(View.asSingleton());
}
 
Example #21
Source File: DirectRunnerTest.java    From beam with Apache License 2.0
@Test
public void splitsInputs() {
  Pipeline p = getPipeline();
  PCollection<Long> longs = p.apply(Read.from(MustSplitSource.of(CountingSource.upTo(3))));

  PAssert.that(longs).containsInAnyOrder(0L, 1L, 2L);
  p.run();
}
 
Example #22
Source File: KafkaDatasetRuntime.java    From components with Apache License 2.0
/**
 * @param limit the maximum number of records to return.
 * @param consumer a callback that will be applied to each sampled record. This callback should throw a
 * {@link org.talend.daikon.exception.TalendRuntimeException} if there was an error processing the record. Kafka is
 * an unbounded source, so a timeout must be set to stop reading; getSample uses a timeout of 1 second, whether or
 * not a sample has been collected by then.
 */
@Override
public void getSample(int limit, Consumer<IndexedRecord> consumer) {
    // Create an input runtime based on the properties.
    KafkaInputPTransformRuntime inputRuntime = new KafkaInputPTransformRuntime();
    KafkaInputProperties inputProperties = new KafkaInputProperties(null);
    inputProperties.init();
    inputProperties.setDatasetProperties(dataset);
    inputProperties.useMaxReadTime.setValue(true);
    inputProperties.maxReadTime.setValue(1000L);
    inputProperties.autoOffsetReset.setValue(KafkaInputProperties.OffsetType.EARLIEST);
    // TODO: BEAM-1847: Enable both stopping conditions when they can be set, and remove Sample transform from job.
    // inputProperties.useMaxNumRecords.setValue(true);
    // inputProperties.maxNumRecords.setValue(Long.valueOf(limit));
    inputRuntime.initialize(null, inputProperties);

    // Create a pipeline using the input component to get records.
    PipelineOptions options = PipelineOptionsFactory.create();
    final Pipeline p = Pipeline.create(options);

    try (DirectConsumerCollector<IndexedRecord> collector = DirectConsumerCollector.of(consumer)) {
        // Collect a sample of the input records.
        p.apply(inputRuntime) //
                .apply(Sample.<IndexedRecord> any(limit)).apply(collector);
        p.run().waitUntilFinish();
    }
}
 
Example #23
Source File: FlinkSavepointTest.java    From beam with Apache License 2.0
private void restoreFromSavepointLegacy(Pipeline pipeline, String savepointDir)
    throws ExecutionException, InterruptedException {
  JobGraph jobGraph = getJobGraph(pipeline);
  SavepointRestoreSettings savepointSettings = SavepointRestoreSettings.forPath(savepointDir);
  jobGraph.setSavepointRestoreSettings(savepointSettings);
  flinkCluster.submitJob(jobGraph).get();
}
 
Example #24
Source File: JdbcAvroJob.java    From dbeam with Apache License 2.0
public JdbcAvroJob(
    final PipelineOptions pipelineOptions,
    final Pipeline pipeline,
    final JdbcExportArgs jdbcExportArgs,
    final String output) {
  this.pipelineOptions = pipelineOptions;
  this.pipeline = pipeline;
  this.jdbcExportArgs = jdbcExportArgs;
  this.output = output;
  Preconditions.checkArgument(
      this.output != null && this.output.length() > 0, "'output' must be defined");
}
 
Example #25
Source File: LoadTest.java    From beam with Apache License 2.0
LoadTest(String[] args, Class<OptionsT> testOptions, String metricsNamespace) throws IOException {
  this.metricsNamespace = metricsNamespace;
  this.runtimeMonitor = new TimeMonitor<>(metricsNamespace, "runtime");
  this.options = LoadTestOptions.readFromArgs(args, testOptions);
  this.sourceOptions = fromJsonString(options.getSourceOptions(), SyntheticSourceOptions.class);
  this.pipeline = Pipeline.create(options);
  this.runner = getRunnerName(options.getRunner().getName());
  settings =
      InfluxDBSettings.builder()
          .withHost(options.getInfluxHost())
          .withDatabase(options.getInfluxDatabase())
          .withMeasurement(options.getInfluxMeasurement())
          .get();
}
 
Example #26
Source File: SparkPortableExecutionTest.java    From beam with Apache License 2.0
@Test(timeout = 120_000)
public void testExecStageWithMultipleOutputs() throws Exception {
  PipelineOptions options = PipelineOptionsFactory.create();
  options.setRunner(CrashingRunner.class);
  options
      .as(PortablePipelineOptions.class)
      .setDefaultEnvironmentType(Environments.ENVIRONMENT_EMBEDDED);
  Pipeline pipeline = Pipeline.create(options);
  PCollection<KV<String, String>> a =
      pipeline
          .apply("impulse", Impulse.create())
          .apply("A", ParDo.of(new DoFnWithSideEffect<>("A")));
  PCollection<KV<String, String>> b = a.apply("B", ParDo.of(new DoFnWithSideEffect<>("B")));
  PCollection<KV<String, String>> c = a.apply("C", ParDo.of(new DoFnWithSideEffect<>("C")));
  // Use GBKs to force re-computation of executable stage unless cached.
  b.apply(GroupByKey.create());
  c.apply(GroupByKey.create());
  RunnerApi.Pipeline pipelineProto = PipelineTranslation.toProto(pipeline);
  JobInvocation jobInvocation =
      SparkJobInvoker.createJobInvocation(
          "testExecStageWithMultipleOutputs",
          "testExecStageWithMultipleOutputsRetrievalToken",
          sparkJobExecutor,
          pipelineProto,
          options.as(SparkPipelineOptions.class));
  jobInvocation.start();
  Assert.assertEquals(Enum.DONE, jobInvocation.getState());
}
 
Example #27
Source File: PCollection.java    From beam with Apache License 2.0
/** <b><i>For internal use only; no backwards-compatibility guarantees.</i></b> */
@Internal
public static <T> PCollection<T> createPrimitiveOutputInternal(
    Pipeline pipeline,
    WindowingStrategy<?, ?> windowingStrategy,
    IsBounded isBounded,
    @Nullable Coder<T> coder) {
  PCollection<T> res = new PCollection<>(pipeline, windowingStrategy, isBounded);
  if (coder != null) {
    res.setCoder(coder);
  }
  return res;
}
 
Example #28
Source File: IsmSideInputReaderTest.java    From beam with Apache License 2.0
@Test
public void testIsmReaderReferenceCaching() throws Exception {
  Coder<WindowedValue<Long>> valueCoder =
      WindowedValue.getFullCoder(VarLongCoder.of(), GLOBAL_WINDOW_CODER);
  final WindowedValue<Long> element = valueInGlobalWindow(42L);
  final PCollectionView<Long> view =
      Pipeline.create().apply(Create.empty(VarLongCoder.of())).apply(View.asSingleton());

  final Source source =
      initInputFile(
          fromValues(Arrays.asList(element)),
          IsmRecordCoder.of(1, 0, ImmutableList.<Coder<?>>of(GLOBAL_WINDOW_CODER), valueCoder));
  final Source emptySource =
      initInputFile(
          fromValues(Arrays.asList()),
          IsmRecordCoder.of(1, 0, ImmutableList.<Coder<?>>of(GLOBAL_WINDOW_CODER), valueCoder));

  final IsmSideInputReader reader =
      sideInputReader(view.getTagInternal().getId(), source, emptySource);

  assertTrue(reader.tagToIsmReaderMap.containsKey(view.getTagInternal()));
  assertEquals(1, reader.tagToIsmReaderMap.get(view.getTagInternal()).size());
  assertEquals(
      FileSystems.matchSingleFileSpec(getString(source.getSpec(), WorkerPropertyNames.FILENAME))
          .resourceId(),
      reader.tagToIsmReaderMap.get(view.getTagInternal()).get(0).getResourceId());
  assertTrue(reader.tagToEmptyIsmReaderMap.containsKey(view.getTagInternal()));
  assertEquals(1, reader.tagToEmptyIsmReaderMap.get(view.getTagInternal()).size());
  assertEquals(
      FileSystems.matchSingleFileSpec(
              getString(emptySource.getSpec(), WorkerPropertyNames.FILENAME))
          .resourceId(),
      reader.tagToEmptyIsmReaderMap.get(view.getTagInternal()).get(0).getResourceId());
}
 
Example #29
Source File: DocumentationExamplesTest.java    From beam with Apache License 2.0
@Test
public void metricsAndAccumulatorsSection() {
  final PipelineOptions options = PipelineOptionsFactory.create();
  Pipeline pipeline = Pipeline.create(options);

  PCollection<String> dataset = pipeline.apply(Create.of("a", "x"));

  PCollection<String> flatMapped =
      FlatMap.named("FlatMap1")
          .of(dataset)
          .using(
              (String value, Collector<String> context) -> {
                context.getCounter("my-counter").increment();
                context.collect(value);
              })
          .output();

  PCollection<String> mapped =
      MapElements.named("MapThem")
          .of(dataset)
          .using(
              (value, context) -> {
                // use simple counter
                context.getCounter("my-counter").increment();

                return value.toLowerCase();
              })
          .output();
}
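The snippet above only declares the counters; reading them back requires running the pipeline and querying its metrics. A hedged sketch of how that could look with Beam's metrics query API follows; note that the namespace under which Euphoria registers collector counters is an assumption here, not something shown in the example.

import org.apache.beam.sdk.PipelineResult;
import org.apache.beam.sdk.metrics.MetricNameFilter;
import org.apache.beam.sdk.metrics.MetricQueryResults;
import org.apache.beam.sdk.metrics.MetricResult;
import org.apache.beam.sdk.metrics.MetricsFilter;

PipelineResult result = pipeline.run();
result.waitUntilFinish();

// Query the committed counter values. The namespace "FlatMap1" is only a
// guess at how the counter declared above might be registered.
MetricQueryResults metrics =
    result
        .metrics()
        .queryMetrics(
            MetricsFilter.builder()
                .addNameFilter(MetricNameFilter.named("FlatMap1", "my-counter"))
                .build());
for (MetricResult<Long> counter : metrics.getCounters()) {
  System.out.println(counter.getName() + ": " + counter.getCommitted());
}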
 
Example #30
Source File: Broadcast.java    From nemo with Apache License 2.0
/**
 * Main function for the BEAM program.
 * @param args arguments.
 */
public static void main(final String[] args) {
  final String inputFilePath = args[0];
  final String outputFilePath = args[1];
  final PipelineOptions options = PipelineOptionsFactory.create();
  options.setRunner(NemoPipelineRunner.class);

  final Pipeline p = Pipeline.create(options);
  final PCollection<String> elemCollection = GenericSourceSink.read(p, inputFilePath);
  final PCollectionView<Iterable<String>> allCollection = elemCollection.apply(View.<String>asIterable());

  final PCollection<String> result = elemCollection.apply(ParDo.of(new DoFn<String, String>() {
        @ProcessElement
        public void processElement(final ProcessContext c) {
          final String line = c.element();
          final Iterable<String> all = c.sideInput(allCollection);
          final Optional<String> appended = StreamSupport.stream(all.spliterator(), false)
              .reduce((l, r) -> l + '\n' + r);
          if (appended.isPresent()) {
            c.output("line: " + line + "\n" + appended.get());
          } else {
            c.output("error");
          }
        }
      }).withSideInputs(allCollection)
  );

  GenericSourceSink.write(result, outputFilePath);
  p.run();
}