Java Code Examples for org.apache.beam.sdk.Pipeline#traverseTopologically()

The following examples show how to use org.apache.beam.sdk.Pipeline#traverseTopologically(). You can go to the original project or source file by following the links above each example.
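Most of these examples follow the same pattern: implement Pipeline.PipelineVisitor (usually by extending the no-op PipelineVisitor.Defaults base class and overriding only the callbacks you need), then pass the visitor to traverseTopologically(), which walks the pipeline's transform hierarchy in topological order. As a minimal sketch (the class and field names here are illustrative, not taken from the projects below), a visitor that counts primitive transforms looks like this:

import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.runners.TransformHierarchy;

// Illustrative visitor: counts primitive (leaf) transforms in a pipeline.
// PipelineVisitor.Defaults provides no-op implementations of all callbacks,
// so only the callback of interest needs to be overridden.
class TransformCountingVisitor extends Pipeline.PipelineVisitor.Defaults {
  int primitiveTransforms = 0;

  @Override
  public void visitPrimitiveTransform(TransformHierarchy.Node node) {
    primitiveTransforms++;
  }
}

It is used the same way as the examples below: create the visitor, call pipeline.traverseTopologically(visitor), then read the state the visitor accumulated (here, visitor.primitiveTransforms).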
Example 1
Source File: TrackStreamingSourcesTest.java    From beam with Apache License 2.0
@Test
public void testTrackSingle() {
  options.setRunner(SparkRunner.class);
  JavaSparkContext jsc = SparkContextFactory.getSparkContext(options);
  JavaStreamingContext jssc =
      new JavaStreamingContext(
          jsc, new org.apache.spark.streaming.Duration(options.getBatchIntervalMillis()));

  Pipeline p = Pipeline.create(options);

  CreateStream<Integer> emptyStream =
      CreateStream.of(VarIntCoder.of(), Duration.millis(options.getBatchIntervalMillis()))
          .emptyBatch();

  p.apply(emptyStream).apply(ParDo.of(new PassthroughFn<>()));

  p.traverseTopologically(new StreamingSourceTracker(jssc, p, ParDo.MultiOutput.class, 0));
  assertThat(StreamingSourceTracker.numAssertions, equalTo(1));
}
 
Example 2
Source File: BeamEnumerableConverter.java    From beam with Apache License 2.0
private static boolean containsUnboundedPCollection(Pipeline p) {
  class BoundednessVisitor extends PipelineVisitor.Defaults {
    IsBounded boundedness = IsBounded.BOUNDED;

    @Override
    public void visitValue(PValue value, Node producer) {
      if (value instanceof PCollection) {
        boundedness = boundedness.and(((PCollection) value).isBounded());
      }
    }
  }

  BoundednessVisitor visitor = new BoundednessVisitor();
  p.traverseTopologically(visitor);
  return visitor.boundedness == IsBounded.UNBOUNDED;
}
 
Example 3
Source File: DataflowRunner.java    From beam with Apache License 2.0
private boolean containsUnboundedPCollection(Pipeline p) {
  class BoundednessVisitor extends PipelineVisitor.Defaults {
    IsBounded boundedness = IsBounded.BOUNDED;

    @Override
    public void visitValue(PValue value, Node producer) {
      if (value instanceof PCollection) {
        boundedness = boundedness.and(((PCollection) value).isBounded());
      }
    }
  }

  BoundednessVisitor visitor = new BoundednessVisitor();
  p.traverseTopologically(visitor);
  return visitor.boundedness == IsBounded.UNBOUNDED;
}
 
Example 4
Source File: DataflowRunnerTest.java    From beam with Apache License 2.0
/** Tests that all reads are consumed by at least one {@link PTransform}. */
@Test
public void testUnconsumedReads() throws IOException {
  DataflowPipelineOptions dataflowOptions = buildPipelineOptions();
  RuntimeTestOptions options = dataflowOptions.as(RuntimeTestOptions.class);
  Pipeline p = buildDataflowPipeline(dataflowOptions);
  p.apply(TextIO.read().from(options.getInput()));
  DataflowRunner.fromOptions(dataflowOptions).replaceTransforms(p);
  final AtomicBoolean unconsumedSeenAsInput = new AtomicBoolean();
  p.traverseTopologically(
      new PipelineVisitor.Defaults() {
        @Override
        public void visitPrimitiveTransform(Node node) {
          unconsumedSeenAsInput.set(true);
        }
      });
  assertThat(unconsumedSeenAsInput.get(), is(true));
}
 
Example 5
Source File: DataflowPipelineTranslatorTest.java    From beam with Apache License 2.0
@Test
public void testDiskSizeGbConfig() throws IOException {
  final Integer diskSizeGb = 1234;

  DataflowPipelineOptions options = buildPipelineOptions();
  options.setDiskSizeGb(diskSizeGb);

  Pipeline p = buildPipeline(options);
  p.traverseTopologically(new RecordingPipelineVisitor());
  SdkComponents sdkComponents = createSdkComponents(options);
  RunnerApi.Pipeline pipelineProto = PipelineTranslation.toProto(p, sdkComponents, true);
  Job job =
      DataflowPipelineTranslator.fromOptions(options)
          .translate(
              p,
              pipelineProto,
              sdkComponents,
              DataflowRunner.fromOptions(options),
              Collections.emptyList())
          .getJob();

  assertEquals(1, job.getEnvironment().getWorkerPools().size());
  assertEquals(diskSizeGb, job.getEnvironment().getWorkerPools().get(0).getDiskSizeGb());
}
 
Example 6
Source File: UnconsumedReads.java    From beam with Apache License 2.0
public static void ensureAllReadsConsumed(Pipeline pipeline) {
  final Set<PCollection<?>> unconsumed = new HashSet<>();
  pipeline.traverseTopologically(
      new PipelineVisitor.Defaults() {
        @Override
        public void visitPrimitiveTransform(Node node) {
          unconsumed.removeAll(node.getInputs().values());
        }

        @Override
        public void visitValue(PValue value, Node producer) {
          String urn = PTransformTranslation.urnForTransformOrNull(producer.getTransform());
          if (PTransformTranslation.READ_TRANSFORM_URN.equals(urn)) {
            unconsumed.add((PCollection<?>) value);
          }
        }
      });
  int i = 0;
  for (PCollection<?> unconsumedPCollection : unconsumed) {
    consume(unconsumedPCollection, i);
    i++;
  }
}
 
Example 7
Source File: DataflowPipelineTranslatorTest.java    From beam with Apache License 2.0
@Test
public void testSubnetworkConfigMissing() throws IOException {
  DataflowPipelineOptions options = buildPipelineOptions();

  Pipeline p = buildPipeline(options);
  p.traverseTopologically(new RecordingPipelineVisitor());
  SdkComponents sdkComponents = createSdkComponents(options);
  RunnerApi.Pipeline pipelineProto = PipelineTranslation.toProto(p, sdkComponents, true);
  Job job =
      DataflowPipelineTranslator.fromOptions(options)
          .translate(
              p,
              pipelineProto,
              sdkComponents,
              DataflowRunner.fromOptions(options),
              Collections.emptyList())
          .getJob();

  assertEquals(1, job.getEnvironment().getWorkerPools().size());
  assertNull(job.getEnvironment().getWorkerPools().get(0).getSubnetwork());
}
 
Example 8
Source File: TrackStreamingSourcesTest.java    From beam with Apache License 2.0
@Test
public void testTrackFlattened() {
  options.setRunner(SparkRunner.class);
  JavaSparkContext jsc = SparkContextFactory.getSparkContext(options);
  JavaStreamingContext jssc =
      new JavaStreamingContext(
          jsc, new org.apache.spark.streaming.Duration(options.getBatchIntervalMillis()));

  Pipeline p = Pipeline.create(options);

  CreateStream<Integer> queueStream1 =
      CreateStream.of(VarIntCoder.of(), Duration.millis(options.getBatchIntervalMillis()))
          .emptyBatch();
  CreateStream<Integer> queueStream2 =
      CreateStream.of(VarIntCoder.of(), Duration.millis(options.getBatchIntervalMillis()))
          .emptyBatch();

  PCollection<Integer> pcol1 = p.apply(queueStream1);
  PCollection<Integer> pcol2 = p.apply(queueStream2);
  PCollection<Integer> flattened =
      PCollectionList.of(pcol1).and(pcol2).apply(Flatten.pCollections());
  flattened.apply(ParDo.of(new PassthroughFn<>()));

  p.traverseTopologically(new StreamingSourceTracker(jssc, p, ParDo.MultiOutput.class, 0, 1));
  assertThat(StreamingSourceTracker.numAssertions, equalTo(1));
}
 
Example 9
Source File: PipelineInit.java    From component-runtime with Apache License 2.0
public static void lazyStart(final JobStateAware.State jobState, final Supplier<DIPipeline> pipelineSupplier) {
    final AtomicBoolean pipelineStarted = jobState.getPipelineStarted();
    if (!pipelineStarted.get() && pipelineStarted.compareAndSet(false, true)) {
        final Pipeline pipeline = pipelineSupplier.get();
        final TransformCounter counter = new TransformCounter();
        pipeline.traverseTopologically(counter);
        if (counter.transforms.get() > 0) {
            final PipelineResult result = pipeline.run();
            new Thread("talend-component-kit-di-pipeline-awaiter") {

                @Override
                public void run() {
                    log.debug("Starting to watch beam pipeline");
                    try {
                        result.waitUntilFinish();
                    } finally {
                        final PipelineResult.State state = result.getState();
                        log.debug("Exited pipeline with state {}", state.name());
                        if (state.isTerminal()) {
                            log.info("Beam pipeline ended");
                        } else {
                            log.debug("Beam pipeline ended by interruption");
                        }
                        jobState.getPipelineDone().complete(true);
                    }
                }
            }.start();
        } else {
            jobState.getPipelineDone().complete(true);
            log.warn("A pipeline was created but not transform were found, is your job correctly configured?");
        }
    }
}
 
Example 10
Source File: NemoPipelineRunner.java    From nemo with Apache License 2.0
/**
 * Method to run the Pipeline.
 * @param pipeline the Pipeline to run.
 * @return The result of the pipeline.
 */
public NemoPipelineResult run(final Pipeline pipeline) {
  final DAGBuilder builder = new DAGBuilder<>();
  final NemoPipelineVisitor nemoPipelineVisitor = new NemoPipelineVisitor(builder, nemoPipelineOptions);
  pipeline.traverseTopologically(nemoPipelineVisitor);
  final DAG dag = builder.build();
  final NemoPipelineResult nemoPipelineResult = new NemoPipelineResult();
  JobLauncher.launchDAG(dag);
  return nemoPipelineResult;
}
 
Example 11
Source File: CacheTest.java    From beam with Apache License 2.0
/**
 * Test checks how the cache candidates map is populated by the runner when evaluating the
 * pipeline.
 */
@Test
public void cacheCandidatesUpdaterTest() {
  SparkPipelineOptions options = createOptions();
  Pipeline pipeline = Pipeline.create(options);
  PCollection<String> pCollection = pipeline.apply(Create.of("foo", "bar"));

  // First use of pCollection.
  pCollection.apply(Count.globally());
  // Second use of pCollection.
  PCollectionView<List<String>> view = pCollection.apply(View.asList());

  // Internally, View.asList() creates a PCollection that underlies the PCollectionView; that
  // PCollection should not be cached, because the SparkRunner accesses the PCollectionView
  // rather than the underlying PCollection.
  pipeline
      .apply(Create.of("foo", "baz"))
      .apply(
          ParDo.of(
                  new DoFn<String, String>() {
                    @ProcessElement
                    public void processElement(ProcessContext processContext) {
                      if (processContext.sideInput(view).contains(processContext.element())) {
                        processContext.output(processContext.element());
                      }
                    }
                  })
              .withSideInputs(view));

  JavaSparkContext jsc = SparkContextFactory.getSparkContext(options);
  EvaluationContext ctxt = new EvaluationContext(jsc, pipeline, options);
  SparkRunner.CacheVisitor cacheVisitor =
      new SparkRunner.CacheVisitor(new TransformTranslator.Translator(), ctxt);
  pipeline.traverseTopologically(cacheVisitor);
  assertEquals(2L, (long) ctxt.getCacheCandidates().get(pCollection));
  assertEquals(1L, ctxt.getCacheCandidates().values().stream().filter(l -> l > 1).count());
}
 
Example 12
Source File: NemoRunner.java    From incubator-nemo with Apache License 2.0
/**
 * Method to run the Pipeline.
 *
 * @param pipeline the Pipeline to run.
 * @return The result of the pipeline.
 */
public NemoPipelineResult run(final Pipeline pipeline) {
  final PipelineVisitor pipelineVisitor = new PipelineVisitor(pipeline, nemoPipelineOptions);
  pipeline.traverseTopologically(pipelineVisitor);
  final NemoPipelineResult nemoPipelineResult = new NemoPipelineResult();
  CompletableFuture.runAsync(() ->
    JobLauncher.launchDAG(pipelineVisitor.getConvertedPipeline(), nemoPipelineOptions.getJobName()))
    .thenRun(nemoPipelineResult::setJobDone);
  return nemoPipelineResult;
}
 
Example 13
Source File: DataflowPipelineTranslatorTest.java    From beam with Apache License 2.0
@Test
public void testScalingAlgorithmMissing() throws IOException {
  DataflowPipelineOptions options = buildPipelineOptions();

  Pipeline p = buildPipeline(options);
  p.traverseTopologically(new RecordingPipelineVisitor());
  SdkComponents sdkComponents = createSdkComponents(options);
  RunnerApi.Pipeline pipelineProto = PipelineTranslation.toProto(p, sdkComponents, true);
  Job job =
      DataflowPipelineTranslator.fromOptions(options)
          .translate(
              p,
              pipelineProto,
              sdkComponents,
              DataflowRunner.fromOptions(options),
              Collections.emptyList())
          .getJob();

  assertEquals(1, job.getEnvironment().getWorkerPools().size());
  // Autoscaling settings are always set.
  assertNull(
      job.getEnvironment().getWorkerPools().get(0).getAutoscalingSettings().getAlgorithm());
  assertEquals(
      0,
      job.getEnvironment()
          .getWorkerPools()
          .get(0)
          .getAutoscalingSettings()
          .getMaxNumWorkers()
          .intValue());
}
 
Example 14
Source File: SparkRunner.java    From beam with Apache License 2.0
/** Visit the pipeline to determine the translation mode (batch/streaming). */
private void detectTranslationMode(Pipeline pipeline) {
  TranslationModeDetector detector = new TranslationModeDetector();
  pipeline.traverseTopologically(detector);
  if (detector.getTranslationMode().equals(TranslationMode.STREAMING)) {
    // set streaming mode if it's a streaming pipeline
    this.mOptions.setStreaming(true);
  }
}
 
Example 15
Source File: DirectRunner.java    From beam with Apache License 2.0
@Override
public DirectPipelineResult run(Pipeline pipeline) {
  try {
    options =
        MAPPER
            .readValue(MAPPER.writeValueAsBytes(options), PipelineOptions.class)
            .as(DirectOptions.class);
  } catch (IOException e) {
    throw new IllegalArgumentException(
        "PipelineOptions specified failed to serialize to JSON.", e);
  }

  pipeline.replaceAll(defaultTransformOverrides());
  MetricsEnvironment.setMetricsSupported(true);
  try {
    DirectGraphVisitor graphVisitor = new DirectGraphVisitor();
    pipeline.traverseTopologically(graphVisitor);

    @SuppressWarnings("rawtypes")
    KeyedPValueTrackingVisitor keyedPValueVisitor = KeyedPValueTrackingVisitor.create();
    pipeline.traverseTopologically(keyedPValueVisitor);

    DisplayDataValidator.validatePipeline(pipeline);
    DisplayDataValidator.validateOptions(options);

    ExecutorService metricsPool =
        Executors.newCachedThreadPool(
            new ThreadFactoryBuilder()
                .setThreadFactory(MoreExecutors.platformThreadFactory())
                .setDaemon(false) // otherwise you say you want to leak, please don't!
                .setNameFormat("direct-metrics-counter-committer")
                .build());
    DirectGraph graph = graphVisitor.getGraph();
    EvaluationContext context =
        EvaluationContext.create(
            clockSupplier.get(),
            Enforcement.bundleFactoryFor(enabledEnforcements, graph),
            graph,
            keyedPValueVisitor.getKeyedPValues(),
            metricsPool);

    TransformEvaluatorRegistry registry =
        TransformEvaluatorRegistry.javaSdkNativeRegistry(context, options);
    PipelineExecutor executor =
        ExecutorServiceParallelExecutor.create(
            options.getTargetParallelism(),
            registry,
            Enforcement.defaultModelEnforcements(enabledEnforcements),
            context,
            metricsPool);
    executor.start(graph, RootProviderRegistry.javaNativeRegistry(context, options));

    DirectPipelineResult result = new DirectPipelineResult(executor, context);
    if (options.isBlockOnRun()) {
      try {
        result.waitUntilFinish();
      } catch (UserCodeException userException) {
        throw new PipelineExecutionException(userException.getCause());
      } catch (Throwable t) {
        if (t instanceof RuntimeException) {
          throw (RuntimeException) t;
        }
        throw new RuntimeException(t);
      }
    }
    return result;
  } finally {
    MetricsEnvironment.setMetricsSupported(false);
  }
}
 
Example 16
Source File: SparkRunner.java    From beam with Apache License 2.0
/** Evaluator that updates/populates the cache candidates. */
public static void updateCacheCandidates(
    Pipeline pipeline, SparkPipelineTranslator translator, EvaluationContext evaluationContext) {
  CacheVisitor cacheVisitor = new CacheVisitor(translator, evaluationContext);
  pipeline.traverseTopologically(cacheVisitor);
}
 
Example 17
Source File: PViewToIdMapper.java    From beam with Apache License 2.0
public static Map<PValue, String> buildIdMap(Pipeline pipeline) {
  final PViewToIdMapper mapper = new PViewToIdMapper();
  pipeline.traverseTopologically(mapper);
  return mapper.getIdMap();
}
 
Example 18
Source File: DisplayDataEvaluator.java    From beam with Apache License 2.0
private static Set<DisplayData> displayDataForPipeline(Pipeline pipeline, PTransform<?, ?> root) {
  PrimitiveDisplayDataPTransformVisitor visitor = new PrimitiveDisplayDataPTransformVisitor(root);
  pipeline.traverseTopologically(visitor);
  return visitor.getPrimitivesDisplayData();
}
 
Example 19
Source File: FlinkPipelineTranslator.java    From beam with Apache License 2.0
/**
 * Translates the pipeline by passing this class as a visitor.
 *
 * @param pipeline The pipeline to be translated
 */
public void translate(Pipeline pipeline) {
  pipeline.traverseTopologically(this);
}
 
Example 20
Source File: Twister2PipelineTranslator.java    From twister2 with Apache License 2.0
/**
 * Translates the pipeline by passing this class as a visitor.
 *
 * @param pipeline The pipeline to be translated
 */
public void translate(Pipeline pipeline) {
  pipeline.traverseTopologically(this);
}