Java Code Examples for org.apache.beam.sdk.Pipeline#traverseTopologically()

The following examples show how to use org.apache.beam.sdk.Pipeline#traverseTopologically(). You can go to the original project or source file by following the links above each example.
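Most of these examples follow the same pattern: implement Pipeline.PipelineVisitor (usually by extending the no-op PipelineVisitor.Defaults base class and overriding only the callbacks you need), then pass the visitor to traverseTopologically(), which walks the pipeline's transform hierarchy in topological order. As a minimal sketch (the class and field names here are illustrative, not taken from the projects below), a visitor that counts primitive transforms looks like this:

import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.runners.TransformHierarchy;

// Illustrative visitor: counts primitive (leaf) transforms in a pipeline.
// PipelineVisitor.Defaults provides no-op implementations of all callbacks,
// so only the callback of interest needs to be overridden.
class TransformCountingVisitor extends Pipeline.PipelineVisitor.Defaults {
  int primitiveTransforms = 0;

  @Override
  public void visitPrimitiveTransform(TransformHierarchy.Node node) {
    primitiveTransforms++;
  }
}

It is used the same way as the examples below: create the visitor, call pipeline.traverseTopologically(visitor), then read the state the visitor accumulated (here, visitor.primitiveTransforms).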
Example 1
Source File: TrackStreamingSourcesTest.java    From beam with Apache License 2.0
@Test
public void testTrackSingle() {
  options.setRunner(SparkRunner.class);
  JavaSparkContext jsc = SparkContextFactory.getSparkContext(options);
  JavaStreamingContext jssc =
      new JavaStreamingContext(
          jsc, new org.apache.spark.streaming.Duration(options.getBatchIntervalMillis()));

  Pipeline p = Pipeline.create(options);

  CreateStream<Integer> emptyStream =
      CreateStream.of(VarIntCoder.of(), Duration.millis(options.getBatchIntervalMillis()))
          .emptyBatch();

  p.apply(emptyStream).apply(ParDo.of(new PassthroughFn<>()));

  p.traverseTopologically(new StreamingSourceTracker(jssc, p, ParDo.MultiOutput.class, 0));
  assertThat(StreamingSourceTracker.numAssertions, equalTo(1));
}
 
Example 2
Source File: BeamEnumerableConverter.java    From beam with Apache License 2.0
private static boolean containsUnboundedPCollection(Pipeline p) {
  class BoundednessVisitor extends PipelineVisitor.Defaults {
    IsBounded boundedness = IsBounded.BOUNDED;

    @Override
    public void visitValue(PValue value, Node producer) {
      if (value instanceof PCollection) {
        boundedness = boundedness.and(((PCollection) value).isBounded());
      }
    }
  }

  BoundednessVisitor visitor = new BoundednessVisitor();
  p.traverseTopologically(visitor);
  return visitor.boundedness == IsBounded.UNBOUNDED;
}
 
Example 3
Source File: DataflowRunner.java    From beam with Apache License 2.0
private boolean containsUnboundedPCollection(Pipeline p) {
  class BoundednessVisitor extends PipelineVisitor.Defaults {
    IsBounded boundedness = IsBounded.BOUNDED;

    @Override
    public void visitValue(PValue value, Node producer) {
      if (value instanceof PCollection) {
        boundedness = boundedness.and(((PCollection) value).isBounded());
      }
    }
  }

  BoundednessVisitor visitor = new BoundednessVisitor();
  p.traverseTopologically(visitor);
  return visitor.boundedness == IsBounded.UNBOUNDED;
}
 
Example 4
Source File: DataflowRunnerTest.java    From beam with Apache License 2.0
/** Tests that all reads are consumed by at least one {@link PTransform}. */
@Test
public void testUnconsumedReads() throws IOException {
  DataflowPipelineOptions dataflowOptions = buildPipelineOptions();
  RuntimeTestOptions options = dataflowOptions.as(RuntimeTestOptions.class);
  Pipeline p = buildDataflowPipeline(dataflowOptions);
  p.apply(TextIO.read().from(options.getInput()));
  DataflowRunner.fromOptions(dataflowOptions).replaceTransforms(p);
  final AtomicBoolean unconsumedSeenAsInput = new AtomicBoolean();
  p.traverseTopologically(
      new PipelineVisitor.Defaults() {
        @Override
        public void visitPrimitiveTransform(Node node) {
          unconsumedSeenAsInput.set(true);
        }
      });
  assertThat(unconsumedSeenAsInput.get(), is(true));
}
 
Example 5
Source File: DataflowPipelineTranslatorTest.java    From beam with Apache License 2.0
@Test
public void testDiskSizeGbConfig() throws IOException {
  final Integer diskSizeGb = 1234;

  DataflowPipelineOptions options = buildPipelineOptions();
  options.setDiskSizeGb(diskSizeGb);

  Pipeline p = buildPipeline(options);
  p.traverseTopologically(new RecordingPipelineVisitor());
  SdkComponents sdkComponents = createSdkComponents(options);
  RunnerApi.Pipeline pipelineProto = PipelineTranslation.toProto(p, sdkComponents, true);
  Job job =
      DataflowPipelineTranslator.fromOptions(options)
          .translate(
              p,
              pipelineProto,
              sdkComponents,
              DataflowRunner.fromOptions(options),
              Collections.emptyList())
          .getJob();

  assertEquals(1, job.getEnvironment().getWorkerPools().size());
  assertEquals(diskSizeGb, job.getEnvironment().getWorkerPools().get(0).getDiskSizeGb());
}
 
Example 6
Source File: UnconsumedReads.java    From beam with Apache License 2.0
public static void ensureAllReadsConsumed(Pipeline pipeline) {
  final Set<PCollection<?>> unconsumed = new HashSet<>();
  pipeline.traverseTopologically(
      new PipelineVisitor.Defaults() {
        @Override
        public void visitPrimitiveTransform(Node node) {
          unconsumed.removeAll(node.getInputs().values());
        }

        @Override
        public void visitValue(PValue value, Node producer) {
          String urn = PTransformTranslation.urnForTransformOrNull(producer.getTransform());
          if (PTransformTranslation.READ_TRANSFORM_URN.equals(urn)) {
            unconsumed.add((PCollection<?>) value);
          }
        }
      });
  int i = 0;
  for (PCollection<?> unconsumedPCollection : unconsumed) {
    consume(unconsumedPCollection, i);
    i++;
  }
}
 
Example 7
Source File: DataflowPipelineTranslatorTest.java    From beam with Apache License 2.0
@Test
public void testSubnetworkConfigMissing() throws IOException {
  DataflowPipelineOptions options = buildPipelineOptions();

  Pipeline p = buildPipeline(options);
  p.traverseTopologically(new RecordingPipelineVisitor());
  SdkComponents sdkComponents = createSdkComponents(options);
  RunnerApi.Pipeline pipelineProto = PipelineTranslation.toProto(p, sdkComponents, true);
  Job job =
      DataflowPipelineTranslator.fromOptions(options)
          .translate(
              p,
              pipelineProto,
              sdkComponents,
              DataflowRunner.fromOptions(options),
              Collections.emptyList())
          .getJob();

  assertEquals(1, job.getEnvironment().getWorkerPools().size());
  assertNull(job.getEnvironment().getWorkerPools().get(0).getSubnetwork());
}
 
Example 8
Source File: TrackStreamingSourcesTest.java    From beam with Apache License 2.0
@Test
public void testTrackFlattened() {
  options.setRunner(SparkRunner.class);
  JavaSparkContext jsc = SparkContextFactory.getSparkContext(options);
  JavaStreamingContext jssc =
      new JavaStreamingContext(
          jsc, new org.apache.spark.streaming.Duration(options.getBatchIntervalMillis()));

  Pipeline p = Pipeline.create(options);

  CreateStream<Integer> queueStream1 =
      CreateStream.of(VarIntCoder.of(), Duration.millis(options.getBatchIntervalMillis()))
          .emptyBatch();
  CreateStream<Integer> queueStream2 =
      CreateStream.of(VarIntCoder.of(), Duration.millis(options.getBatchIntervalMillis()))
          .emptyBatch();

  PCollection<Integer> pcol1 = p.apply(queueStream1);
  PCollection<Integer> pcol2 = p.apply(queueStream2);
  PCollection<Integer> flattened =
      PCollectionList.of(pcol1).and(pcol2).apply(Flatten.pCollections());
  flattened.apply(ParDo.of(new PassthroughFn<>()));

  p.traverseTopologically(new StreamingSourceTracker(jssc, p, ParDo.MultiOutput.class, 0, 1));
  assertThat(StreamingSourceTracker.numAssertions, equalTo(1));
}
 
Example 9
Source File: PipelineInit.java    From component-runtime with Apache License 2.0
public static void lazyStart(final JobStateAware.State jobState, final Supplier<DIPipeline> pipelineSupplier) {
    final AtomicBoolean pipelineStarted = jobState.getPipelineStarted();
    if (!pipelineStarted.get() && pipelineStarted.compareAndSet(false, true)) {
        final Pipeline pipeline = pipelineSupplier.get();
        final TransformCounter counter = new TransformCounter();
        pipeline.traverseTopologically(counter);
        if (counter.transforms.get() > 0) {
            final PipelineResult result = pipeline.run();
            new Thread("talend-component-kit-di-pipeline-awaiter") {

                @Override
                public void run() {
                    log.debug("Starting to watch beam pipeline");
                    try {
                        result.waitUntilFinish();
                    } finally {
                        final PipelineResult.State state = result.getState();
                        log.debug("Exited pipeline with state {}", state.name());
                        if (state.isTerminal()) {
                            log.info("Beam pipeline ended");
                        } else {
                            log.debug("Beam pipeline ended by interruption");
                        }
                        jobState.getPipelineDone().complete(true);
                    }
                }
            }.start();
        } else {
            jobState.getPipelineDone().complete(true);
            log.warn("A pipeline was created but not transform were found, is your job correctly configured?");
        }
    }
}
 
Example 10
Source File: NemoPipelineRunner.java    From nemo with Apache License 2.0
/**
 * Method to run the Pipeline.
 * @param pipeline the Pipeline to run.
 * @return The result of the pipeline.
 */
public NemoPipelineResult run(final Pipeline pipeline) {
  final DAGBuilder builder = new DAGBuilder<>();
  final NemoPipelineVisitor nemoPipelineVisitor = new NemoPipelineVisitor(builder, nemoPipelineOptions);
  pipeline.traverseTopologically(nemoPipelineVisitor);
  final DAG dag = builder.build();
  final NemoPipelineResult nemoPipelineResult = new NemoPipelineResult();
  JobLauncher.launchDAG(dag);
  return nemoPipelineResult;
}
 
Example 11
Source File: CacheTest.java    From beam with Apache License 2.0
/**
 * Test checks how the cache candidates map is populated by the runner when evaluating the
 * pipeline.
 */
@Test
public void cacheCandidatesUpdaterTest() {
  SparkPipelineOptions options = createOptions();
  Pipeline pipeline = Pipeline.create(options);
  PCollection<String> pCollection = pipeline.apply(Create.of("foo", "bar"));

  // First use of pCollection.
  pCollection.apply(Count.globally());
  // Second use of pCollection.
  PCollectionView<List<String>> view = pCollection.apply(View.asList());

  // Internally, View.asList() creates a PCollection that underlies the PCollectionView; that
  // PCollection should not be cached, because the SparkRunner accesses the PCollectionView
  // rather than the underlying PCollection.
  pipeline
      .apply(Create.of("foo", "baz"))
      .apply(
          ParDo.of(
                  new DoFn<String, String>() {
                    @ProcessElement
                    public void processElement(ProcessContext processContext) {
                      if (processContext.sideInput(view).contains(processContext.element())) {
                        processContext.output(processContext.element());
                      }
                    }
                  })
              .withSideInputs(view));

  JavaSparkContext jsc = SparkContextFactory.getSparkContext(options);
  EvaluationContext ctxt = new EvaluationContext(jsc, pipeline, options);
  SparkRunner.CacheVisitor cacheVisitor =
      new SparkRunner.CacheVisitor(new TransformTranslator.Translator(), ctxt);
  pipeline.traverseTopologically(cacheVisitor);
  assertEquals(2L, (long) ctxt.getCacheCandidates().get(pCollection));
  assertEquals(1L, ctxt.getCacheCandidates().values().stream().filter(l -> l > 1).count());
}
 
Example 12
Source File: NemoRunner.java    From incubator-nemo with Apache License 2.0
/**
 * Method to run the Pipeline.
 *
 * @param pipeline the Pipeline to run.
 * @return The result of the pipeline.
 */
public NemoPipelineResult run(final Pipeline pipeline) {
  final PipelineVisitor pipelineVisitor = new PipelineVisitor(pipeline, nemoPipelineOptions);
  pipeline.traverseTopologically(pipelineVisitor);
  final NemoPipelineResult nemoPipelineResult = new NemoPipelineResult();
  CompletableFuture.runAsync(() ->
    JobLauncher.launchDAG(pipelineVisitor.getConvertedPipeline(), nemoPipelineOptions.getJobName()))
    .thenRun(nemoPipelineResult::setJobDone);
  return nemoPipelineResult;
}
 
Example 13
Source File: DataflowPipelineTranslatorTest.java    From beam with Apache License 2.0
@Test
public void testScalingAlgorithmMissing() throws IOException {
  DataflowPipelineOptions options = buildPipelineOptions();

  Pipeline p = buildPipeline(options);
  p.traverseTopologically(new RecordingPipelineVisitor());
  SdkComponents sdkComponents = createSdkComponents(options);
  RunnerApi.Pipeline pipelineProto = PipelineTranslation.toProto(p, sdkComponents, true);
  Job job =
      DataflowPipelineTranslator.fromOptions(options)
          .translate(
              p,
              pipelineProto,
              sdkComponents,
              DataflowRunner.fromOptions(options),
              Collections.emptyList())
          .getJob();

  assertEquals(1, job.getEnvironment().getWorkerPools().size());
  // Autoscaling settings are always set.
  assertNull(
      job.getEnvironment().getWorkerPools().get(0).getAutoscalingSettings().getAlgorithm());
  assertEquals(
      0,
      job.getEnvironment()
          .getWorkerPools()
          .get(0)
          .getAutoscalingSettings()
          .getMaxNumWorkers()
          .intValue());
}
 
Example 14
Source File: SparkRunner.java    From beam with Apache License 2.0
/** Visit the pipeline to determine the translation mode (batch/streaming). */
private void detectTranslationMode(Pipeline pipeline) {
  TranslationModeDetector detector = new TranslationModeDetector();
  pipeline.traverseTopologically(detector);
  if (detector.getTranslationMode().equals(TranslationMode.STREAMING)) {
    // set streaming mode if it's a streaming pipeline
    this.mOptions.setStreaming(true);
  }
}
 
Example 15
Source File: DirectRunner.java    From beam with Apache License 2.0
@Override
public DirectPipelineResult run(Pipeline pipeline) {
  try {
    options =
        MAPPER
            .readValue(MAPPER.writeValueAsBytes(options), PipelineOptions.class)
            .as(DirectOptions.class);
  } catch (IOException e) {
    throw new IllegalArgumentException(
        "PipelineOptions specified failed to serialize to JSON.", e);
  }

  pipeline.replaceAll(defaultTransformOverrides());
  MetricsEnvironment.setMetricsSupported(true);
  try {
    DirectGraphVisitor graphVisitor = new DirectGraphVisitor();
    pipeline.traverseTopologically(graphVisitor);

    @SuppressWarnings("rawtypes")
    KeyedPValueTrackingVisitor keyedPValueVisitor = KeyedPValueTrackingVisitor.create();
    pipeline.traverseTopologically(keyedPValueVisitor);

    DisplayDataValidator.validatePipeline(pipeline);
    DisplayDataValidator.validateOptions(options);

    ExecutorService metricsPool =
        Executors.newCachedThreadPool(
            new ThreadFactoryBuilder()
                .setThreadFactory(MoreExecutors.platformThreadFactory())
                .setDaemon(false) // otherwise you say you want to leak, please don't!
                .setNameFormat("direct-metrics-counter-committer")
                .build());
    DirectGraph graph = graphVisitor.getGraph();
    EvaluationContext context =
        EvaluationContext.create(
            clockSupplier.get(),
            Enforcement.bundleFactoryFor(enabledEnforcements, graph),
            graph,
            keyedPValueVisitor.getKeyedPValues(),
            metricsPool);

    TransformEvaluatorRegistry registry =
        TransformEvaluatorRegistry.javaSdkNativeRegistry(context, options);
    PipelineExecutor executor =
        ExecutorServiceParallelExecutor.create(
            options.getTargetParallelism(),
            registry,
            Enforcement.defaultModelEnforcements(enabledEnforcements),
            context,
            metricsPool);
    executor.start(graph, RootProviderRegistry.javaNativeRegistry(context, options));

    DirectPipelineResult result = new DirectPipelineResult(executor, context);
    if (options.isBlockOnRun()) {
      try {
        result.waitUntilFinish();
      } catch (UserCodeException userException) {
        throw new PipelineExecutionException(userException.getCause());
      } catch (Throwable t) {
        if (t instanceof RuntimeException) {
          throw (RuntimeException) t;
        }
        throw new RuntimeException(t);
      }
    }
    return result;
  } finally {
    MetricsEnvironment.setMetricsSupported(false);
  }
}
 
Example 16
Source File: SparkRunner.java    From beam with Apache License 2.0
/** Evaluator that updates/populates the cache candidates. */
public static void updateCacheCandidates(
    Pipeline pipeline, SparkPipelineTranslator translator, EvaluationContext evaluationContext) {
  CacheVisitor cacheVisitor = new CacheVisitor(translator, evaluationContext);
  pipeline.traverseTopologically(cacheVisitor);
}
 
Example 17
Source File: PViewToIdMapper.java    From beam with Apache License 2.0
public static Map<PValue, String> buildIdMap(Pipeline pipeline) {
  final PViewToIdMapper mapper = new PViewToIdMapper();
  pipeline.traverseTopologically(mapper);
  return mapper.getIdMap();
}
 
Example 18
Source File: DisplayDataEvaluator.java    From beam with Apache License 2.0
private static Set<DisplayData> displayDataForPipeline(Pipeline pipeline, PTransform<?, ?> root) {
  PrimitiveDisplayDataPTransformVisitor visitor = new PrimitiveDisplayDataPTransformVisitor(root);
  pipeline.traverseTopologically(visitor);
  return visitor.getPrimitivesDisplayData();
}
 
Example 19
Source File: FlinkPipelineTranslator.java    From beam with Apache License 2.0
/**
 * Translates the pipeline by passing this class as a visitor.
 *
 * @param pipeline The pipeline to be translated
 */
public void translate(Pipeline pipeline) {
  pipeline.traverseTopologically(this);
}
 
Example 20
Source File: Twister2PipelineTranslator.java    From twister2 with Apache License 2.0
/**
 * Translates the pipeline by passing this class as a visitor.
 *
 * @param pipeline The pipeline to be translated
 */
public void translate(Pipeline pipeline) {
  pipeline.traverseTopologically(this);
}