Java Code Examples for org.apache.beam.sdk.Pipeline#replaceAll()

The following examples show how to use org.apache.beam.sdk.Pipeline#replaceAll(). All of them come from the Apache Beam project, where each runner calls replaceAll() with a list of PTransformOverrides to swap matching composite transforms for runner-specific implementations before translating the pipeline.
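Before diving in, here is a minimal, self-contained sketch of the pattern every example below shares. It assumes only the Beam Java SDK; the getDefaultOverrides() stub is hypothetical and stands in for a runner's real override list (for instance, SamzaTransformOverrides.getDefaultOverrides() in the Samza examples):

import java.util.Collections;
import java.util.List;
import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.options.PipelineOptions;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.runners.PTransformOverride;
import org.apache.beam.sdk.transforms.Create;
import org.apache.beam.sdk.transforms.Sum;

public class ReplaceAllSketch {

  // Hypothetical stand-in: a real runner returns PTransformOverrides, each
  // pairing a matcher (which composite transforms to replace) with a factory
  // (what to replace them with).
  static List<PTransformOverride> getDefaultOverrides() {
    return Collections.emptyList();
  }

  public static void main(String[] args) {
    PipelineOptions options = PipelineOptionsFactory.fromArgs(args).create();
    Pipeline pipeline = Pipeline.create(options);
    pipeline.apply(Create.of(1, 2, 3)).apply(Sum.integersGlobally());

    // replaceAll() rewrites the pipeline in place, applying the overrides in
    // the order given; runners call it before translating the pipeline.
    pipeline.replaceAll(getDefaultOverrides());
  }
}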
Example 1
Source File: ConfigGeneratorTest.java    From beam with Apache License 2.0 6 votes
@Test
public void testSamzaLocalExecutionEnvironmentConfig() {
  SamzaPipelineOptions options = PipelineOptionsFactory.create().as(SamzaPipelineOptions.class);
  options.setJobName("TestEnvConfig");
  options.setRunner(SamzaRunner.class);
  options.setSamzaExecutionEnvironment(SamzaExecutionEnvironment.LOCAL);

  Pipeline pipeline = Pipeline.create(options);
  pipeline.apply(Create.of(1, 2, 3)).apply(Sum.integersGlobally());

  pipeline.replaceAll(SamzaTransformOverrides.getDefaultOverrides());

  final Map<PValue, String> idMap = PViewToIdMapper.buildIdMap(pipeline);
  final ConfigBuilder configBuilder = new ConfigBuilder(options);
  SamzaPipelineTranslator.createConfig(pipeline, options, idMap, configBuilder);
  final Config config = configBuilder.build();

  assertTrue(
      Maps.difference(config, ConfigBuilder.localRunConfig()).entriesOnlyOnRight().isEmpty());
}
 
Example 2
Source File: Twister2Runner.java    From beam with Apache License 2.0 5 votes
@Override
public PipelineResult run(Pipeline pipeline) {
  // create a worker and pass in the pipeline and then do the translation
  Twister2PipelineExecutionEnvironment env = new Twister2PipelineExecutionEnvironment(options);
  LOG.info("Translating pipeline to Twister2 program.");
  pipeline.replaceAll(getDefaultOverrides());
  env.translate(pipeline);
  setupSystem(options);

  Config config = ResourceAllocator.loadConfig(new HashMap<>());

  JobConfig jobConfig = new JobConfig();
  jobConfig.put(SIDEINPUTS, extractNames(env.getSideInputs()));
  jobConfig.put(LEAVES, extractNames(env.getLeaves()));
  jobConfig.put(GRAPH, env.getTSetGraph());

  int workers = options.getParallelism();
  Twister2Job twister2Job =
      Twister2Job.newBuilder()
          .setJobName(options.getJobName())
          .setWorkerClass(BeamBatchWorker.class)
          .addComputeResource(options.getWorkerCPUs(), options.getRamMegaBytes(), workers)
          .setConfig(jobConfig)
          .build();
  Twister2JobState jobState = Twister2Submitter.submitJob(twister2Job, config);

  Twister2PipelineResult result = new Twister2PipelineResult();
  // TODO: Need to fix the check for "RUNNING" once the fix for this is done on the Twister2 end.
  if (jobState.getJobstate() == DriverJobState.FAILED
      || jobState.getJobstate() == DriverJobState.RUNNING) {
    throw new RuntimeException("Pipeline execution failed", jobState.getCause());
  } else {
    result.setState(PipelineResult.State.DONE);
  }
  return result;
}
 
Example 3
Source File: Twister2Runner.java    From beam with Apache License 2.0 5 votes
public PipelineResult runTest(Pipeline pipeline) {
  // create a worker and pass in the pipeline and then do the translation
  Twister2PipelineExecutionEnvironment env = new Twister2PipelineExecutionEnvironment(options);
  LOG.info("Translating pipeline to Twister2 program.");
  pipeline.replaceAll(getDefaultOverrides());
  env.translate(pipeline);
  setupSystemTest(options);
  Map<String, Object> configMap = new HashMap<>();
  configMap.put(SIDEINPUTS, extractNames(env.getSideInputs()));
  configMap.put(LEAVES, extractNames(env.getLeaves()));
  configMap.put(GRAPH, env.getTSetGraph());
  configMap.put("twister2.network.buffer.size", 32000);
  configMap.put("twister2.network.sendBuffer.count", 1);
  Config config = ResourceAllocator.loadConfig(configMap);

  JobConfig jobConfig = new JobConfig();

  int workers = options.getParallelism();
  Twister2Job twister2Job =
      Twister2Job.newBuilder()
          .setJobName(options.getJobName())
          .setWorkerClass(BeamBatchWorker.class)
          .addComputeResource(options.getWorkerCPUs(), options.getRamMegaBytes(), workers)
          .setConfig(jobConfig)
          .build();
  Twister2JobState jobState = LocalSubmitter.submitJob(twister2Job, config);

  Twister2PipelineResult result = new Twister2PipelineResult();
  // TODO: Need to fix the check for "RUNNING" once the fix for this is done on the Twister2 end.
  if (jobState.getJobstate() == DriverJobState.FAILED
      || jobState.getJobstate() == DriverJobState.RUNNING) {
    throw new RuntimeException("Pipeline execution failed", jobState.getCause());
  } else {
    result.setState(PipelineResult.State.DONE);
  }
  return result;
}
 
Example 4
Source File: FlinkPipelineExecutionEnvironment.java    From beam with Apache License 2.0 5 votes
/**
 * Depending on whether the job is a streaming or a batch one, this method creates the
 * appropriate execution environment and pipeline translator, and translates the {@link
 * org.apache.beam.sdk.values.PCollection} program into a {@link
 * org.apache.flink.api.java.DataSet} or a {@link
 * org.apache.flink.streaming.api.datastream.DataStream} program.
 */
public void translate(Pipeline pipeline) {
  this.flinkBatchEnv = null;
  this.flinkStreamEnv = null;

  final boolean hasUnboundedOutput =
      PipelineTranslationModeOptimizer.hasUnboundedOutput(pipeline);
  if (hasUnboundedOutput) {
    LOG.info("Found unbounded PCollection. Switching to streaming execution.");
    options.setStreaming(true);
  }

  // Staged files need to be set before initializing the execution environments
  prepareFilesToStageForRemoteClusterExecution(options);

  FlinkPipelineTranslator translator;
  if (options.isStreaming()) {
    this.flinkStreamEnv =
        FlinkExecutionEnvironments.createStreamExecutionEnvironment(
            options, options.getFilesToStage());
    if (hasUnboundedOutput && !flinkStreamEnv.getCheckpointConfig().isCheckpointingEnabled()) {
      LOG.warn(
          "UnboundedSources present which rely on checkpointing, but checkpointing is disabled.");
    }
    translator = new FlinkStreamingPipelineTranslator(flinkStreamEnv, options);
  } else {
    this.flinkBatchEnv =
        FlinkExecutionEnvironments.createBatchExecutionEnvironment(
            options, options.getFilesToStage());
    translator = new FlinkBatchPipelineTranslator(flinkBatchEnv, options);
  }

  // Transform replacements need to receive the finalized PipelineOptions
  // including execution mode (batch/streaming) and parallelism.
  pipeline.replaceAll(FlinkTransformOverrides.getDefaultOverrides(options));

  translator.translate(pipeline);
}
 
Example 5
Source File: ConfigGeneratorTest.java    From beam with Apache License 2.0 5 votes
@Test
public void testBeamStoreConfig() {
  SamzaPipelineOptions options = PipelineOptionsFactory.create().as(SamzaPipelineOptions.class);
  options.setJobName("TestStoreConfig");
  options.setRunner(SamzaRunner.class);

  Pipeline pipeline = Pipeline.create(options);
  pipeline.apply(Create.of(1, 2, 3)).apply(Sum.integersGlobally());

  pipeline.replaceAll(SamzaTransformOverrides.getDefaultOverrides());

  final Map<PValue, String> idMap = PViewToIdMapper.buildIdMap(pipeline);
  final ConfigBuilder configBuilder = new ConfigBuilder(options);
  SamzaPipelineTranslator.createConfig(pipeline, options, idMap, configBuilder);
  final Config config = configBuilder.build();

  assertEquals(
      RocksDbKeyValueStorageEngineFactory.class.getName(),
      config.get("stores.beamStore.factory"));
  assertEquals("byteArraySerde", config.get("stores.beamStore.key.serde"));
  assertEquals("byteSerde", config.get("stores.beamStore.msg.serde"));
  assertNull(config.get("stores.beamStore.changelog"));

  options.setStateDurable(true);
  SamzaPipelineTranslator.createConfig(pipeline, options, idMap, configBuilder);
  final Config config2 = configBuilder.build();
  assertEquals(
      "TestStoreConfig-1-beamStore-changelog", config2.get("stores.beamStore.changelog"));
}
 
Example 6
Source File: ConfigGeneratorTest.java    From beam with Apache License 2.0 5 votes
@Test
public void testSamzaYarnExecutionEnvironmentConfig() {
  final String yarnPackagePath = "yarn.package.path";
  SamzaPipelineOptions options = PipelineOptionsFactory.create().as(SamzaPipelineOptions.class);
  options.setJobName("TestEnvConfig");
  options.setRunner(SamzaRunner.class);
  options.setSamzaExecutionEnvironment(SamzaExecutionEnvironment.YARN);
  options.setConfigOverride(
      ImmutableMap.<String, String>builder()
          .put(
              yarnPackagePath,
              "file://${basedir}/target/${project.artifactId}-${pom.version}-dist.tar.gz")
          .build());

  Pipeline pipeline = Pipeline.create(options);
  pipeline.apply(Create.of(1, 2, 3)).apply(Sum.integersGlobally());

  pipeline.replaceAll(SamzaTransformOverrides.getDefaultOverrides());

  final Map<PValue, String> idMap = PViewToIdMapper.buildIdMap(pipeline);
  final ConfigBuilder configBuilder = new ConfigBuilder(options);
  SamzaPipelineTranslator.createConfig(pipeline, options, idMap, configBuilder);
  try {
    Config config = configBuilder.build();
    assertEquals(config.get(APP_RUNNER_CLASS), RemoteApplicationRunner.class.getName());
    assertEquals(config.get(JOB_FACTORY_CLASS), YarnJobFactory.class.getName());
  } catch (IllegalArgumentException e) {
    throw new AssertionError(
        String.format(
            "Failed to validate correct configs for %s samza execution environment",
            SamzaExecutionEnvironment.YARN),
        e);
  }
}
 
Example 7
Source File: ConfigGeneratorTest.java    From beam with Apache License 2.0 5 votes
@Test
public void testSamzaStandAloneExecutionEnvironmentConfig() {
  SamzaPipelineOptions options = PipelineOptionsFactory.create().as(SamzaPipelineOptions.class);
  options.setJobName("TestEnvConfig");
  options.setRunner(SamzaRunner.class);
  options.setSamzaExecutionEnvironment(SamzaExecutionEnvironment.STANDALONE);
  options.setConfigOverride(
      ImmutableMap.<String, String>builder().put(ZkConfig.ZK_CONNECT, "localhost:2181").build());

  Pipeline pipeline = Pipeline.create(options);
  pipeline.apply(Create.of(1, 2, 3)).apply(Sum.integersGlobally());

  pipeline.replaceAll(SamzaTransformOverrides.getDefaultOverrides());

  final Map<PValue, String> idMap = PViewToIdMapper.buildIdMap(pipeline);
  final ConfigBuilder configBuilder = new ConfigBuilder(options);
  SamzaPipelineTranslator.createConfig(pipeline, options, idMap, configBuilder);
  try {
    Config config = configBuilder.build();
    assertEquals(config.get(APP_RUNNER_CLASS), LocalApplicationRunner.class.getName());
    assertEquals(
        config.get(JobCoordinatorConfig.JOB_COORDINATOR_FACTORY),
        ZkJobCoordinatorFactory.class.getName());
  } catch (IllegalArgumentException e) {
    throw new AssertionError(
        String.format(
            "Failed to validate correct configs for %s samza execution environment",
            SamzaExecutionEnvironment.STANDALONE),
        e);
  }
}
 
Example 8
Source File: DataflowRunner.java    From beam with Apache License 2.0 5 votes
@VisibleForTesting
protected void replaceTransforms(Pipeline pipeline) {
  boolean streaming = options.isStreaming() || containsUnboundedPCollection(pipeline);
  // Ensure all outputs of all reads are consumed before potentially replacing any
  // Read PTransforms
  UnconsumedReads.ensureAllReadsConsumed(pipeline);
  pipeline.replaceAll(getOverrides(streaming));
}
 
Example 9
Source File: JetRunner.java    From beam with Apache License 2.0 4 votes
private void normalize(Pipeline pipeline) {
  pipeline.replaceAll(getDefaultOverrides());
  UnconsumedReads.ensureAllReadsConsumed(pipeline);
}
 
Example 10
Source File: PipelineTranslator.java    From beam with Apache License 2.0 4 votes
public static void replaceTransforms(Pipeline pipeline, StreamingOptions options) {
  pipeline.replaceAll(SparkTransformOverrides.getDefaultOverrides(options.isStreaming()));
}
 
Example 11
Source File: SparkRunner.java    From beam with Apache License 2.0 4 votes
@Override
public SparkPipelineResult run(final Pipeline pipeline) {
  LOG.info("Executing pipeline using the SparkRunner.");

  final SparkPipelineResult result;
  final Future<?> startPipeline;

  final SparkPipelineTranslator translator;

  final ExecutorService executorService = Executors.newSingleThreadExecutor();

  MetricsEnvironment.setMetricsSupported(true);

  // visit the pipeline to determine the translation mode
  detectTranslationMode(pipeline);

  pipeline.replaceAll(SparkTransformOverrides.getDefaultOverrides(mOptions.isStreaming()));

  prepareFilesToStage(mOptions);

  if (mOptions.isStreaming()) {
    CheckpointDir checkpointDir = new CheckpointDir(mOptions.getCheckpointDir());
    SparkRunnerStreamingContextFactory streamingContextFactory =
        new SparkRunnerStreamingContextFactory(pipeline, mOptions, checkpointDir);
    final JavaStreamingContext jssc =
        JavaStreamingContext.getOrCreate(
            checkpointDir.getSparkCheckpointDir().toString(), streamingContextFactory);

    // Checkpoint aggregator/metrics values
    jssc.addStreamingListener(
        new JavaStreamingListenerWrapper(
            new AggregatorsAccumulator.AccumulatorCheckpointingSparkListener()));
    jssc.addStreamingListener(
        new JavaStreamingListenerWrapper(
            new MetricsAccumulator.AccumulatorCheckpointingSparkListener()));

    // register user-defined listeners.
    for (JavaStreamingListener listener : mOptions.as(SparkContextOptions.class).getListeners()) {
      LOG.info("Registered listener {}." + listener.getClass().getSimpleName());
      jssc.addStreamingListener(new JavaStreamingListenerWrapper(listener));
    }

    // register Watermarks listener to broadcast the advanced WMs.
    jssc.addStreamingListener(
        new JavaStreamingListenerWrapper(new WatermarkAdvancingStreamingListener()));

    // We call initAccumulators here even though it is also called in
    // SparkRunnerStreamingContextFactory, because the factory is not invoked when
    // resuming from a checkpoint. (When not resuming from a checkpoint,
    // initAccumulators is called twice, but that is fine since it is idempotent.)
    initAccumulators(mOptions, jssc.sparkContext());

    startPipeline =
        executorService.submit(
            () -> {
              LOG.info("Starting streaming pipeline execution.");
              jssc.start();
            });
    executorService.shutdown();

    result = new SparkPipelineResult.StreamingMode(startPipeline, jssc);
  } else {
    // create the evaluation context
    final JavaSparkContext jsc = SparkContextFactory.getSparkContext(mOptions);
    final EvaluationContext evaluationContext = new EvaluationContext(jsc, pipeline, mOptions);
    translator = new TransformTranslator.Translator();

    // update the cache candidates
    updateCacheCandidates(pipeline, translator, evaluationContext);

    initAccumulators(mOptions, jsc);
    startPipeline =
        executorService.submit(
            () -> {
              pipeline.traverseTopologically(new Evaluator(translator, evaluationContext));
              evaluationContext.computeOutputs();
              LOG.info("Batch pipeline execution complete.");
            });
    executorService.shutdown();

    result = new SparkPipelineResult.BatchMode(startPipeline, jsc);
  }

  if (mOptions.getEnableSparkMetricSinks()) {
    registerMetricsSource(mOptions.getAppName());
  }

  // It would have been better to create the MetricsPusher in runner-core, but we
  // need the runner-specific MetricsContainerStepMap.
  MetricsPusher metricsPusher =
      new MetricsPusher(
          MetricsAccumulator.getInstance().value(), mOptions.as(MetricsOptions.class), result);
  metricsPusher.start();
  return result;
}
 
Example 12
Source File: SamzaRunner.java    From beam with Apache License 2.0 4 votes
@Override
public SamzaPipelineResult run(Pipeline pipeline) {
  MetricsEnvironment.setMetricsSupported(true);

  if (LOG.isDebugEnabled()) {
    LOG.debug("Pre-processed Beam pipeline:\n{}", PipelineDotRenderer.toDotString(pipeline));
  }

  pipeline.replaceAll(SamzaTransformOverrides.getDefaultOverrides());

  final String dotGraph = PipelineDotRenderer.toDotString(pipeline);
  LOG.info("Beam pipeline DOT graph:\n{}", dotGraph);

  final Map<PValue, String> idMap = PViewToIdMapper.buildIdMap(pipeline);
  final ConfigBuilder configBuilder = new ConfigBuilder(options);

  SamzaPipelineTranslator.createConfig(pipeline, options, idMap, configBuilder);
  configBuilder.put(BEAM_DOT_GRAPH, dotGraph);

  final Config config = configBuilder.build();
  options.setConfigOverride(config);

  if (listener != null) {
    listener.onInit(config, options);
  }

  final SamzaExecutionContext executionContext = new SamzaExecutionContext(options);
  final Map<String, MetricsReporterFactory> reporterFactories = getMetricsReporters();

  final StreamApplication app =
      appDescriptor -> {
        appDescriptor.withApplicationContainerContextFactory(executionContext.new Factory());
        appDescriptor.withMetricsReporterFactories(reporterFactories);

        SamzaPipelineTranslator.translate(
            pipeline, new TranslationContext(appDescriptor, idMap, options));
      };

  // perform a final round of validation for the pipeline options now that all configs are
  // generated
  SamzaPipelineOptionsValidator.validate(options);
  ApplicationRunner runner = runSamzaApp(app, config);
  return new SamzaPipelineResult(app, runner, executionContext, listener, config);
}
 
Example 13
Source File: DirectRunner.java    From beam with Apache License 2.0 4 votes
@Override
public DirectPipelineResult run(Pipeline pipeline) {
  try {
    options =
        MAPPER
            .readValue(MAPPER.writeValueAsBytes(options), PipelineOptions.class)
            .as(DirectOptions.class);
  } catch (IOException e) {
    throw new IllegalArgumentException(
        "PipelineOptions specified failed to serialize to JSON.", e);
  }

  pipeline.replaceAll(defaultTransformOverrides());
  MetricsEnvironment.setMetricsSupported(true);
  try {
    DirectGraphVisitor graphVisitor = new DirectGraphVisitor();
    pipeline.traverseTopologically(graphVisitor);

    @SuppressWarnings("rawtypes")
    KeyedPValueTrackingVisitor keyedPValueVisitor = KeyedPValueTrackingVisitor.create();
    pipeline.traverseTopologically(keyedPValueVisitor);

    DisplayDataValidator.validatePipeline(pipeline);
    DisplayDataValidator.validateOptions(options);

    ExecutorService metricsPool =
        Executors.newCachedThreadPool(
            new ThreadFactoryBuilder()
                .setThreadFactory(MoreExecutors.platformThreadFactory())
                .setDaemon(false) // otherwise you say you want to leak, please don't!
                .setNameFormat("direct-metrics-counter-committer")
                .build());
    DirectGraph graph = graphVisitor.getGraph();
    EvaluationContext context =
        EvaluationContext.create(
            clockSupplier.get(),
            Enforcement.bundleFactoryFor(enabledEnforcements, graph),
            graph,
            keyedPValueVisitor.getKeyedPValues(),
            metricsPool);

    TransformEvaluatorRegistry registry =
        TransformEvaluatorRegistry.javaSdkNativeRegistry(context, options);
    PipelineExecutor executor =
        ExecutorServiceParallelExecutor.create(
            options.getTargetParallelism(),
            registry,
            Enforcement.defaultModelEnforcements(enabledEnforcements),
            context,
            metricsPool);
    executor.start(graph, RootProviderRegistry.javaNativeRegistry(context, options));

    DirectPipelineResult result = new DirectPipelineResult(executor, context);
    if (options.isBlockOnRun()) {
      try {
        result.waitUntilFinish();
      } catch (UserCodeException userException) {
        throw new PipelineExecutionException(userException.getCause());
      } catch (Throwable t) {
        if (t instanceof RuntimeException) {
          throw (RuntimeException) t;
        }
        throw new RuntimeException(t);
      }
    }
    return result;
  } finally {
    MetricsEnvironment.setMetricsSupported(false);
  }
}
 
Example 14
Source File: DirectGraphs.java    From beam with Apache License 2.0 4 votes
public static void performDirectOverrides(Pipeline p) {
  p.replaceAll(
      DirectRunner.fromOptions(PipelineOptionsFactory.create().as(DirectOptions.class))
          .defaultTransformOverrides());
}