org.apache.beam.runners.dataflow.DataflowRunner Java Examples

The following examples show how to use org.apache.beam.runners.dataflow.DataflowRunner. They are drawn from open-source projects; each example notes its source file, the project it comes from, and that project's license.
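As a quick orientation before the examples, here is a minimal sketch of the pattern most of them share: build DataflowPipelineOptions, set DataflowRunner as the runner class, then create and run a Pipeline. The project ID, region, and bucket below are placeholder values, not taken from any of the examples.

import org.apache.beam.runners.dataflow.DataflowRunner;
import org.apache.beam.runners.dataflow.options.DataflowPipelineOptions;
import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.transforms.Create;

public class MinimalDataflowRunnerUsage {
  public static void main(String[] args) {
    // Parse command-line flags and view them as Dataflow pipeline options.
    DataflowPipelineOptions options =
        PipelineOptionsFactory.fromArgs(args).withValidation().as(DataflowPipelineOptions.class);

    // Select the Dataflow service as the execution engine.
    options.setRunner(DataflowRunner.class);

    // Placeholder GCP settings; replace with your own project, region, and bucket.
    options.setProject("my-gcp-project");
    options.setRegion("us-central1");
    options.setTempLocation("gs://my-bucket/temp");

    // Build the pipeline (here a trivial one) and submit it to Dataflow.
    Pipeline pipeline = Pipeline.create(options);
    pipeline.apply(Create.of("hello", "dataflow"));
    pipeline.run();
  }
}

Note that run() submits the job and returns without waiting; call waitUntilFinish() on the returned result to block until the job completes, as several of the examples below do.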
Example #1
Source File: HopPipelineMetaToBeamPipelineConverter.java    From hop with Apache License 2.0
public static Class<? extends PipelineRunner<?>> getPipelineRunnerClass( RunnerType runnerType ) throws HopException {
  if ( runnerType == null ) {
    throw new HopException( "Please specify a valid runner type" );
  }
  switch ( runnerType ) {
    case Direct:
      return DirectRunner.class;
    case Flink:
      return FlinkRunner.class;
    case Spark:
      return SparkRunner.class;
    case DataFlow:
      return DataflowRunner.class;
    default:
      throw new HopException( "Unsupported runner type: " + runnerType.name() );
  }
}
 
Example #2
Source File: WorkerCustomSourcesTest.java    From beam with Apache License 2.0
static com.google.api.services.dataflow.model.Source translateIOToCloudSource(
    BoundedSource<?> io, DataflowPipelineOptions options) throws Exception {
  DataflowPipelineTranslator translator = DataflowPipelineTranslator.fromOptions(options);
  Pipeline p = Pipeline.create(options);
  p.begin().apply(Read.from(io));

  DataflowRunner runner = DataflowRunner.fromOptions(options);
  SdkComponents sdkComponents = SdkComponents.create();
  RunnerApi.Environment defaultEnvironmentForDataflow =
      Environments.createDockerEnvironment("dummy-image-url");
  sdkComponents.registerEnvironment(defaultEnvironmentForDataflow);
  RunnerApi.Pipeline pipelineProto = PipelineTranslation.toProto(p, sdkComponents, true);

  Job workflow =
      translator
          .translate(p, pipelineProto, sdkComponents, runner, new ArrayList<DataflowPackage>())
          .getJob();
  Step step = workflow.getSteps().get(0);

  return stepToCloudSource(step);
}
 
Example #3
Source File: StreamingDataflowWorker.java    From beam with Apache License 2.0
public static void main(String[] args) throws Exception {
  JvmInitializers.runOnStartup();

  DataflowWorkerHarnessHelper.initializeLogging(StreamingDataflowWorker.class);
  DataflowWorkerHarnessOptions options =
      DataflowWorkerHarnessHelper.initializeGlobalStateAndPipelineOptions(
          StreamingDataflowWorker.class);
  DataflowWorkerHarnessHelper.configureLogging(options);
  checkArgument(
      options.isStreaming(),
      "%s instantiated with options indicating batch use",
      StreamingDataflowWorker.class.getName());

  checkArgument(
      !DataflowRunner.hasExperiment(options, "beam_fn_api"),
      "%s cannot be main() class with beam_fn_api enabled",
      StreamingDataflowWorker.class.getSimpleName());

  // Create a non-FnAPI SDK harness registry
  SdkHarnessRegistry sdkHarnessRegistry = SdkHarnessRegistries.emptySdkHarnessRegistry();
  StreamingDataflowWorker worker =
      StreamingDataflowWorker.fromDataflowWorkerHarnessOptions(options, sdkHarnessRegistry);

  JvmInitializers.runBeforeProcessing(options);
  worker.startStatusPages();
  worker.start();
}
 
Example #4
Source File: BeamPipelineEngine.java    From hop with Apache License 2.0
private PipelineResult executePipeline( org.apache.beam.sdk.Pipeline pipeline ) throws HopException {

    RunnerType runnerType = beamEngineRunConfiguration.getRunnerType();
    switch ( runnerType ) {
      case Direct:
        return DirectRunner.fromOptions( pipeline.getOptions() ).run( pipeline );
      case Flink:
        return FlinkRunner.fromOptions( pipeline.getOptions() ).run( pipeline );
      case DataFlow:
        return DataflowRunner.fromOptions( pipeline.getOptions() ).run( pipeline );
      case Spark:
        return SparkRunner.fromOptions( pipeline.getOptions() ).run( pipeline );
      default:
        throw new HopException( "Execution on runner '" + runnerType.name() + "' is not supported yet." );
    }
  }
 
Example #5
Source File: DataflowJobManager.java    From feast with Apache License 2.0
private ImportOptions getPipelineOptions(
    String jobName, SourceProto.Source source, Set<StoreProto.Store> sinks, boolean update)
    throws IOException, IllegalAccessException {
  ImportOptions pipelineOptions =
      PipelineOptionsFactory.fromArgs(defaultOptions.toArgs()).as(ImportOptions.class);

  JsonFormat.Printer jsonPrinter = JsonFormat.printer();

  pipelineOptions.setSpecsStreamingUpdateConfigJson(
      jsonPrinter.print(specsStreamingUpdateConfig));
  pipelineOptions.setSourceJson(jsonPrinter.print(source));
  pipelineOptions.setStoresJson(
      sinks.stream().map(wrapException(jsonPrinter::print)).collect(Collectors.toList()));
  pipelineOptions.setProject(projectId);
  pipelineOptions.setDefaultFeastProject(Project.DEFAULT_NAME);
  pipelineOptions.setUpdate(update);
  pipelineOptions.setRunner(DataflowRunner.class);
  pipelineOptions.setJobName(jobName);
  pipelineOptions.setFilesToStage(
      detectClassPathResourcesToStage(DataflowRunner.class.getClassLoader()));
  if (metrics.isEnabled()) {
    pipelineOptions.setMetricsExporterType(metrics.getType());
    if (metrics.getType().equals("statsd")) {
      pipelineOptions.setStatsdHost(metrics.getHost());
      pipelineOptions.setStatsdPort(metrics.getPort());
    }
  }
  return pipelineOptions;
}
 
Example #6
Source File: PubsubWordCount.java    From cloud-bigtable-examples with Apache License 2.0
/**
 * <p>Creates a Dataflow pipeline that performs the following chain of operations:</p>
 * <ol>
 *   <li> Reads from a Cloud Pubsub topic
 *   <li> Windows the input into fixed windows of 1 minute
 *   <li> Applies word count transform
 *   <li> Creates Puts from each of the word counts in the array
 *   <li> Performs a Bigtable Put on the items
 * </ol>
 *
 * @param args Arguments to use to configure the Dataflow Pipeline.  The first three are required
 *   when running via managed resource in Google Cloud Platform.  Those options should be omitted
 *   for LOCAL runs.  The next three arguments configure the Bigtable connection. The last
 *   two items are for Cloud Pubsub.
 *        --runner=BlockingDataflowPipelineRunner
 *        --project=[dataflow project] \\
 *        --stagingLocation=gs://[your google storage bucket] \\
 *        --bigtableProjectId=[bigtable project] \\
 *        --bigtableInstanceId=[bigtable instance id] \\
 *        --bigtableTableId=[bigtable tableName]
 *        --inputFile=[file path on GCS]
 *        --pubsubTopic=projects/[project name]/topics/[topic name]
 */

public static void main(String[] args) throws Exception {
  // CloudBigtableOptions is one way to retrieve the options.  It's not required.
  BigtablePubsubOptions options =
      PipelineOptionsFactory.fromArgs(args).withValidation().as(BigtablePubsubOptions.class);

  // CloudBigtableTableConfiguration contains the project, instance and table to connect to.
  CloudBigtableTableConfiguration config =
      new CloudBigtableTableConfiguration.Builder()
      .withProjectId(options.getBigtableProjectId())
      .withInstanceId(options.getBigtableInstanceId())
      .withTableId(options.getBigtableTableId())
      .build();

  // The DataflowRunner is forced here so that the pipelines can be cancelled
  // automatically and the 2 jobs can run at the same time.
  options.setRunner(DataflowRunner.class);

  options.as(DataflowPipelineOptions.class).setStreaming(true);
  Pipeline p = Pipeline.create(options);

  FixedWindows window = FixedWindows.of(Duration.standardMinutes(options.getWindowSize()));

  p
      .apply(PubsubIO.readStrings().fromTopic(options.getPubsubTopic()))
      .apply(Window.<String> into(window))
      .apply(ParDo.of(new ExtractWordsFn()))
      .apply(Count.<String> perElement())
      .apply(ParDo.of(MUTATION_TRANSFORM))
      .apply(CloudBigtableIO.writeToTable(config));

  p.run().waitUntilFinish();
  // Start a second job to inject messages into a Cloud Pubsub topic
  injectMessages(options);
}
 
Example #7
Source File: Spec11Pipeline.java    From nomulus with Apache License 2.0
/** Deploys the spec11 pipeline as a template on GCS. */
public void deploy() {
  // We can't store options as a member variable due to serialization concerns.
  Spec11PipelineOptions options = PipelineOptionsFactory.as(Spec11PipelineOptions.class);
  options.setProject(projectId);
  options.setRunner(DataflowRunner.class);
  // This causes p.run() to stage the pipeline as a template on GCS, as opposed to running it.
  options.setTemplateLocation(spec11TemplateUrl);
  options.setStagingLocation(beamStagingUrl);
  // This credential is used when Dataflow deploys the template to GCS in the target GCP project.
  // So, make sure the credential has write permission to GCS in that project.
  options.setGcpCredential(googleCredentials);

  Pipeline p = Pipeline.create(options);
  PCollection<Subdomain> domains =
      p.apply(
          "Read active domains from BigQuery",
          BigQueryIO.read(Subdomain::parseFromRecord)
              .fromQuery(
                  SqlTemplate.create(getQueryFromFile(Spec11Pipeline.class, "subdomains.sql"))
                      .put("PROJECT_ID", projectId)
                      .put("DATASTORE_EXPORT_DATASET", "latest_datastore_export")
                      .put("REGISTRAR_TABLE", "Registrar")
                      .put("DOMAIN_BASE_TABLE", "DomainBase")
                      .build())
              .withCoder(SerializableCoder.of(Subdomain.class))
              .usingStandardSql()
              .withoutValidation()
              .withTemplateCompatibility());

  evaluateUrlHealth(
      domains,
      new EvaluateSafeBrowsingFn(options.getSafeBrowsingApiKey(), retrier),
      options.getDate());
  p.run();
}
 
Example #8
Source File: InvoicingPipeline.java    From nomulus with Apache License 2.0
/** Deploys the invoicing pipeline as a template on GCS, for a given projectID and GCS bucket. */
public void deploy() {
  // We can't store options as a member variable due to serialization concerns.
  InvoicingPipelineOptions options = PipelineOptionsFactory.as(InvoicingPipelineOptions.class);
  options.setProject(projectId);
  options.setRunner(DataflowRunner.class);
  // This causes p.run() to stage the pipeline as a template on GCS, as opposed to running it.
  options.setTemplateLocation(invoiceTemplateUrl);
  options.setStagingLocation(beamStagingUrl);
  // This credential is used when Dataflow deploys the template to GCS in the target GCP project.
  // So, make sure the credential has write permission to GCS in that project.
  options.setGcpCredential(googleCredentials);

  Pipeline p = Pipeline.create(options);

  PCollection<BillingEvent> billingEvents =
      p.apply(
          "Read BillingEvents from Bigquery",
          BigQueryIO.read(BillingEvent::parseFromRecord)
              .fromQuery(InvoicingUtils.makeQueryProvider(options.getYearMonth(), projectId))
              .withCoder(SerializableCoder.of(BillingEvent.class))
              .usingStandardSql()
              .withoutValidation()
              .withTemplateCompatibility());
  applyTerminalTransforms(billingEvents, options.getYearMonth());
  p.run();
}
 
Example #9
Source File: WorkerCustomSourcesTest.java    From beam with Apache License 2.0
@Before
public void setUp() throws Exception {
  options = PipelineOptionsFactory.as(DataflowPipelineOptions.class);
  options.setAppName("TestAppName");
  options.setProject("test-project");
  options.setRegion("some-region1");
  options.setTempLocation("gs://test/temp/location");
  options.setGcpCredential(new TestCredential());
  options.setRunner(DataflowRunner.class);
  options.setPathValidatorClass(NoopPathValidator.class);
}
 
Example #10
Source File: ServerStreamObserverFactory.java    From beam with Apache License 2.0
public static ServerStreamObserverFactory fromOptions(PipelineOptions options) {
  DataflowPipelineDebugOptions dataflowOptions = options.as(DataflowPipelineDebugOptions.class);
  if (DataflowRunner.hasExperiment(dataflowOptions, "beam_fn_api_buffered_stream")) {
    int bufferSize = Buffered.DEFAULT_BUFFER_SIZE;
    for (String experiment : dataflowOptions.getExperiments()) {
      if (experiment.startsWith("beam_fn_api_buffered_stream_buffer_size=")) {
        bufferSize =
            Integer.parseInt(
                experiment.substring("beam_fn_api_buffered_stream_buffer_size=".length()));
      }
    }
    return new Buffered(options.as(GcsOptions.class).getExecutorService(), bufferSize);
  }
  return new Direct();
}
 
Example #11
Source File: SourceOperationExecutorFactory.java    From beam with Apache License 2.0
public static SourceOperationExecutor create(
    PipelineOptions options,
    SourceOperationRequest request,
    CounterSet counters,
    DataflowExecutionContext<?> executionContext,
    String stageName)
    throws Exception {
  boolean beamFnApi =
      DataflowRunner.hasExperiment(options.as(DataflowPipelineDebugOptions.class), "beam_fn_api");

  Preconditions.checkNotNull(request, "SourceOperationRequest must be non-null");
  Preconditions.checkNotNull(executionContext, "executionContext must be non-null");

  // Disable splitting when fn api is enabled.
  // TODO: Fix this once source splitting is supported.
  if (beamFnApi) {
    return new NoOpSourceOperationExecutor(request);
  } else {
    DataflowOperationContext operationContext =
        executionContext.createOperationContext(
            NameContext.create(
                stageName,
                request.getOriginalName(),
                request.getSystemName(),
                request.getName()));

    return new WorkerCustomSourceOperationExecutor(
        options, request, counters, executionContext, operationContext);
  }
}
 
Example #12
Source File: TransMetaPipelineConverter.java    From kettle-beam with Apache License 2.0
public static Class<? extends PipelineRunner<?>> getPipelineRunnerClass( RunnerType runnerType ) throws KettleException {
  if (runnerType==null) {
    throw new KettleException( "Please specify a valid runner type");
  }
  switch(runnerType) {
    case Direct: return DirectRunner.class;
    case Flink: return FlinkRunner.class;
    case Spark: return SparkRunner.class;
    case DataFlow: return DataflowRunner.class;
    default:
      throw new KettleException( "Unsupported runner type: "+runnerType.name() );
  }
}
 
Example #13
Source File: DataflowViewTest.java    From beam with Apache License 2.0
private Pipeline createTestStreamingRunner() {
  DataflowPipelineOptions options = PipelineOptionsFactory.as(DataflowPipelineOptions.class);
  options.setRunner(DataflowRunner.class);
  options.setStreaming(true);
  options.setProject("someproject");
  options.setRegion("some-region1");
  options.setGcpTempLocation("gs://staging");
  options.setPathValidatorClass(NoopPathValidator.class);
  options.setDataflowClient(dataflow);
  return Pipeline.create(options);
}
 
Example #14
Source File: DataflowViewTest.java    From beam with Apache License 2.0
private Pipeline createTestBatchRunner() {
  DataflowPipelineOptions options = PipelineOptionsFactory.as(DataflowPipelineOptions.class);
  options.setRunner(DataflowRunner.class);
  options.setProject("someproject");
  options.setRegion("some-region1");
  options.setGcpTempLocation("gs://staging");
  options.setPathValidatorClass(NoopPathValidator.class);
  options.setDataflowClient(dataflow);
  return Pipeline.create(options);
}
 
Example #15
Source File: DataflowGroupByKeyTest.java    From beam with Apache License 2.0
/**
 * Create a test pipeline that uses the {@link DataflowRunner} so that {@link GroupByKey} is not
 * expanded. This is used for verifying that even without expansion the proper errors show up.
 */
private Pipeline createTestServiceRunner() {
  DataflowPipelineOptions options = PipelineOptionsFactory.as(DataflowPipelineOptions.class);
  options.setRunner(DataflowRunner.class);
  options.setProject("someproject");
  options.setRegion("some-region1");
  options.setGcpTempLocation("gs://staging");
  options.setPathValidatorClass(NoopPathValidator.class);
  options.setDataflowClient(dataflow);
  return Pipeline.create(options);
}
 
Example #16
Source File: KettleBeamPipelineExecutor.java    From kettle-beam with Apache License 2.0
private PipelineResult asyncExecutePipeline( Pipeline pipeline ) throws KettleException {

    RunnerType runnerType = RunnerType.getRunnerTypeByName( transMeta.environmentSubstitute( jobConfig.getRunnerTypeName() ) );
    if (runnerType==null) {
      throw new KettleException( "Runner type '"+jobConfig.getRunnerTypeName()+"' is not recognized");
    }
    switch ( runnerType ) {
      case Direct: return DirectRunner.fromOptions( pipeline.getOptions() ).run( pipeline );
      case Flink: return FlinkRunner.fromOptions(pipeline.getOptions()).run( pipeline );
      case DataFlow: return DataflowRunner.fromOptions( pipeline.getOptions() ).run( pipeline );
      case Spark: return SparkRunner.fromOptions( pipeline.getOptions() ).run( pipeline );
      default:
        throw new KettleException( "Execution on runner '" + runnerType.name() + "' is not supported yet, sorry." );
    }
  }
 
Example #17
Source File: IntrinsicMapTaskExecutorFactory.java    From beam with Apache License 2.0
/**
 * Creates a new {@link DataflowMapTaskExecutor} from the given {@link MapTask} definition using
 * the provided {@link ReaderFactory}.
 */
@Override
public DataflowMapTaskExecutor create(
    InstructionRequestHandler instructionRequestHandler,
    GrpcFnServer<GrpcDataService> grpcDataFnServer,
    Endpoints.ApiServiceDescriptor dataApiServiceDescriptor,
    GrpcFnServer<GrpcStateService> grpcStateFnServer,
    MutableNetwork<Node, Edge> network,
    PipelineOptions options,
    String stageName,
    ReaderFactory readerFactory,
    SinkFactory sinkFactory,
    DataflowExecutionContext<?> executionContext,
    CounterSet counterSet,
    IdGenerator idGenerator) {

  // TODO: remove this once we trust the code paths
  checkArgument(
      !DataflowRunner.hasExperiment(
          options.as(DataflowPipelineDebugOptions.class), "beam_fn_api"),
      "experiment beam_fn_api turned on but non-Fn API MapTaskExecutorFactory invoked");

  // Swap out all the InstructionOutput nodes with OutputReceiver nodes
  Networks.replaceDirectedNetworkNodes(
      network, createOutputReceiversTransform(stageName, counterSet));

  // Swap out all the ParallelInstruction nodes with Operation nodes
  Networks.replaceDirectedNetworkNodes(
      network,
      createOperationTransformForParallelInstructionNodes(
          stageName, network, options, readerFactory, sinkFactory, executionContext));

  // Collect all the operations within the network and attach all the operations as receivers
  // to preceding output receivers.
  List<Operation> topoSortedOperations = new ArrayList<>();
  for (OperationNode node :
      Iterables.filter(Networks.topologicalOrder(network), OperationNode.class)) {
    topoSortedOperations.add(node.getOperation());

    for (Node predecessor :
        Iterables.filter(network.predecessors(node), OutputReceiverNode.class)) {
      ((OutputReceiverNode) predecessor)
          .getOutputReceiver()
          .addOutput((Receiver) node.getOperation());
    }
  }

  if (LOG.isDebugEnabled()) {
    LOG.info("Map task network: {}", Networks.toDot(network));
  }

  return IntrinsicMapTaskExecutor.withSharedCounterSet(
      topoSortedOperations, counterSet, executionContext.getExecutionStateTracker());
}
 
Example #18
Source File: UserParDoFnFactory.java    From beam with Apache License 2.0
@Override
public ParDoFn create(
    PipelineOptions options,
    CloudObject cloudUserFn,
    @Nullable List<SideInputInfo> sideInputInfos,
    TupleTag<?> mainOutputTag,
    Map<TupleTag<?>, Integer> outputTupleTagsToReceiverIndices,
    DataflowExecutionContext<?> executionContext,
    DataflowOperationContext operationContext)
    throws Exception {

  DoFnInstanceManager instanceManager =
      fnCache.get(
          operationContext.nameContext().systemName(),
          () -> DoFnInstanceManagers.cloningPool(doFnExtractor.getDoFnInfo(cloudUserFn)));

  DoFnInfo<?, ?> doFnInfo = instanceManager.peek();

  DataflowExecutionContext.DataflowStepContext stepContext =
      executionContext.getStepContext(operationContext);

  Iterable<PCollectionView<?>> sideInputViews = doFnInfo.getSideInputViews();
  SideInputReader sideInputReader =
      executionContext.getSideInputReader(sideInputInfos, sideInputViews, operationContext);

  if (doFnInfo.getDoFn() instanceof BatchStatefulParDoOverrides.BatchStatefulDoFn) {
    // HACK: BatchStatefulDoFn is a class from DataflowRunner's overrides
    // that just instructs the worker to execute it differently. This will
    // be replaced by metadata in the Runner API payload
    BatchStatefulParDoOverrides.BatchStatefulDoFn fn =
        (BatchStatefulParDoOverrides.BatchStatefulDoFn) doFnInfo.getDoFn();
    DoFn underlyingFn = fn.getUnderlyingDoFn();

    return new BatchModeUngroupingParDoFn(
        (BatchModeExecutionContext.StepContext) stepContext,
        new SimpleParDoFn(
            options,
            DoFnInstanceManagers.singleInstance(doFnInfo.withFn(underlyingFn)),
            sideInputReader,
            doFnInfo.getMainOutput(),
            outputTupleTagsToReceiverIndices,
            stepContext,
            operationContext,
            doFnInfo.getDoFnSchemaInformation(),
            doFnInfo.getSideInputMapping(),
            runnerFactory));

  } else if (doFnInfo.getDoFn() instanceof StreamingPCollectionViewWriterFn) {
    // HACK: StreamingPCollectionViewWriterFn is a class from
    // DataflowPipelineTranslator. Using the class as an indicator is a migration path
    // to simply having an indicator string.

    checkArgument(
        stepContext instanceof StreamingModeExecutionContext.StreamingModeStepContext,
        "stepContext must be a StreamingModeStepContext to use StreamingPCollectionViewWriterFn");
    DataflowRunner.StreamingPCollectionViewWriterFn<Object> writerFn =
        (StreamingPCollectionViewWriterFn<Object>) doFnInfo.getDoFn();
    return new StreamingPCollectionViewWriterParDoFn(
        (StreamingModeExecutionContext.StreamingModeStepContext) stepContext,
        writerFn.getView().getTagInternal(),
        writerFn.getDataCoder(),
        (Coder<BoundedWindow>) doFnInfo.getWindowingStrategy().getWindowFn().windowCoder());
  } else {
    return new SimpleParDoFn(
        options,
        instanceManager,
        sideInputReader,
        doFnInfo.getMainOutput(),
        outputTupleTagsToReceiverIndices,
        stepContext,
        operationContext,
        doFnInfo.getDoFnSchemaInformation(),
        doFnInfo.getSideInputMapping(),
        runnerFactory);
  }
}
 
Example #19
Source File: GroupingShuffleReaderFactory.java    From beam with Apache License 2.0
/** Returns true if we should inject errors in the shuffle read bytes counter for testing. */
private static boolean shouldUseGroupingShuffleReaderWithFaultyBytesReadCounter(
    PipelineOptions options) {
  return DataflowRunner.hasExperiment(
      options.as(DataflowPipelineDebugOptions.class), "inject_shuffle_read_count_error");
}
 
Example #20
Source File: BatchDataflowWorker.java    From beam with Apache License 2.0
protected BatchDataflowWorker(
    @Nullable RunnerApi.Pipeline pipeline,
    SdkHarnessRegistry sdkHarnessRegistry,
    WorkUnitClient workUnitClient,
    DataflowMapTaskExecutorFactory mapTaskExecutorFactory,
    DataflowWorkerHarnessOptions options) {
  this.mapTaskExecutorFactory = mapTaskExecutorFactory;
  this.sdkHarnessRegistry = sdkHarnessRegistry;
  this.workUnitClient = workUnitClient;
  this.options = options;

  this.sideInputDataCache =
      CacheBuilder.newBuilder()
          .maximumWeight(options.getWorkerCacheMb() * MEGABYTES) // weights are in bytes
          .weigher(Weighers.fixedWeightKeys(OVERHEAD_WEIGHT))
          .softValues()
          .concurrencyLevel(CACHE_CONCURRENCY_LEVEL)
          .build();

  this.sideInputWeakReferenceCache =
      CacheBuilder.newBuilder()
          .maximumSize(MAX_LOGICAL_REFERENCES)
          .weakValues()
          .concurrencyLevel(CACHE_CONCURRENCY_LEVEL)
          .build();

  this.memoryMonitor = MemoryMonitor.fromOptions(options);
  this.statusPages =
      WorkerStatusPages.create(
          DEFAULT_STATUS_PORT, this.memoryMonitor, sdkHarnessRegistry::sdkHarnessesAreHealthy);

  if (!DataflowRunner.hasExperiment(options, "disable_debug_capture")) {
    this.debugCaptureManager =
        initializeAndStartDebugCaptureManager(options, statusPages.getDebugCapturePages());
  }

  // TODO: this conditional -> two implementations of common interface, or
  // param/injection
  if (DataflowRunner.hasExperiment(options, "beam_fn_api")) {
    Function<MutableNetwork<Node, Edge>, MutableNetwork<Node, Edge>> transformToRunnerNetwork;
    Function<MutableNetwork<Node, Edge>, Node> sdkFusedStage;
    Function<MutableNetwork<Node, Edge>, MutableNetwork<Node, Edge>> lengthPrefixUnknownCoders =
        LengthPrefixUnknownCoders::forSdkNetwork;
    if (DataflowRunner.hasExperiment(options, "use_executable_stage_bundle_execution")) {
      sdkFusedStage = new CreateExecutableStageNodeFunction(pipeline, idGenerator);
      transformToRunnerNetwork =
          new CreateRegisterFnOperationFunction(
              idGenerator,
              this::createPortNode,
              lengthPrefixUnknownCoders.andThen(sdkFusedStage),
              true);
    } else {
      sdkFusedStage =
          pipeline == null
              ? RegisterNodeFunction.withoutPipeline(
                  idGenerator,
                  sdkHarnessRegistry.beamFnStateApiServiceDescriptor(),
                  sdkHarnessRegistry.beamFnDataApiServiceDescriptor())
              : RegisterNodeFunction.forPipeline(
                  pipeline,
                  idGenerator,
                  sdkHarnessRegistry.beamFnStateApiServiceDescriptor(),
                  sdkHarnessRegistry.beamFnDataApiServiceDescriptor());
      transformToRunnerNetwork =
          new CreateRegisterFnOperationFunction(
              idGenerator,
              this::createPortNode,
              lengthPrefixUnknownCoders.andThen(sdkFusedStage),
              false);
    }
    mapTaskToNetwork =
        mapTaskToBaseNetwork
            .andThen(new ReplacePgbkWithPrecombineFunction())
            .andThen(new DeduceNodeLocationsFunction())
            .andThen(new DeduceFlattenLocationsFunction())
            .andThen(new CloneAmbiguousFlattensFunction())
            .andThen(transformToRunnerNetwork)
            .andThen(LengthPrefixUnknownCoders::andReplaceForRunnerNetwork);
  } else {
    mapTaskToNetwork = mapTaskToBaseNetwork;
  }

  this.memoryMonitorThread = startMemoryMonitorThread(memoryMonitor);

  ExecutionStateSampler.instance().start();
}
 
Example #21
Source File: WorkerCustomSourcesSplitOnlySourceTest.java    From beam with Apache License 2.0
@Test
public void testAllSplitsAreReturned() throws Exception {
  final long apiSizeLimitForTest = 500 * 1024;
  DataflowPipelineOptions options = PipelineOptionsFactory.as(DataflowPipelineOptions.class);
  options.setAppName("TestAppName");
  options.setProject("test-project");
  options.setRegion("some-region1");
  options.setTempLocation("gs://test/temp/location");
  options.setGcpCredential(new TestCredential());
  options.setRunner(DataflowRunner.class);
  options.setPathValidatorClass(NoopPathValidator.class);
  // Generate a CountingSource and split it into the desired number of splits
  // (desired size = 1 byte), triggering the re-split with a larger bundle size.
  // Thus below we expect to produce 'numberOfSplits' splits.
  com.google.api.services.dataflow.model.Source source =
      WorkerCustomSourcesTest.translateIOToCloudSource(
          CountingSource.upTo(numberOfSplits), options);
  SourceSplitResponse split =
      WorkerCustomSourcesTest.performSplit(
          source, options, 1L, null /* numBundles limit */, apiSizeLimitForTest);
  assertThat(
      split.getBundles().size(),
      lessThanOrEqualTo(WorkerCustomSources.DEFAULT_NUM_BUNDLES_LIMIT));

  List<OffsetBasedSource<?>> originalSplits = new ArrayList<>(numberOfSplits);
  // Collect all the splits
  for (DerivedSource derivedSource : split.getBundles()) {
    Object deserializedSource =
        WorkerCustomSources.deserializeFromCloudSource(derivedSource.getSource().getSpec());
    if (deserializedSource instanceof SplittableOnlyBoundedSource) {
      SplittableOnlyBoundedSource<?> splittableOnlySource =
          (SplittableOnlyBoundedSource<?>) deserializedSource;
      originalSplits.addAll((List) splittableOnlySource.split(1L, options));
    } else {
      originalSplits.add((OffsetBasedSource<?>) deserializedSource);
    }
  }

  assertEquals(numberOfSplits, originalSplits.size());
  for (int i = 0; i < originalSplits.size(); i++) {
    OffsetBasedSource<?> offsetBasedSource = (OffsetBasedSource<?>) originalSplits.get(i);
    assertEquals(i, offsetBasedSource.getStartOffset());
    assertEquals(i + 1, offsetBasedSource.getEndOffset());
  }
}
 
Example #22
Source File: PubSubToBQPipeline.java    From pubsub-to-bigquery with Apache License 2.0
public static void main(String[] args) throws GeneralSecurityException, IOException, ParseException, ParserConfigurationException, SAXException {
	String params = null;
	for (int i = 0; i < args.length; i++) {
		if (args[i].startsWith("--params="))
			params = args[i].replaceFirst("--params=", "");
	}

	System.out.println(params);
	init(params);

	GoogleCredentials credentials = ServiceAccountCredentials.fromStream(new FileInputStream(keyFile))
	        .createScoped(Arrays.asList(new String[] { "https://www.googleapis.com/auth/cloud-platform" }));

	DataflowPipelineOptions options = PipelineOptionsFactory.create().as(DataflowPipelineOptions.class);
	
	options.setRunner(DataflowRunner.class);
	// Your project ID is required in order to run your pipeline on the Google Cloud.
	options.setProject(projectId);
	// Your Google Cloud Storage path is required for staging local files.
	options.setStagingLocation(workingBucket);
	options.setTempLocation(workingBucket + "/temp");
	options.setGcpCredential(credentials);
	options.setServiceAccount(accountEmail);
	options.setMaxNumWorkers(maxNumWorkers);
	options.setDiskSizeGb(diskSizeGb);
	options.setWorkerMachineType(machineType);
	options.setAutoscalingAlgorithm(AutoscalingAlgorithmType.THROUGHPUT_BASED);
	options.setZone(zone);
	options.setStreaming(isStreaming);
	options.setJobName(pipelineName);
	Pipeline pipeline = Pipeline.create(options);
	
	Gson gson = new Gson();
	TableSchema schema = gson.fromJson(schemaStr, TableSchema.class);
	
	PCollection<String> streamData = null;
	if(pubSubTopicSub != null && !StringUtils.isEmpty(pubSubTopicSub)){
		streamData = pipeline.apply("ReadPubSub",PubsubIO.readStrings().fromSubscription(String.format("projects/%1$s/subscriptions/%2$s",projectId,pubSubTopicSub)));
	}
	else if(pubSubTopic != null && !StringUtils.isEmpty(pubSubTopic)){
		streamData = pipeline.apply("ReadPubSub",PubsubIO.readStrings().fromTopic(String.format("projects/%1$s/topics/%2$s",projectId,pubSubTopic)));
	}
	else {
		// Fail fast instead of hitting a NullPointerException below when neither a subscription nor a topic is configured.
		throw new IllegalArgumentException("Either a Pub/Sub subscription or topic must be configured.");
	}
	
	PCollection<TableRow> tableRow = streamData.apply("ToTableRow",ParDo.of(new PrepData.ToTableRow(owTimestamp, debugMode)));
	
	
	tableRow.apply("WriteToBQ",
			BigQueryIO.writeTableRows()
			.to(String.format("%1$s.%2$s",bqDataSet, bqTable))
			.withSchema(schema)
			.withWriteDisposition(BigQueryIO.Write.WriteDisposition.WRITE_APPEND));

	System.out.println("Starting pipeline " + pipelineName);
	pipeline.run();
}