Java Code Examples for org.apache.beam.sdk.io.FileSystems#setDefaultPipelineOptions()

The following examples show how to use org.apache.beam.sdk.io.FileSystems#setDefaultPipelineOptions(). Each example names the project and source file it was taken from.
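The method registers Beam's standard file systems (gs://, s3://, hdfs://, and so on) with the supplied options, which is why the examples below call it before any paths with those schemes are resolved. A minimal sketch of that pattern, assuming default options and a hypothetical gs:// glob used purely for illustration:

import java.io.IOException;
import org.apache.beam.sdk.io.FileSystems;
import org.apache.beam.sdk.io.fs.MatchResult;
import org.apache.beam.sdk.options.PipelineOptions;
import org.apache.beam.sdk.options.PipelineOptionsFactory;

public class FileSystemsSetup {
  public static void main(String[] args) throws IOException {
    // Build options from command-line args (falls back to defaults when none are given).
    PipelineOptions options = PipelineOptionsFactory.fromArgs(args).withValidation().create();

    // Register the standard file systems so that scheme-based paths resolve,
    // both here and in any file access that happens before the pipeline runs.
    FileSystems.setDefaultPipelineOptions(options);

    // Hypothetical path, used only to show that matching now understands the gs:// scheme.
    MatchResult result = FileSystems.match("gs://example-bucket/input/*.avro");
    System.out.println("Match status: " + result.status());
  }
}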
Example 1
Source File: AvroToCsv.java    From java-docs-samples with Apache License 2.0
public static void runAvroToCsv(SampleOptions options)
    throws IOException, IllegalArgumentException {
  FileSystems.setDefaultPipelineOptions(options);

  // Get Avro Schema
  String schemaJson = getSchema(options.getAvroSchema());
  Schema schema = new Schema.Parser().parse(schemaJson);

  // Check schema field types before starting the Dataflow job
  checkFieldTypes(schema);

  // Create the Pipeline object with the options we defined above.
  Pipeline pipeline = Pipeline.create(options);

  // Convert Avro To CSV
  pipeline.apply("Read Avro files",
      AvroIO.readGenericRecords(schemaJson).from(options.getInputFile()))
      .apply("Convert Avro to CSV formatted data",
          ParDo.of(new ConvertAvroToCsv(schemaJson, options.getCsvDelimiter())))
      .apply("Write CSV formatted data", TextIO.write().to(options.getOutput())
          .withSuffix(".csv"));

  // Run the pipeline.
  pipeline.run().waitUntilFinish();
}
 
Example 2
Source File: BeamJpaModuleTest.java    From nomulus with Apache License 2.0
/**
 * Integration test with a GCP project, only run when the 'test.gcp_integration.env' property is
 * defined. Otherwise this test is ignored. This is meant to be run from a developer's desktop,
 * with auth already set up by gcloud.
 *
 * <p>Example: {@code gradlew test -P test.gcp_integration.env=alpha}.
 *
 * <p>See <a href="../../../../../../../../java_common.gradle">java_common.gradle</a> for more
 * information.
 */
@Test
public void getJpaTransactionManager_cloudSql_authRequired() {
  String environmentName = System.getProperty("test.gcp_integration.env");
  assumeThat(environmentName, notNullValue());

  FileSystems.setDefaultPipelineOptions(PipelineOptionsFactory.create());
  JpaTransactionManager jpa =
      DaggerBeamJpaModule_JpaTransactionManagerComponent.builder()
          .beamJpaModule(
              new BeamJpaModule(
                  BackupPaths.getCloudSQLCredentialFilePatterns(environmentName).get(0)))
          .build()
          .cloudSqlJpaTransactionManager();
  assertThat(
          jpa.transact(
              () -> jpa.getEntityManager().createNativeQuery("select 1").getSingleResult()))
      .isEqualTo(1);
}
 
Example 3
Source File: PipelineRunner.java    From beam with Apache License 2.0
/**
 * Constructs a runner from the provided {@link PipelineOptions}.
 *
 * @return The newly created runner.
 */
public static PipelineRunner<? extends PipelineResult> fromOptions(PipelineOptions options) {
  checkNotNull(options);
  PipelineOptionsValidator.validate(PipelineOptions.class, options);

  // (Re-)register standard FileSystems. Clobbers any prior credentials.
  FileSystems.setDefaultPipelineOptions(options);

  @SuppressWarnings("unchecked")
  PipelineRunner<? extends PipelineResult> result =
      InstanceBuilder.ofType(PipelineRunner.class)
          .fromClass(options.getRunner())
          .fromFactoryMethod("fromOptions")
          .withArg(PipelineOptions.class, options)
          .build();
  return result;
}
 
Example 4
Source File: DataflowPipelineOptionsTest.java    From beam with Apache License 2.0
@Test
public void testDefaultToGcpTempLocation() {
  DataflowPipelineOptions options = PipelineOptionsFactory.as(DataflowPipelineOptions.class);
  FileSystems.setDefaultPipelineOptions(options);
  options.setPathValidatorClass(NoopPathValidator.class);
  options.setTempLocation("gs://temp_location/");
  options.setGcpTempLocation("gs://gcp_temp_location/");
  assertEquals("gs://gcp_temp_location/staging/", options.getStagingLocation());
}
 
Example 5
Source File: DataflowRunnerTest.java    From beam with Apache License 2.0
private Pipeline buildDataflowPipeline(DataflowPipelineOptions options) {
  options.setStableUniqueNames(CheckEnabled.ERROR);
  options.setRunner(DataflowRunner.class);
  Pipeline p = Pipeline.create(options);

  p.apply("ReadMyFile", TextIO.read().from("gs://bucket/object"))
      .apply("WriteMyFile", TextIO.write().to("gs://bucket/object"));

  // Enable the FileSystems API to know about gs:// URIs in this test.
  FileSystems.setDefaultPipelineOptions(options);

  return p;
}
 
Example 6
Source File: BeamEnumerableConverter.java    From beam with Apache License 2.0
public static PipelineOptions createPipelineOptions(Map<String, String> map) {
  final String[] args = new String[map.size()];
  int i = 0;
  for (Map.Entry<String, String> entry : map.entrySet()) {
    args[i++] = "--" + entry.getKey() + "=" + entry.getValue();
  }
  PipelineOptions options = PipelineOptionsFactory.fromArgs(args).withValidation().create();
  FileSystems.setDefaultPipelineOptions(options);
  options.as(ApplicationNameOptions.class).setAppName("BeamSql");
  return options;
}
 
Example 7
Source File: FlinkExecutableStageFunction.java    From beam with Apache License 2.0
@Override
public void open(Configuration parameters) {
  FlinkPipelineOptions options = pipelineOptions.get().as(FlinkPipelineOptions.class);
  // Register standard file systems.
  FileSystems.setDefaultPipelineOptions(options);
  executableStage = ExecutableStage.fromPayload(stagePayload);
  runtimeContext = getRuntimeContext();
  metricContainer = new FlinkMetricContainer(runtimeContext);
  // TODO: Wire this into the distributed cache and make it pluggable.
  stageContext = contextFactory.get(jobInfo);
  stageBundleFactory = stageContext.getStageBundleFactory(executableStage);
  // NOTE: It's safe to reuse the state handler between partitions because each partition uses the
  // same backing runtime context and broadcast variables. We use checkState below to catch errors
  // in backward-incompatible Flink changes.
  stateRequestHandler =
      getStateRequestHandler(
          executableStage, stageBundleFactory.getProcessBundleDescriptor(), runtimeContext);
  progressHandler =
      new BundleProgressHandler() {
        @Override
        public void onProgress(ProcessBundleProgressResponse progress) {
          metricContainer.updateMetrics(stepName, progress.getMonitoringInfosList());
        }

        @Override
        public void onCompleted(ProcessBundleResponse response) {
          metricContainer.updateMetrics(stepName, response.getMonitoringInfosList());
        }
      };
}
 
Example 8
Source File: DoFnOperator.java    From beam with Apache License 2.0
@Override
public void setup(
    StreamTask<?, ?> containingTask,
    StreamConfig config,
    Output<StreamRecord<WindowedValue<OutputT>>> output) {

  // make sure that FileSystems is initialized correctly
  FileSystems.setDefaultPipelineOptions(serializedOptions.get());

  super.setup(containingTask, config, output);
}
 
Example 9
Source File: SparkJobServerDriver.java    From beam with Apache License 2.0
public static void main(String[] args) {
  PipelineOptions options = PipelineOptionsFactory.create();
  // Limiting gcs upload buffer to reduce memory usage while doing parallel artifact uploads.
  options.as(GcsOptions.class).setGcsUploadBufferSizeBytes(1024 * 1024);
  // Register standard file systems.
  FileSystems.setDefaultPipelineOptions(options);
  fromParams(args).run();
}
 
Example 10
Source File: S3ResourceIdTest.java    From beam with Apache License 2.0
@Test
public void testResourceIdTester() {
  S3Options options = PipelineOptionsFactory.create().as(S3Options.class);
  options.setAwsRegion("us-west-1");
  FileSystems.setDefaultPipelineOptions(options);
  ResourceIdTester.runResourceIdBattery(S3ResourceId.fromUri("s3://bucket/foo/"));
}
 
Example 11
Source File: FlinkMultiOutputPruningFunction.java    From beam with Apache License 2.0
@Override
public void open(Configuration parameters) {
  // Initialize FileSystems for any coders which may want to use the FileSystem,
  // see https://issues.apache.org/jira/browse/BEAM-8303
  FileSystems.setDefaultPipelineOptions(options.get());
}
 
Example 12
Source File: DedupingOperator.java    From beam with Apache License 2.0
@Override
public void open() {
  // Initialize FileSystems for any coders which may want to use the FileSystem,
  // see https://issues.apache.org/jira/browse/BEAM-8303
  FileSystems.setDefaultPipelineOptions(options.get());
}
 
Example 13
Source File: SparkPipelineRunner.java    From beam with Apache License 2.0
/**
 * Main method to be called only as the entry point to an executable jar with structure as defined
 * in {@link PortablePipelineJarUtils}.
 */
public static void main(String[] args) throws Exception {
  // Register standard file systems.
  FileSystems.setDefaultPipelineOptions(PipelineOptionsFactory.create());

  SparkPipelineRunnerConfiguration configuration = parseArgs(args);
  String baseJobName =
      configuration.baseJobName == null
          ? PortablePipelineJarUtils.getDefaultJobName()
          : configuration.baseJobName;
  Preconditions.checkArgument(
      baseJobName != null,
      "No default job name found. Job name must be set using --base-job-name.");
  Pipeline pipeline = PortablePipelineJarUtils.getPipelineFromClasspath(baseJobName);
  Struct originalOptions = PortablePipelineJarUtils.getPipelineOptionsFromClasspath(baseJobName);

  // Spark pipeline jars distribute and retrieve artifacts via the classpath.
  PortablePipelineOptions portablePipelineOptions =
      PipelineOptionsTranslation.fromProto(originalOptions).as(PortablePipelineOptions.class);
  portablePipelineOptions.setRetrievalServiceType(RetrievalServiceType.CLASSLOADER);
  String retrievalToken = PortablePipelineJarUtils.getArtifactManifestUri(baseJobName);

  SparkPipelineOptions sparkOptions = portablePipelineOptions.as(SparkPipelineOptions.class);
  String invocationId =
      String.format("%s_%s", sparkOptions.getJobName(), UUID.randomUUID().toString());
  if (sparkOptions.getAppName() == null) {
    LOG.debug("App name was null. Using invocationId {}", invocationId);
    sparkOptions.setAppName(invocationId);
  }

  SparkPipelineRunner runner = new SparkPipelineRunner(sparkOptions);
  JobInfo jobInfo =
      JobInfo.create(
          invocationId,
          sparkOptions.getJobName(),
          retrievalToken,
          PipelineOptionsTranslation.toProto(sparkOptions));
  try {
    runner.run(pipeline, jobInfo);
  } catch (Exception e) {
    throw new RuntimeException(String.format("Job %s failed.", invocationId), e);
  }
  LOG.info("Job {} finished successfully.", invocationId);
}
 
Example 14
Source File: SerializablePipelineOptions.java    From beam with Apache License 2.0
private void readObject(ObjectInputStream is) throws IOException, ClassNotFoundException {
  is.defaultReadObject();
  this.options = deserializeFromJson(serializedPipelineOptions);
  // TODO https://issues.apache.org/jira/browse/BEAM-2712: remove this call.
  FileSystems.setDefaultPipelineOptions(options);
}
 
Example 15
Source File: FlinkStreamingPortablePipelineTranslator.java    From beam with Apache License 2.0
@Override
public void open(Configuration parameters) {
  // Initialize FileSystems for any coders which may want to use the FileSystem,
  // see https://issues.apache.org/jira/browse/BEAM-8303
  FileSystems.setDefaultPipelineOptions(options.get());
}
 
Example 16
Source File: SerializablePipelineOptions.java    From beam with Apache License 2.0
public SerializablePipelineOptions(PipelineOptions options) {
  this.serializedPipelineOptions = serializeToJson(options);
  this.options = options;
  FileSystems.setDefaultPipelineOptions(options);
}
 
Example 17
Source File: BeamPipelineEngine.java    From hop with Apache License 2.0
@Override public void prepareExecution() throws HopException {
  ClassLoader oldContextClassLoader = Thread.currentThread().getContextClassLoader();
  try {
    executionStartDate = new Date();

    // Tell the various classes in the Beam API (@see org.apache.beam.sdk.io.FileSystems)
    // which context classloader to use.
    // The original classloader is restored in the finally block below.
    //
    Thread.currentThread().setContextClassLoader( this.getClass().getClassLoader() );

    setPreparing( true );
    IPipelineEngineRunConfiguration engineRunConfiguration = pipelineRunConfiguration.getEngineRunConfiguration();
    validatePipelineRunConfigurationClass( engineRunConfiguration );
    if ( !( engineRunConfiguration instanceof IBeamPipelineEngineRunConfiguration ) ) {
      throw new HopException( "A beam pipeline needs a beam pipeline engine configuration to run, not '" + pipelineRunConfiguration.getName() + "'" );
    }
    if ( metadataProvider == null ) {
      throw new HopException( "The beam pipeline engine didn't receive a metadata" );
    }

    beamEngineRunConfiguration = (IBeamPipelineEngineRunConfiguration) engineRunConfiguration;

    converter = new HopPipelineMetaToBeamPipelineConverter( pipelineMeta, metadataProvider, beamEngineRunConfiguration );

    beamPipeline = converter.createPipeline();

    FileSystems.setDefaultPipelineOptions( beamPipeline.getOptions() );


    // Create a new log channel when we start the action
    // It's only now that we use it
    //
    logChannel.logBasic( "Executing this pipeline using the Beam Pipeline Engine with run configuration '" + pipelineRunConfiguration.getName() + "'" );

    PipelineExecutionConfiguration pipelineExecutionConfiguration = new PipelineExecutionConfiguration();
    pipelineExecutionConfiguration.setRunConfiguration( pipelineRunConfiguration.getName() );
    if ( logLevel != null ) {
      pipelineExecutionConfiguration.setLogLevel( logLevel );
    }
    if ( previousResult != null ) {
      pipelineExecutionConfiguration.setPreviousResult( previousResult );
    }

    setRunning( false );
    setReadyToStart( true );
  } catch ( Exception e ) {
    setRunning( false );
    setReadyToStart( false );
    setStopped( true );
    setErrors( getErrors() + 1 );
    setPaused( false );
    setPreparing( false );
    throw new HopException( "Error preparing remote pipeline", e );
  } finally {
    setPreparing( false );
    Thread.currentThread().setContextClassLoader( oldContextClassLoader );
  }
}
 
Example 18
Source File: SpannerToText.java    From DataflowTemplates with Apache License 2.0
/**
 * Runs a pipeline which reads records from Spanner, passes the CSV records to a JavaScript UDF,
 * and writes the CSV to a TextIO sink.
 *
 * @param args arguments to the pipeline
 */
public static void main(String[] args) {
  LOG.info("Starting pipeline setup");
  PipelineOptionsFactory.register(SpannerToTextOptions.class);
  SpannerToTextOptions options =
      PipelineOptionsFactory.fromArgs(args).withValidation().as(SpannerToTextOptions.class);

  FileSystems.setDefaultPipelineOptions(options);
  Pipeline pipeline = Pipeline.create(options);

  SpannerConfig spannerConfig =
      SpannerConfig.create()
          .withProjectId(options.getSpannerProjectId())
          .withInstanceId(options.getSpannerInstanceId())
          .withDatabaseId(options.getSpannerDatabaseId());

  PTransform<PBegin, PCollection<ReadOperation>> spannerExport =
      SpannerConverters.ExportTransformFactory.create(
          options.getSpannerTable(), spannerConfig, options.getTextWritePrefix());

  PCollection<String> csv =
      pipeline
          .apply("Create export", spannerExport)
          // We need to use SpannerIO.readAll() instead of SpannerIO.read() because
          // ValueProvider parameters such as the table name required by SpannerIO.read()
          // can only be read inside a DoFn, but SpannerIO.read() is of type
          // PTransform<PBegin, Struct>, which prevents prepending it with a DoFn that reads
          // these parameters at pipeline execution time.
          .apply("Read all records", LocalSpannerIO.readAll().withSpannerConfig(spannerConfig))
          .apply(
              "Struct To Csv",
              MapElements.into(TypeDescriptors.strings())
                  .via(struct -> (new SpannerConverters.StructCsvPrinter()).print(struct)));

  if (options.getJavascriptTextTransformGcsPath().isAccessible()) {
    // The UDF function takes a CSV row as an input and produces a transformed CSV row
    csv =
        csv.apply(
            "JavascriptUDF",
            TransformTextViaJavascript.newBuilder()
                .setFileSystemPath(options.getJavascriptTextTransformGcsPath())
                .setFunctionName(options.getJavascriptTextTransformFunctionName())
                .build());
  }
  csv.apply(
      "Write to storage", TextIO.write().to(options.getTextWritePrefix()).withSuffix(".csv"));

  pipeline.run();
  LOG.info("Completed pipeline setup");
}
 
Example 19
Source File: KeyStoreIntegrationTest.java    From gcp-ingestion with Mozilla Public License 2.0
/**
 * Upload a metadata file and the referenced private keys to their testing
 * locations. The resource is a templated metadata json file. "DUMMY_*"
 * variables are replaced with their corresponding locations. This also
 * encrypts the private keys and ensures that the KMS resources are created if
 * specified.
 */
private String prepareKeyStoreMetadata(String resource, boolean shouldEncrypt) throws Exception {
  // enable gs support
  FileSystems.setDefaultPipelineOptions(PipelineOptionsFactory.create());

  byte[] data = Resources.toByteArray(Resources.getResource(resource));
  ArrayNode nodes = Json.readArrayNode(data);
  for (JsonNode node : nodes) {
    // replace dummy values with values related to integration testing
    String kmsResourceId = node.get("kms_resource_id").textValue().replace("DUMMY_PROJECT_ID",
        projectId);

    // The path may be on the local filesystem or in cloud storage by
    // referencing a variable to be replaced.
    String privateKeyUri = node.get("private_key_uri").textValue().replace("DUMMY_BUCKET", bucket)
        .replace("DUMMY_TEMP_FOLDER", tempFolder.getRoot().toString());
    ((ObjectNode) node).put("kms_resource_id", kmsResourceId);
    ((ObjectNode) node).put("private_key_uri", privateKeyUri);

    String privateKeyId = node.get("private_key_id").textValue();
    byte[] key = Resources.toByteArray(
        Resources.getResource(String.format("pioneer/%s.private.json", privateKeyId)));

    // optionally encrypt the private key resources and upload to testing location
    if (shouldEncrypt) {
      try (KeyManagementServiceClient client = KeyManagementServiceClient.create()) {
        ensureKmsResources(client, kmsResourceId);
        byte[] encryptedKey = encrypt(client, kmsResourceId, key);
        writeToStorage(privateKeyUri, encryptedKey);
      }
    } else {
      writeToStorage(privateKeyUri, key);
    }
  }
  assertFalse(nodes.asText().contains("DUMMY_PROJECT_ID")
      || nodes.asText().contains("DUMMY_BUCKET") || nodes.asText().contains("DUMMY_TEMP_FOLDER"));

  String keyStoreMetadata = String.format("gs://%s/metadata.json", bucket);
  writeToStorage(keyStoreMetadata, nodes.toString().getBytes("UTF-8"));
  return keyStoreMetadata;
}
 
Example 20
Source File: KettleBeamPipelineExecutor.java    From kettle-beam with Apache License 2.0
public Pipeline getPipeline( TransMeta transMeta, BeamJobConfig config ) throws KettleException {

    try {

      if ( StringUtils.isEmpty( config.getRunnerTypeName() ) ) {
        throw new KettleException( "You need to specify a runner type, one of: " + java.util.Arrays.toString( RunnerType.values() ) );
      }
      PipelineOptions pipelineOptions = null;
      VariableSpace space = transMeta;

      RunnerType runnerType = RunnerType.getRunnerTypeByName( transMeta.environmentSubstitute( config.getRunnerTypeName() ) );
      switch ( runnerType ) {
        case Direct:
          pipelineOptions = PipelineOptionsFactory.create();
          break;
        case DataFlow:
          DataflowPipelineOptions dfOptions = PipelineOptionsFactory.as( DataflowPipelineOptions.class );
          configureDataFlowOptions( config, dfOptions, space );
          pipelineOptions = dfOptions;
          break;
        case Spark:
          SparkPipelineOptions sparkOptions;
          if (sparkContext!=null) {
            SparkContextOptions sparkContextOptions = PipelineOptionsFactory.as( SparkContextOptions.class );
            sparkContextOptions.setProvidedSparkContext( sparkContext );
            sparkOptions = sparkContextOptions;
          } else {
            sparkOptions = PipelineOptionsFactory.as( SparkPipelineOptions.class );
          }
          configureSparkOptions( config, sparkOptions, space, transMeta.getName() );
          pipelineOptions = sparkOptions;
          break;
        case Flink:
          FlinkPipelineOptions flinkOptions = PipelineOptionsFactory.as( FlinkPipelineOptions.class );
          configureFlinkOptions( config, flinkOptions, space );
          pipelineOptions = flinkOptions;
          break;
        default:
          throw new KettleException( "Sorry, this isn't implemented yet" );
      }

      configureStandardOptions( config, transMeta.getName(), pipelineOptions, space );

      setVariablesInTransformation( config, transMeta );

      TransMetaPipelineConverter converter;
      if (stepPluginClasses!=null && xpPluginClasses!=null) {
        converter = new TransMetaPipelineConverter( transMeta, metaStore, stepPluginClasses, xpPluginClasses, jobConfig );
      } else {
        converter = new TransMetaPipelineConverter( transMeta, metaStore, config.getPluginsToStage(), jobConfig );
      }
      Pipeline pipeline = converter.createPipeline( pipelineOptions );

      // Also set the pipeline options...
      //
      FileSystems.setDefaultPipelineOptions(pipelineOptions);

      return pipeline;
    } catch ( Exception e ) {
      throw new KettleException( "Error configuring local Beam Engine", e );
    }

  }