org.apache.beam.sdk.io.TextIO Java Examples
The following examples show how to use
org.apache.beam.sdk.io.TextIO.
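Before the project snippets, here is a minimal, self-contained sketch of the two entry points the examples revolve around, TextIO.read() and TextIO.write(). The file paths are hypothetical placeholders, not taken from any of the projects below.

import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.io.TextIO;
import org.apache.beam.sdk.options.PipelineOptions;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.values.PCollection;

public class TextIOMinimalExample {
  public static void main(String[] args) {
    PipelineOptions options = PipelineOptionsFactory.fromArgs(args).create();
    Pipeline pipeline = Pipeline.create(options);

    // TextIO.read() emits one String element per line of the matched file(s).
    PCollection<String> lines =
        pipeline.apply("ReadLines", TextIO.read().from("/tmp/input*.txt")); // hypothetical path

    // TextIO.write() writes each String element as a line; output is sharded by default,
    // producing files such as /tmp/output-00000-of-00003.txt.
    lines.apply("WriteLines", TextIO.write().to("/tmp/output").withSuffix(".txt")); // hypothetical prefix

    pipeline.run().waitUntilFinish();
  }
}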
Example #1
Source File: WordCount.java From java-docs-samples with Apache License 2.0 | 7 votes |
public static void main(String[] args) {
  WordCountOptions options = PipelineOptionsFactory.fromArgs(args)
      .withValidation().as(WordCountOptions.class);

  Pipeline pipeline = Pipeline.create(options);
  pipeline
      .apply("Read lines", TextIO.read().from(options.getInputFile()))
      // [END value_provider]
      .apply("Find words", FlatMapElements.into(TypeDescriptors.strings())
          .via((String line) -> Arrays.asList(line.split("[^\\p{L}]+"))))
      .apply("Filter empty words", Filter.by((String word) -> !word.isEmpty()))
      .apply("Filter with substring", ParDo.of(new FilterWithSubstring(
          options.getWithSubstring(), options.getIsCaseSensitive())))
      .apply("Count words", Count.perElement())
      .apply("Format results", MapElements.into(TypeDescriptors.strings())
          .via((KV<String, Long> wordCount) -> wordCount.getKey() + ": " + wordCount.getValue()))
      // [START nested_value_provider]
      .apply("Write results", TextIO.write().to(NestedValueProvider.of(
          options.getOutputBucket(),
          (String bucket) -> String.format("gs://%s/samples/dataflow/wordcount/outputs", bucket)
      )));
      // [END nested_value_provider]
  pipeline.run();
}
Example #2
Source File: InvoicingPipeline.java From nomulus with Apache License 2.0 | 6 votes |
/** Returns an IO transform that writes the overall invoice to a single CSV file. */
private TextIO.Write writeInvoice(ValueProvider<String> yearMonthProvider) {
  return TextIO.write()
      .to(
          NestedValueProvider.of(
              yearMonthProvider,
              yearMonth ->
                  String.format(
                      "%s/%s/%s/%s-%s",
                      billingBucketUrl,
                      BillingModule.INVOICES_DIRECTORY,
                      yearMonth,
                      invoiceFilePrefix,
                      yearMonth)))
      .withHeader(InvoiceGroupingKey.invoiceHeader())
      .withoutSharding()
      .withSuffix(".csv");
}
Example #3
Source File: UserScore.java From deployment-examples with MIT License | 6 votes |
/** Run a batch pipeline. */
// [START DocInclude_USMain]
public static void main(String[] args) throws Exception {
  // Begin constructing a pipeline configured by commandline flags.
  Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
  Pipeline pipeline = Pipeline.create(options);

  // Read events from a text file and parse them.
  pipeline
      .apply(TextIO.read().from(options.getInput()))
      .apply("ParseGameEvent", ParDo.of(new ParseEventFn()))
      // Extract and sum username/score pairs from the event data.
      .apply("ExtractUserScore", new ExtractAndSumScore("user"))
      .apply(
          "WriteUserScoreSums", new WriteToText<>(options.getOutput(), configureOutput(), false));

  // Run the batch pipeline.
  pipeline.run().waitUntilFinish();
}
Example #4
Source File: BeamSqlDataCatalogExample.java From beam with Apache License 2.0 | 6 votes |
public static void main(String[] args) throws Exception {
  LOG.info("Args: {}", Arrays.asList(args));
  DCExamplePipelineOptions options =
      PipelineOptionsFactory.fromArgs(args).as(DCExamplePipelineOptions.class);
  LOG.info("Query: {}\nOutput: {}", options.getQueryString(), options.getOutputFilePrefix());

  Pipeline pipeline = Pipeline.create(options);

  validateArgs(options);

  try (DataCatalogTableProvider tableProvider =
      DataCatalogTableProvider.create(options.as(DataCatalogPipelineOptions.class))) {
    pipeline
        .apply(
            "SQL Query",
            SqlTransform.query(options.getQueryString())
                .withDefaultTableProvider("datacatalog", tableProvider))
        .apply("Convert to Strings", rowsToStrings())
        .apply("Write output", TextIO.write().to(options.getOutputFilePrefix()));

    pipeline.run().waitUntilFinish();
  }
}
Example #5
Source File: TaskTest.java From beam with Apache License 2.0 | 6 votes |
@Test
public void textIO() {
  PCollection<String> countries = testPipeline.apply(TextIO.read().from("countries.txt"));

  PCollection<String> results = Task.applyTransform(countries);

  PAssert.that(results)
      .containsInAnyOrder(
          "AUSTRALIA", "CHINA", "ENGLAND", "FRANCE", "GERMANY",
          "INDONESIA", "JAPAN", "MEXICO", "SINGAPORE", "UNITED STATES"
      );

  testPipeline.run().waitUntilFinish();
}
Example #6
Source File: DataflowRunnerTest.java From beam with Apache License 2.0 | 6 votes |
/** Tests that all reads are consumed by at least one {@link PTransform}. */
@Test
public void testUnconsumedReads() throws IOException {
  DataflowPipelineOptions dataflowOptions = buildPipelineOptions();
  RuntimeTestOptions options = dataflowOptions.as(RuntimeTestOptions.class);
  Pipeline p = buildDataflowPipeline(dataflowOptions);
  p.apply(TextIO.read().from(options.getInput()));
  DataflowRunner.fromOptions(dataflowOptions).replaceTransforms(p);
  final AtomicBoolean unconsumedSeenAsInput = new AtomicBoolean();
  p.traverseTopologically(
      new PipelineVisitor.Defaults() {
        @Override
        public void visitPrimitiveTransform(Node node) {
          unconsumedSeenAsInput.set(true);
        }
      });
  assertThat(unconsumedSeenAsInput.get(), is(true));
}
Example #7
Source File: NumShardsTest.java From beam with Apache License 2.0 | 6 votes |
@Test
public void testText() throws Exception {
  PCollection<String> inputWords = p.apply(Create.of(WORDS).withCoder(StringUtf8Coder.of()));
  PCollection<String> output =
      inputWords
          .apply(new WordCount.CountWords())
          .apply(MapElements.via(new WordCount.FormatAsTextFn()));
  output.apply(
      TextIO.write().to(outputDir.getAbsolutePath()).withNumShards(3).withSuffix(".txt"));
  p.run().waitUntilFinish();

  int count = 0;
  Set<String> expected = Sets.newHashSet("hi: 5", "there: 1", "sue: 2", "bob: 2");
  for (File f :
      tmpDir.getRoot().listFiles(pathname -> pathname.getName().matches("out-.*\\.txt"))) {
    count++;
    for (String line : Files.readLines(f, StandardCharsets.UTF_8)) {
      assertTrue(line + " not found", expected.remove(line));
    }
  }
  assertEquals(3, count);
  assertTrue(expected.toString(), expected.isEmpty());
}
Example #8
Source File: DebuggingWordCount.java From deployment-examples with MIT License | 6 votes |
static void runDebuggingWordCount(WordCountOptions options) {
  Pipeline p = Pipeline.create(options);

  PCollection<KV<String, Long>> filteredWords =
      p.apply("ReadLines", TextIO.read().from(options.getInputFile()))
          .apply(new WordCount.CountWords())
          .apply(ParDo.of(new FilterTextFn(options.getFilterPattern())));

  /*
   * Concept #3: PAssert is a set of convenient PTransforms in the style of
   * Hamcrest's collection matchers that can be used when writing Pipeline level tests
   * to validate the contents of PCollections. PAssert is best used in unit tests
   * with small data sets but is demonstrated here as a teaching tool.
   *
   * <p>Below we verify that the set of filtered words matches our expected counts. Note
   * that PAssert does not provide any output and that successful completion of the
   * Pipeline implies that the expectations were met. Learn more at
   * https://beam.apache.org/documentation/pipelines/test-your-pipeline/ on how to test
   * your Pipeline and see {@link DebuggingWordCountTest} for an example unit test.
   */
  List<KV<String, Long>> expectedResults =
      Arrays.asList(KV.of("Flourish", 3L), KV.of("stomach", 1L));
  PAssert.that(filteredWords).containsInAnyOrder(expectedResults);

  p.run().waitUntilFinish();
}
Example #9
Source File: CsvToAvro.java From java-docs-samples with Apache License 2.0 | 6 votes |
public static void runCsvToAvro(SampleOptions options)
    throws IOException, IllegalArgumentException {
  FileSystems.setDefaultPipelineOptions(options);

  // Get Avro Schema
  String schemaJson = getSchema(options.getAvroSchema());
  Schema schema = new Schema.Parser().parse(schemaJson);

  // Check schema field types before starting the Dataflow job
  checkFieldTypes(schema);

  // Create the Pipeline object with the options we defined above.
  Pipeline pipeline = Pipeline.create(options);

  // Convert CSV to Avro
  pipeline.apply("Read CSV files", TextIO.read().from(options.getInputFile()))
      .apply("Convert CSV to Avro formatted data",
          ParDo.of(new ConvertCsvToAvro(schemaJson, options.getCsvDelimiter())))
      .setCoder(AvroCoder.of(GenericRecord.class, schema))
      .apply("Write Avro formatted data", AvroIO.writeGenericRecords(schemaJson)
          .to(options.getOutput()).withCodec(CodecFactory.snappyCodec()).withSuffix(".avro"));

  // Run the pipeline.
  pipeline.run().waitUntilFinish();
}
Example #10
Source File: UserScore.java From beam with Apache License 2.0 | 6 votes |
/** Run a batch pipeline. */
// [START DocInclude_USMain]
public static void main(String[] args) throws Exception {
  // Begin constructing a pipeline configured by commandline flags.
  Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
  Pipeline pipeline = Pipeline.create(options);

  // Read events from a text file and parse them.
  pipeline
      .apply(TextIO.read().from(options.getInput()))
      .apply("ParseGameEvent", ParDo.of(new ParseEventFn()))
      // Extract and sum username/score pairs from the event data.
      .apply("ExtractUserScore", new ExtractAndSumScore("user"))
      .apply(
          "WriteUserScoreSums", new WriteToText<>(options.getOutput(), configureOutput(), false));

  // Run the batch pipeline.
  pipeline.run().waitUntilFinish();
}
Example #11
Source File: CsvConverters.java From DataflowTemplates with Apache License 2.0 | 6 votes |
@Override
public PCollectionTuple expand(PBegin input) {
  if (hasHeaders()) {
    return input
        .apply("MatchFilePattern", FileIO.match().filepattern(inputFileSpec()))
        .apply("ReadMatches", FileIO.readMatches())
        .apply(
            "ReadCsvWithHeaders",
            ParDo.of(new GetCsvHeadersFn(headerTag(), lineTag(), csvFormat(), delimiter()))
                .withOutputTags(headerTag(), TupleTagList.of(lineTag())));
  }

  return PCollectionTuple.of(
      lineTag(), input.apply("ReadCsvWithoutHeaders", TextIO.read().from(inputFileSpec())));
}
Example #12
Source File: TextToPubsubStream.java From DataflowTemplates with Apache License 2.0 | 6 votes |
/**
 * Executes the pipeline with the provided execution
 * parameters.
 *
 * @param options The execution parameters.
 */
public static PipelineResult run(Options options) {
  // Create the pipeline.
  Pipeline pipeline = Pipeline.create(options);

  /*
   * Steps:
   *  1) Read from the text source.
   *  2) Write each text record to Pub/Sub
   */
  pipeline
      .apply(
          "Read Text Data",
          TextIO.read()
              .from(options.getInputFilePattern())
              .watchForNewFiles(DEFAULT_POLL_INTERVAL, Watch.Growth.never()))
      .apply("Write to PubSub", PubsubIO.writeStrings().to(options.getOutputTopic()));

  return pipeline.run();
}
Example #13
Source File: DatastoreSchemasCountToText.java From DataflowTemplates with Apache License 2.0 | 6 votes |
/**
 * Runs a pipeline which reads in Entities from datastore, parses the Entity's schema,
 * and counts the unique number of schemas.
 *
 * @param args arguments to the pipeline
 */
public static void main(String[] args) {
  DatastoreSchemaCountToTextOptions options = PipelineOptionsFactory.fromArgs(args)
      .withValidation()
      .as(DatastoreSchemaCountToTextOptions.class);

  Pipeline pipeline = Pipeline.create(options);

  pipeline
      .apply(DatastoreReadSchemaCount.newBuilder()
          .setGqlQuery(options.getDatastoreReadGqlQuery())
          .setProjectId(options.getDatastoreReadProjectId())
          .setNamespace(options.getDatastoreReadNamespace())
          .build())
      .apply(TextIO.write()
          .to(options.getTextWritePrefix())
          .withSuffix(".json"));

  pipeline.run();
}
Example #14
Source File: TextToPubsub.java From DataflowTemplates with Apache License 2.0 | 6 votes |
/**
 * Executes the pipeline with the provided execution
 * parameters.
 *
 * @param options The execution parameters.
 */
public static PipelineResult run(Options options) {
  // Create the pipeline.
  Pipeline pipeline = Pipeline.create(options);

  /*
   * Steps:
   *  1) Read from the text source.
   *  2) Write each text record to Pub/Sub
   */
  pipeline
      .apply("Read Text Data", TextIO.read().from(options.getInputFilePattern()))
      .apply("Write to PubSub", PubsubIO.writeStrings().to(options.getOutputTopic()));

  return pipeline.run();
}
Example #15
Source File: Main.java From component-runtime with Apache License 2.0 | 6 votes |
public static void main(final String[] args) throws IOException {
  final Config options = PipelineOptionsFactory.fromArgs(args).as(Config.class);
  final Pipeline pipeline = Pipeline.create(options);
  try (final FileWriter writer = new FileWriter(options.getInputFile())) {
    writer.write("normal;6\nmarilyn;36");
  }
  final ComponentManager manager = ComponentManager.instance();
  pipeline
      .apply(TalendIO.read(manager.findMapper("sample", "reader", 1, new HashMap<String, String>() {
        {
          put("old_file", options.getInputFile()); // will be migrated to "file" with the migration handler
        }
      }).orElseThrow(() -> new IllegalArgumentException(
          "No reader sample#reader, existing: " + manager.availablePlugins()))))
      .apply(new ViewsMappingTransform(emptyMap(), "sample"))
      .apply(TalendFn.asFn(manager.findProcessor("sample", "mapper", 1, emptyMap())
          .orElseThrow(() -> new IllegalStateException("didn't find the processor"))))
      .apply(ParDo.of(new ToStringFn()))
      .apply(TextIO.write().to(ValueProvider.StaticValueProvider.of(options.getOutputFile())));
  final PipelineResult.State state = pipeline.run().waitUntilFinish();
  System.out.println(state);
}
Example #16
Source File: DataflowPipelineTranslatorTest.java From beam with Apache License 2.0 | 6 votes |
@Test
public void testInaccessibleProvider() throws Exception {
  DataflowPipelineOptions options = buildPipelineOptions();
  Pipeline pipeline = Pipeline.create(options);
  DataflowPipelineTranslator t = DataflowPipelineTranslator.fromOptions(options);

  pipeline.apply(TextIO.read().from(new TestValueProvider()));

  // Check that translation does not fail.
  SdkComponents sdkComponents = createSdkComponents(options);
  RunnerApi.Pipeline pipelineProto = PipelineTranslation.toProto(pipeline, sdkComponents, true);
  t.translate(
      pipeline,
      pipelineProto,
      sdkComponents,
      DataflowRunner.fromOptions(options),
      Collections.emptyList());
}
Example #17
Source File: DatastoreToText.java From DataflowTemplates with Apache License 2.0 | 6 votes |
/**
 * Runs a pipeline which reads in Entities from Datastore, passes in the JSON encoded Entities
 * to a Javascript UDF, and writes the JSON to TextIO sink.
 *
 * @param args arguments to the pipeline
 */
public static void main(String[] args) {
  DatastoreToTextOptions options = PipelineOptionsFactory.fromArgs(args)
      .withValidation()
      .as(DatastoreToTextOptions.class);

  Pipeline pipeline = Pipeline.create(options);

  pipeline
      .apply(ReadJsonEntities.newBuilder()
          .setGqlQuery(options.getDatastoreReadGqlQuery())
          .setProjectId(options.getDatastoreReadProjectId())
          .setNamespace(options.getDatastoreReadNamespace())
          .build())
      .apply(TransformTextViaJavascript.newBuilder()
          .setFileSystemPath(options.getJavascriptTextTransformGcsPath())
          .setFunctionName(options.getJavascriptTextTransformFunctionName())
          .build())
      .apply(TextIO.write()
          .to(options.getTextWritePrefix())
          .withSuffix(".json"));

  pipeline.run();
}
Example #18
Source File: MyBeamJob.java From hazelcast-jet-demos with Apache License 2.0 | 6 votes |
public static Pipeline build(PipelineOptions pipelineOptions) {
  Pipeline pipeline = Pipeline.create(pipelineOptions);

  pipeline
      .apply("unbounded-source", Read.from(new MyUnboundedSource("beam-input")))
      .apply("reformat-and-timestamp", ParDo.of(new MyEnrichAndReformatFn()))
      .apply("window",
          Window.<String>into(FixedWindows.of(ONE_SECOND))
              .triggering(Repeatedly.forever(AfterProcessingTime.pastFirstElementInPane()))
              .discardingFiredPanes()
              .withAllowedLateness(ONE_SECOND))
      .apply("sink",
          FileIO.<String>write()
              .via(TextIO.sink())
              .to(".")
              .withPrefix("beam-output")
              .withNumShards(1));

  return pipeline;
}
Example #19
Source File: BulkCompressorTest.java From DataflowTemplates with Apache License 2.0 | 6 votes |
/** Tests the {@link BulkCompressor.Compressor} performs compression properly. */
@Test
public void testCompressFile() throws Exception {
  // Setup test
  final Compression compression = Compression.GZIP;

  final ValueProvider<String> outputDirectoryProvider =
      pipeline.newProvider(tempFolderCompressedPath.toString());

  final ValueProvider<Compression> compressionProvider = StaticValueProvider.of(compression);

  final Metadata metadata = FileSystems.matchSingleFileSpec(textFile.toString());

  // Execute the compressor
  PCollection<String> lines =
      pipeline
          .apply("Create File Input", Create.of(metadata))
          .apply("Compress", ParDo.of(new Compressor(outputDirectoryProvider, compressionProvider)))
          .apply("Read the Files", TextIO.readAll().withCompression(Compression.AUTO));

  // Test the result
  PAssert.that(lines).containsInAnyOrder(FILE_CONTENT);

  pipeline.run();
}
Example #20
Source File: TfIdf.java From beam with Apache License 2.0 | 5 votes |
@Override
public PCollection<KV<URI, String>> expand(PBegin input) {
  Pipeline pipeline = input.getPipeline();

  // Create one TextIO.Read transform for each document
  // and add its output to a PCollectionList
  PCollectionList<KV<URI, String>> urisToLines = PCollectionList.empty(pipeline);

  // TextIO.Read supports:
  //  - file: URIs and paths locally
  //  - gs: URIs on the service
  for (final URI uri : uris) {
    String uriString;
    if ("file".equals(uri.getScheme())) {
      uriString = new File(uri).getPath();
    } else {
      uriString = uri.toString();
    }

    PCollection<KV<URI, String>> oneUriToLines =
        pipeline
            .apply("TextIO.Read(" + uriString + ")", TextIO.read().from(uriString))
            .apply("WithKeys(" + uriString + ")", WithKeys.of(uri))
            .setCoder(KvCoder.of(StringDelegateCoder.of(URI.class), StringUtf8Coder.of()));

    urisToLines = urisToLines.and(oneUriToLines);
  }

  return urisToLines.apply(Flatten.pCollections());
}
Example #21
Source File: DistinctExample.java From beam with Apache License 2.0 | 5 votes |
public static void main(String[] args) throws Exception {
  Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
  Pipeline p = Pipeline.create(options);

  p.apply("ReadLines", TextIO.read().from(options.getInput()))
      .apply(Distinct.create())
      .apply("DedupedShakespeare", TextIO.write().to(options.getOutput()));

  p.run().waitUntilFinish();
}
Example #22
Source File: DisplayDataEvaluatorTest.java From beam with Apache License 2.0 | 5 votes |
@Test
public void testSourceTransform() {
  PTransform<? super PBegin, ? extends POutput> myTransform = TextIO.read().from("foo.*");

  DisplayDataEvaluator evaluator = DisplayDataEvaluator.create();
  Set<DisplayData> displayData = evaluator.displayDataForPrimitiveSourceTransforms(myTransform);

  assertThat(displayData, hasItem(hasDisplayItem("filePattern", "foo.*")));
}
Example #23
Source File: WordCount.java From twister2 with Apache License 2.0 | 5 votes |
static void runWordCount(Twister2PipelineOptions options, String input, String output) {
  Pipeline p = Pipeline.create(options);

  // Concepts #2 and #3: Our pipeline applies the composite CountWords transform, and passes the
  // static FormatAsTextFn() to the ParDo transform.
  p.apply("ReadLines", TextIO.read().from(input))
      .apply(new CountWords())
      .apply(MapElements.via(new FormatAsTextFn()))
      .apply("WriteCounts", TextIO.write().to(output));

  p.run().waitUntilFinish();
}
Example #24
Source File: FileIndexerPipeline.java From dataflow-opinion-analysis with Apache License 2.0 | 5 votes |
/**
 * This function creates the DAG graph of transforms. It can be called from main()
 * as well as from the ControlPipeline.
 * @param options
 * @return
 * @throws Exception
 */
public static Pipeline createIndexerPipeline(FileIndexerPipelineOptions options) throws Exception {

  IndexerPipelineUtils.validateIndexerPipelineOptions(options);
  Pipeline pipeline = Pipeline.create(options);

  // PHASE: Read raw content from sources
  PCollection<InputContent> readContent = pipeline
      .apply("Read entire CSV file", org.apache.beam.sdk.io.Read.from(
          new RecordFileSource<String>(
              ValueProvider.StaticValueProvider.of(options.getInputFile()),
              StringUtf8Coder.of(),
              RecordFileSource.DEFAULT_RECORD_SEPARATOR))) //
      .apply("Parse CSV file into InputContent objects", ParDo.of(new IndexerPipeline.ParseCSVFile()));

  // Define the accumulators of all filters
  PCollection<InputContent> contentToIndex = readContent;

  // PHASE: Index documents (extract opinions and entities/tags).
  // Return successfully indexed docs, and create a Bigtable write transform to store errors
  // in Dead Letter table.
  PCollection<ContentIndexSummary> indexes = indexDocuments(options, contentToIndex);

  if (options.getRatioEnrichWithCNLP() > 0)
    indexes = enrichWithCNLP(indexes, options.getRatioEnrichWithCNLP());

  // PHASE: Write to BigQuery
  // For the Indexes that are unique ("filteredIndexes"), create records in webresource, document, and sentiment.
  // Then, merge resulting webresources with webresourceRowsUnindexed and webresourceDeduped
  indexes
      .apply(ParDo.of(new CreateCSVLineFromIndexSummaryFn()))
      .apply(TextIO.write()
          .to(options.getOutputFile()));

  return pipeline;
}
Example #25
Source File: TemplatePipeline.java From gcp-batch-ingestion-bigquery with Apache License 2.0 | 5 votes |
public static void main(String[] args) {
  PipelineOptionsFactory.register(TemplateOptions.class);
  TemplateOptions options = PipelineOptionsFactory.fromArgs(args)
      .withValidation()
      .as(TemplateOptions.class);
  Pipeline pipeline = Pipeline.create(options);
  pipeline.apply("READ", TextIO.read().from(options.getInputFile()))
      .apply("TRANSFORM", ParDo.of(new WikiParDo()))
      .apply("WRITE", BigQueryIO.writeTableRows()
          .to(String.format("%s:dotc_2018.wiki_demo", options.getProject()))
          .withCreateDisposition(CREATE_IF_NEEDED)
          .withWriteDisposition(WRITE_APPEND)
          .withSchema(getTableSchema()));
  pipeline.run();
}
Example #26
Source File: WindowingTest.java From beam with Apache License 2.0 | 5 votes |
@Test
@Category(NeedsRunner.class)
public void testTextIoInput() throws Exception {
  File tmpFile = tmpFolder.newFile("file.txt");
  String filename = tmpFile.getPath();
  try (PrintStream writer = new PrintStream(new FileOutputStream(tmpFile))) {
    writer.println("a 1");
    writer.println("b 2");
    writer.println("b 3");
    writer.println("c 11");
    writer.println("d 11");
  }

  PCollection<String> output =
      p.begin()
          .apply("ReadLines", TextIO.read().from(filename))
          .apply(ParDo.of(new ExtractWordsWithTimestampsFn()))
          .apply(new WindowedCount(FixedWindows.of(Duration.millis(10))));

  PAssert.that(output)
      .containsInAnyOrder(
          output("a", 1, 1, 0, 10),
          output("b", 2, 2, 0, 10),
          output("c", 1, 11, 10, 20),
          output("d", 1, 11, 10, 20));

  p.run();
}
Example #27
Source File: NameUtilsTest.java From beam with Apache License 2.0 | 5 votes |
@Test
public void testPTransformName() {
  EmbeddedPTransform transform = new EmbeddedPTransform();
  assertEquals(
      "NameUtilsTest.EmbeddedPTransform",
      NameUtils.approximatePTransformName(transform.getClass()));
  assertEquals(
      "NameUtilsTest.EmbeddedPTransform",
      NameUtils.approximatePTransformName(transform.getBound().getClass()));
  assertEquals(
      "NameUtilsTest.SomeTransform",
      NameUtils.approximatePTransformName(AutoValue_NameUtilsTest_SomeTransform.class));
  assertEquals("TextIO.Write", NameUtils.approximatePTransformName(TextIO.Write.class));
}
Example #28
Source File: GcsKmsKeyIT.java From beam with Apache License 2.0 | 5 votes |
/**
 * Tests writing to tempLocation with --dataflowKmsKey set on the command line. Verifies that
 * resulting output uses specified key and is readable. Does not verify any temporary files.
 *
 * <p>This test verifies that GCS file copies work with CMEK-enabled files.
 */
@Test
public void testGcsWriteWithKmsKey() {
  TestPipelineOptions options =
      TestPipeline.testingPipelineOptions().as(TestPipelineOptions.class);
  assertNotNull(options.getTempRoot());
  options.setTempLocation(options.getTempRoot() + "/testGcsWriteWithKmsKey");
  GcsOptions gcsOptions = options.as(GcsOptions.class);

  ResourceId filenamePrefix =
      FileSystems.matchNewResource(gcsOptions.getGcpTempLocation(), true)
          .resolve(
              String.format("GcsKmsKeyIT-%tF-%<tH-%<tM-%<tS-%<tL.output", new Date()),
              StandardResolveOptions.RESOLVE_FILE);

  Pipeline p = Pipeline.create(options);
  p.apply("ReadLines", TextIO.read().from(INPUT_FILE))
      .apply("WriteLines", TextIO.write().to(filenamePrefix));

  PipelineResult result = p.run();
  State state = result.waitUntilFinish();
  assertThat(state, equalTo(State.DONE));

  String filePattern = filenamePrefix + "*-of-*";
  assertThat(new NumberedShardedFile(filePattern), fileContentsHaveChecksum(EXPECTED_CHECKSUM));

  // Verify objects have KMS key set.
  try {
    MatchResult matchResult =
        Iterables.getOnlyElement(FileSystems.match(Collections.singletonList(filePattern)));
    GcsUtil gcsUtil = gcsOptions.getGcsUtil();
    for (Metadata metadata : matchResult.metadata()) {
      String kmsKey =
          gcsUtil.getObject(GcsPath.fromUri(metadata.resourceId().toString())).getKmsKeyName();
      assertNotNull(kmsKey);
    }
  } catch (IOException e) {
    throw new AssertionError(e);
  }
}
Example #29
Source File: BeamInputTransform.java From hop with Apache License 2.0 | 5 votes |
@Override
public PCollection<HopRow> expand( PBegin input ) {
  try {
    // Only initialize once on this node/vm
    //
    BeamHop.init(transformPluginClasses, xpPluginClasses);

    // System.out.println("-------------- TextIO.Read from "+inputLocation+" (UNCOMPRESSED)");

    TextIO.Read ioRead = TextIO.read()
        .from( inputLocation )
        .withCompression( Compression.UNCOMPRESSED );

    StringToHopFn stringToHopFn =
        new StringToHopFn( transformName, rowMetaJson, separator, transformPluginClasses, xpPluginClasses );

    PCollection<HopRow> output = input

        // We read a bunch of Strings, one per line basically
        //
        .apply( transformName + " READ FILE", ioRead )

        // We need to transform these lines into Hop fields
        //
        .apply( transformName, ParDo.of( stringToHopFn ) );

    return output;

  } catch ( Exception e ) {
    numErrors.inc();
    LOG.error( "Error in beam input transform", e );
    throw new RuntimeException( "Error in beam input transform", e );
  }
}
Example #30
Source File: WordCount.java From beam-starter with Apache License 2.0 | 5 votes |
public static void main(String[] args) {

  Options options = PipelineOptionsFactory.fromArgs(args).withValidation()
      .as(Options.class);
  options.setRunner(FlinkRunner.class);

  Pipeline p = Pipeline.create(options);

  p.apply("ReadLines", TextIO.Read.from(options.getInput()))
      .apply(new CountWords())
      .apply(MapElements.via(new FormatAsTextFn()))
      .apply("WriteCounts", TextIO.Write.to(options.getOutput()));

  p.run();
}
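This last snippet targets the pre-2.0 Beam SDK, where reads and writes were built from the static TextIO.Read and TextIO.Write classes. On Beam 2.x the same pipeline would use the fluent factories shown in the earlier examples; a rough equivalent of the read and write steps, assuming the same CountWords and FormatAsTextFn transforms from the surrounding WordCount example, looks like this:

// Rough Beam 2.x equivalent of the read/write steps above.
p.apply("ReadLines", TextIO.read().from(options.getInput()))
    .apply(new CountWords())
    .apply(MapElements.via(new FormatAsTextFn()))
    .apply("WriteCounts", TextIO.write().to(options.getOutput()));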