org.apache.beam.sdk.io.TextIO Java Examples

The following examples show how to use org.apache.beam.sdk.io.TextIO. The original project, source file, and license are noted above each example.
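
Before the individual examples, here is a minimal, self-contained sketch of the basic TextIO read/write pattern that the examples below build on. It is not taken from any of the listed projects; the class name and file paths are placeholders.

import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.io.TextIO;
import org.apache.beam.sdk.options.PipelineOptions;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.transforms.MapElements;
import org.apache.beam.sdk.values.TypeDescriptors;

public class TextIOMinimalExample {
  public static void main(String[] args) {
    PipelineOptions options = PipelineOptionsFactory.fromArgs(args).create();
    Pipeline pipeline = Pipeline.create(options);

    pipeline
        // Read every line of the matching input file(s) into a PCollection<String>.
        .apply("ReadLines", TextIO.read().from("gs://my-bucket/input/*.txt"))
        // Transform each line; here we simply upper-case it.
        .apply("UpperCase", MapElements.into(TypeDescriptors.strings())
            .via((String line) -> line.toUpperCase()))
        // Write the results as sharded text files with the given prefix and suffix.
        .apply("WriteLines", TextIO.write().to("gs://my-bucket/output/result").withSuffix(".txt"));

    pipeline.run().waitUntilFinish();
  }
}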
Example #1
Source File: WordCount.java    From java-docs-samples with Apache License 2.0
public static void main(String[] args) {
  WordCountOptions options = PipelineOptionsFactory.fromArgs(args)
      .withValidation().as(WordCountOptions.class);

  Pipeline pipeline = Pipeline.create(options);
  pipeline
      .apply("Read lines", TextIO.read().from(options.getInputFile()))
      // [END value_provider]
      .apply("Find words", FlatMapElements.into(TypeDescriptors.strings())
          .via((String line) -> Arrays.asList(line.split("[^\\p{L}]+"))))
      .apply("Filter empty words", Filter.by((String word) -> !word.isEmpty()))
      .apply("Filter with substring", ParDo.of(new FilterWithSubstring(
          options.getWithSubstring(), options.getIsCaseSensitive())))
      .apply("Count words", Count.perElement())
      .apply("Format results", MapElements.into(TypeDescriptors.strings())
          .via((KV<String, Long> wordCount) -> wordCount.getKey() + ": " + wordCount.getValue()))
      // [START nested_value_provider]
      .apply("Write results", TextIO.write().to(NestedValueProvider.of(
          options.getOutputBucket(),
          (String bucket) -> String.format("gs://%s/samples/dataflow/wordcount/outputs", bucket)
      )));
      // [END nested_value_provider]
  pipeline.run();
}
 
Example #2
Source File: InvoicingPipeline.java    From nomulus with Apache License 2.0
/** Returns an IO transform that writes the overall invoice to a single CSV file. */
private TextIO.Write writeInvoice(ValueProvider<String> yearMonthProvider) {
  return TextIO.write()
      .to(
          NestedValueProvider.of(
              yearMonthProvider,
              yearMonth ->
                  String.format(
                      "%s/%s/%s/%s-%s",
                      billingBucketUrl,
                      BillingModule.INVOICES_DIRECTORY,
                      yearMonth,
                      invoiceFilePrefix,
                      yearMonth)))
      .withHeader(InvoiceGroupingKey.invoiceHeader())
      .withoutSharding()
      .withSuffix(".csv");
}
 
Example #3
Source File: UserScore.java    From deployment-examples with MIT License
/** Run a batch pipeline. */
// [START DocInclude_USMain]
public static void main(String[] args) throws Exception {
  // Begin constructing a pipeline configured by commandline flags.
  Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
  Pipeline pipeline = Pipeline.create(options);

  // Read events from a text file and parse them.
  pipeline
      .apply(TextIO.read().from(options.getInput()))
      .apply("ParseGameEvent", ParDo.of(new ParseEventFn()))
      // Extract and sum username/score pairs from the event data.
      .apply("ExtractUserScore", new ExtractAndSumScore("user"))
      .apply(
          "WriteUserScoreSums", new WriteToText<>(options.getOutput(), configureOutput(), false));

  // Run the batch pipeline.
  pipeline.run().waitUntilFinish();
}
 
Example #4
Source File: BeamSqlDataCatalogExample.java    From beam with Apache License 2.0
public static void main(String[] args) throws Exception {
  LOG.info("Args: {}", Arrays.asList(args));
  DCExamplePipelineOptions options =
      PipelineOptionsFactory.fromArgs(args).as(DCExamplePipelineOptions.class);
  LOG.info("Query: {}\nOutput: {}", options.getQueryString(), options.getOutputFilePrefix());

  Pipeline pipeline = Pipeline.create(options);

  validateArgs(options);

  try (DataCatalogTableProvider tableProvider =
      DataCatalogTableProvider.create(options.as(DataCatalogPipelineOptions.class))) {
    pipeline
        .apply(
            "SQL Query",
            SqlTransform.query(options.getQueryString())
                .withDefaultTableProvider("datacatalog", tableProvider))
        .apply("Convert to Strings", rowsToStrings())
        .apply("Write output", TextIO.write().to(options.getOutputFilePrefix()));

    pipeline.run().waitUntilFinish();
  }
}
 
Example #5
Source File: TaskTest.java    From beam with Apache License 2.0
@Test
public void textIO() {
  PCollection<String> countries =
      testPipeline.apply(TextIO.read().from("countries.txt"));

  PCollection<String> results = Task.applyTransform(countries);

  PAssert.that(results)
      .containsInAnyOrder(
          "AUSTRALIA",
          "CHINA",
          "ENGLAND",
          "FRANCE",
          "GERMANY",
          "INDONESIA",
          "JAPAN",
          "MEXICO",
          "SINGAPORE",
          "UNITED STATES"
      );

  testPipeline.run().waitUntilFinish();
}
 
Example #6
Source File: DataflowRunnerTest.java    From beam with Apache License 2.0
/** Tests that all reads are consumed by at least one {@link PTransform}. */
@Test
public void testUnconsumedReads() throws IOException {
  DataflowPipelineOptions dataflowOptions = buildPipelineOptions();
  RuntimeTestOptions options = dataflowOptions.as(RuntimeTestOptions.class);
  Pipeline p = buildDataflowPipeline(dataflowOptions);
  p.apply(TextIO.read().from(options.getInput()));
  DataflowRunner.fromOptions(dataflowOptions).replaceTransforms(p);
  final AtomicBoolean unconsumedSeenAsInput = new AtomicBoolean();
  p.traverseTopologically(
      new PipelineVisitor.Defaults() {
        @Override
        public void visitPrimitiveTransform(Node node) {
          unconsumedSeenAsInput.set(true);
        }
      });
  assertThat(unconsumedSeenAsInput.get(), is(true));
}
 
Example #7
Source File: NumShardsTest.java    From beam with Apache License 2.0
@Test
public void testText() throws Exception {
  PCollection<String> inputWords = p.apply(Create.of(WORDS).withCoder(StringUtf8Coder.of()));
  PCollection<String> output =
      inputWords
          .apply(new WordCount.CountWords())
          .apply(MapElements.via(new WordCount.FormatAsTextFn()));
  output.apply(
      TextIO.write().to(outputDir.getAbsolutePath()).withNumShards(3).withSuffix(".txt"));
  p.run().waitUntilFinish();

  int count = 0;
  Set<String> expected = Sets.newHashSet("hi: 5", "there: 1", "sue: 2", "bob: 2");
  for (File f :
      tmpDir.getRoot().listFiles(pathname -> pathname.getName().matches("out-.*\\.txt"))) {
    count++;
    for (String line : Files.readLines(f, StandardCharsets.UTF_8)) {
      assertTrue(line + " not found", expected.remove(line));
    }
  }
  assertEquals(3, count);
  assertTrue(expected.toString(), expected.isEmpty());
}
 
Example #8
Source File: DebuggingWordCount.java    From deployment-examples with MIT License
static void runDebuggingWordCount(WordCountOptions options) {
  Pipeline p = Pipeline.create(options);

  PCollection<KV<String, Long>> filteredWords =
      p.apply("ReadLines", TextIO.read().from(options.getInputFile()))
          .apply(new WordCount.CountWords())
          .apply(ParDo.of(new FilterTextFn(options.getFilterPattern())));

  /*
   * Concept #3: PAssert is a set of convenient PTransforms in the style of
   * Hamcrest's collection matchers that can be used when writing Pipeline level tests
   * to validate the contents of PCollections. PAssert is best used in unit tests
   * with small data sets but is demonstrated here as a teaching tool.
   *
   * <p>Below we verify that the set of filtered words matches our expected counts. Note
   * that PAssert does not provide any output and that successful completion of the
   * Pipeline implies that the expectations were met. Learn more at
   * https://beam.apache.org/documentation/pipelines/test-your-pipeline/ on how to test
   * your Pipeline and see {@link DebuggingWordCountTest} for an example unit test.
   */
  List<KV<String, Long>> expectedResults =
      Arrays.asList(KV.of("Flourish", 3L), KV.of("stomach", 1L));
  PAssert.that(filteredWords).containsInAnyOrder(expectedResults);

  p.run().waitUntilFinish();
}
 
Example #9
Source File: CsvToAvro.java    From java-docs-samples with Apache License 2.0
public static void runCsvToAvro(SampleOptions options)
    throws IOException, IllegalArgumentException {
  FileSystems.setDefaultPipelineOptions(options);

  // Get Avro Schema
  String schemaJson = getSchema(options.getAvroSchema());
  Schema schema = new Schema.Parser().parse(schemaJson);

  // Check schema field types before starting the Dataflow job
  checkFieldTypes(schema);

  // Create the Pipeline object with the options we defined above.
  Pipeline pipeline = Pipeline.create(options);

  // Convert CSV to Avro
  pipeline.apply("Read CSV files", TextIO.read().from(options.getInputFile()))
      .apply("Convert CSV to Avro formatted data",
          ParDo.of(new ConvertCsvToAvro(schemaJson, options.getCsvDelimiter())))
      .setCoder(AvroCoder.of(GenericRecord.class, schema))
      .apply("Write Avro formatted data", AvroIO.writeGenericRecords(schemaJson)
          .to(options.getOutput()).withCodec(CodecFactory.snappyCodec()).withSuffix(".avro"));

  // Run the pipeline.
  pipeline.run().waitUntilFinish();
}
 
Example #10
Source File: UserScore.java    From beam with Apache License 2.0
/** Run a batch pipeline. */
// [START DocInclude_USMain]
public static void main(String[] args) throws Exception {
  // Begin constructing a pipeline configured by commandline flags.
  Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
  Pipeline pipeline = Pipeline.create(options);

  // Read events from a text file and parse them.
  pipeline
      .apply(TextIO.read().from(options.getInput()))
      .apply("ParseGameEvent", ParDo.of(new ParseEventFn()))
      // Extract and sum username/score pairs from the event data.
      .apply("ExtractUserScore", new ExtractAndSumScore("user"))
      .apply(
          "WriteUserScoreSums", new WriteToText<>(options.getOutput(), configureOutput(), false));

  // Run the batch pipeline.
  pipeline.run().waitUntilFinish();
}
 
Example #11
Source File: CsvConverters.java    From DataflowTemplates with Apache License 2.0
@Override
public PCollectionTuple expand(PBegin input) {

  if (hasHeaders()) {
    return input
        .apply("MatchFilePattern", FileIO.match().filepattern(inputFileSpec()))
        .apply("ReadMatches", FileIO.readMatches())
        .apply(
            "ReadCsvWithHeaders",
            ParDo.of(new GetCsvHeadersFn(headerTag(), lineTag(), csvFormat(), delimiter()))
                .withOutputTags(headerTag(), TupleTagList.of(lineTag())));
  }

  return PCollectionTuple.of(
      lineTag(), input.apply("ReadCsvWithoutHeaders", TextIO.read().from(inputFileSpec())));
}
 
Example #12
Source File: TextToPubsubStream.java    From DataflowTemplates with Apache License 2.0
/**
 * Executes the pipeline with the provided execution
 * parameters.
 *
 * @param options The execution parameters.
 */
public static PipelineResult run(Options options) {
  // Create the pipeline.
  Pipeline pipeline = Pipeline.create(options);

  /*
   * Steps:
   *  1) Read from the text source.
   *  2) Write each text record to Pub/Sub
   */
  pipeline
    .apply(
      "Read Text Data",
      TextIO.read()
        .from(options.getInputFilePattern())
        .watchForNewFiles(DEFAULT_POLL_INTERVAL, Watch.Growth.never()))
    .apply("Write to PubSub", PubsubIO.writeStrings().to(options.getOutputTopic()));

  return pipeline.run();
}
 
Example #13
Source File: DatastoreSchemasCountToText.java    From DataflowTemplates with Apache License 2.0
/**
 * Runs a pipeline which reads Entities from Datastore, parses each Entity's schema,
 * and counts the number of unique schemas.
 *
 * @param args arguments to the pipeline
 */
public static void main(String[] args) {
  DatastoreSchemaCountToTextOptions options = PipelineOptionsFactory.fromArgs(args)
      .withValidation()
      .as(DatastoreSchemaCountToTextOptions.class);

  Pipeline pipeline = Pipeline.create(options);

  pipeline
      .apply(DatastoreReadSchemaCount.newBuilder()
          .setGqlQuery(options.getDatastoreReadGqlQuery())
          .setProjectId(options.getDatastoreReadProjectId())
          .setNamespace(options.getDatastoreReadNamespace())
          .build())
      .apply(TextIO.write()
          .to(options.getTextWritePrefix())
          .withSuffix(".json"));

  pipeline.run();
}
 
Example #14
Source File: TextToPubsub.java    From DataflowTemplates with Apache License 2.0
/**
 * Executes the pipeline with the provided execution
 * parameters.
 *
 * @param options The execution parameters.
 */
public static PipelineResult run(Options options) {
  // Create the pipeline.
  Pipeline pipeline = Pipeline.create(options);

  /*
   * Steps:
   *  1) Read from the text source.
   *  2) Write each text record to Pub/Sub
   */
  pipeline
      .apply("Read Text Data", TextIO.read().from(options.getInputFilePattern()))
      .apply("Write to PubSub", PubsubIO.writeStrings().to(options.getOutputTopic()));

  return pipeline.run();
}
 
Example #15
Source File: Main.java    From component-runtime with Apache License 2.0
public static void main(final String[] args) throws IOException {
    final Config options = PipelineOptionsFactory.fromArgs(args).as(Config.class);
    final Pipeline pipeline = Pipeline.create(options);
    try (final FileWriter writer = new FileWriter(options.getInputFile())) {
        writer.write("normal;6\nmarilyn;36");
    }

    final ComponentManager manager = ComponentManager.instance();
    pipeline.apply(TalendIO.read(manager.findMapper("sample", "reader", 1, new HashMap<String, String>() {

        {
            put("old_file", options.getInputFile()); // will be migrated to "file" with the migration handler
        }
    }).orElseThrow(() -> new IllegalArgumentException("No reader sample#reader, existing: " + manager.availablePlugins()))))
            .apply(new ViewsMappingTransform(emptyMap(), "sample"))
            .apply(TalendFn.asFn(manager.findProcessor("sample", "mapper", 1, emptyMap())
                    .orElseThrow(() -> new IllegalStateException("didn't find the processor"))))
            .apply(ParDo.of(new ToStringFn()))
            .apply(TextIO.write().to(ValueProvider.StaticValueProvider.of(options.getOutputFile())));
    final PipelineResult.State state = pipeline.run().waitUntilFinish();
    System.out.println(state);
}
 
Example #16
Source File: DataflowPipelineTranslatorTest.java    From beam with Apache License 2.0
@Test
public void testInaccessibleProvider() throws Exception {
  DataflowPipelineOptions options = buildPipelineOptions();
  Pipeline pipeline = Pipeline.create(options);
  DataflowPipelineTranslator t = DataflowPipelineTranslator.fromOptions(options);

  pipeline.apply(TextIO.read().from(new TestValueProvider()));

  // Check that translation does not fail.
  SdkComponents sdkComponents = createSdkComponents(options);
  RunnerApi.Pipeline pipelineProto = PipelineTranslation.toProto(pipeline, sdkComponents, true);
  t.translate(
      pipeline,
      pipelineProto,
      sdkComponents,
      DataflowRunner.fromOptions(options),
      Collections.emptyList());
}
 
Example #17
Source File: DatastoreToText.java    From DataflowTemplates with Apache License 2.0
/**
 * Runs a pipeline which reads Entities from Datastore, passes the JSON-encoded Entities
 * to a JavaScript UDF, and writes the JSON to a TextIO sink.
 *
 * @param args arguments to the pipeline
 */
public static void main(String[] args) {
  DatastoreToTextOptions options = PipelineOptionsFactory.fromArgs(args)
      .withValidation()
      .as(DatastoreToTextOptions.class);

  Pipeline pipeline = Pipeline.create(options);

  pipeline
      .apply(ReadJsonEntities.newBuilder()
          .setGqlQuery(options.getDatastoreReadGqlQuery())
          .setProjectId(options.getDatastoreReadProjectId())
          .setNamespace(options.getDatastoreReadNamespace())
          .build())
      .apply(TransformTextViaJavascript.newBuilder()
          .setFileSystemPath(options.getJavascriptTextTransformGcsPath())
          .setFunctionName(options.getJavascriptTextTransformFunctionName())
          .build())
      .apply(TextIO.write()
          .to(options.getTextWritePrefix())
          .withSuffix(".json"));

  pipeline.run();
}
 
Example #18
Source File: MyBeamJob.java    From hazelcast-jet-demos with Apache License 2.0
public static Pipeline build(PipelineOptions pipelineOptions) {
  Pipeline pipeline = Pipeline.create(pipelineOptions);

  pipeline
      .apply("unbounded-source",
          Read.from(new MyUnboundedSource("beam-input")))
      .apply("reformat-and-timestamp",
          ParDo.of(new MyEnrichAndReformatFn()))
      .apply("window",
          Window.<String>into(FixedWindows.of(ONE_SECOND))
              .triggering(Repeatedly.forever(AfterProcessingTime.pastFirstElementInPane()))
              .discardingFiredPanes()
              .withAllowedLateness(ONE_SECOND))
      .apply("sink",
          FileIO.<String>write()
              .via(TextIO.sink())
              .to(".")
              .withPrefix("beam-output")
              .withNumShards(1));

  return pipeline;
}
 
Example #19
Source File: BulkCompressorTest.java    From DataflowTemplates with Apache License 2.0
/** Tests that the {@link BulkCompressor.Compressor} performs compression properly. */
@Test
public void testCompressFile() throws Exception {
  // Setup test
  final Compression compression = Compression.GZIP;

  final ValueProvider<String> outputDirectoryProvider =
      pipeline.newProvider(tempFolderCompressedPath.toString());

  final ValueProvider<Compression> compressionProvider = StaticValueProvider.of(compression);

  final Metadata metadata = FileSystems.matchSingleFileSpec(textFile.toString());

  // Execute the compressor
  PCollection<String> lines = pipeline
      .apply("Create File Input", Create.of(metadata))
      .apply("Compress", ParDo.of(new Compressor(outputDirectoryProvider, compressionProvider)))
      .apply("Read the Files", TextIO.readAll().withCompression(Compression.AUTO));

  // Test the result
  PAssert.that(lines).containsInAnyOrder(FILE_CONTENT);
  pipeline.run();
}
 
Example #20
Source File: TfIdf.java    From beam with Apache License 2.0
@Override
public PCollection<KV<URI, String>> expand(PBegin input) {
  Pipeline pipeline = input.getPipeline();

  // Create one TextIO.Read transform for each document
  // and add its output to a PCollectionList
  PCollectionList<KV<URI, String>> urisToLines = PCollectionList.empty(pipeline);

  // TextIO.Read supports:
  //  - file: URIs and paths locally
  //  - gs: URIs on the service
  for (final URI uri : uris) {
    String uriString;
    if ("file".equals(uri.getScheme())) {
      uriString = new File(uri).getPath();
    } else {
      uriString = uri.toString();
    }

    PCollection<KV<URI, String>> oneUriToLines =
        pipeline
            .apply("TextIO.Read(" + uriString + ")", TextIO.read().from(uriString))
            .apply("WithKeys(" + uriString + ")", WithKeys.of(uri))
            .setCoder(KvCoder.of(StringDelegateCoder.of(URI.class), StringUtf8Coder.of()));

    urisToLines = urisToLines.and(oneUriToLines);
  }

  return urisToLines.apply(Flatten.pCollections());
}
 
Example #21
Source File: DistinctExample.java    From beam with Apache License 2.0
public static void main(String[] args) throws Exception {

    Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
    Pipeline p = Pipeline.create(options);

    p.apply("ReadLines", TextIO.read().from(options.getInput()))
        .apply(Distinct.create())
        .apply("DedupedShakespeare", TextIO.write().to(options.getOutput()));

    p.run().waitUntilFinish();
  }
 
Example #22
Source File: DisplayDataEvaluatorTest.java    From beam with Apache License 2.0
@Test
public void testSourceTransform() {
  PTransform<? super PBegin, ? extends POutput> myTransform = TextIO.read().from("foo.*");

  DisplayDataEvaluator evaluator = DisplayDataEvaluator.create();
  Set<DisplayData> displayData = evaluator.displayDataForPrimitiveSourceTransforms(myTransform);

  assertThat(displayData, hasItem(hasDisplayItem("filePattern", "foo.*")));
}
 
Example #23
Source File: WordCount.java    From twister2 with Apache License 2.0
static void runWordCount(Twister2PipelineOptions options, String input, String output) {
  Pipeline p = Pipeline.create(options);

  // Concepts #2 and #3: Our pipeline applies the composite CountWords transform, and passes the
  // static FormatAsTextFn() to the ParDo transform.
  p.apply("ReadLines", TextIO.read().from(input))
      .apply(new CountWords())
      .apply(MapElements.via(new FormatAsTextFn()))
      .apply("WriteCounts", TextIO.write().to(output));

  p.run().waitUntilFinish();
}
 
Example #24
Source File: FileIndexerPipeline.java    From dataflow-opinion-analysis with Apache License 2.0
/**
 * Creates the DAG of transforms for the indexer pipeline. It can be called from main()
 * as well as from the ControlPipeline.
 *
 * @param options the pipeline options
 * @return the constructed pipeline
 * @throws Exception
 */
public static Pipeline createIndexerPipeline(FileIndexerPipelineOptions options) throws Exception {
  IndexerPipelineUtils.validateIndexerPipelineOptions(options);
  Pipeline pipeline = Pipeline.create(options);

  // PHASE: Read raw content from sources
  PCollection<InputContent> readContent = pipeline
      .apply("Read entire CSV file", org.apache.beam.sdk.io.Read.from(new RecordFileSource<String>(
          ValueProvider.StaticValueProvider.of(options.getInputFile()),
          StringUtf8Coder.of(), RecordFileSource.DEFAULT_RECORD_SEPARATOR)))
      .apply("Parse CSV file into InputContent objects", ParDo.of(new IndexerPipeline.ParseCSVFile()));

  // Define the accumulators of all filters
  PCollection<InputContent> contentToIndex = readContent;

  // PHASE: Index documents (extract opinions and entities/tags).
  // Return successfully indexed docs, and create a Bigtable write transform to store errors
  // in the Dead Letter table.
  PCollection<ContentIndexSummary> indexes = indexDocuments(options, contentToIndex);

  if (options.getRatioEnrichWithCNLP() > 0) {
    indexes = enrichWithCNLP(indexes, options.getRatioEnrichWithCNLP());
  }

  // PHASE: Write to BigQuery
  // For the indexes that are unique ("filteredIndexes"), create records in webresource, document, and sentiment.
  // Then, merge resulting webresources with webresourceRowsUnindexed and webresourceDeduped.
  indexes
      .apply(ParDo.of(new CreateCSVLineFromIndexSummaryFn()))
      .apply(TextIO.write()
          .to(options.getOutputFile()));

  return pipeline;
}
 
Example #25
Source File: TemplatePipeline.java    From gcp-batch-ingestion-bigquery with Apache License 2.0
public static void main(String[] args) {
    PipelineOptionsFactory.register(TemplateOptions.class);
    TemplateOptions options = PipelineOptionsFactory.fromArgs(args).withValidation().as(TemplateOptions.class);
    Pipeline pipeline = Pipeline.create(options);
    pipeline.apply("READ", TextIO.read().from(options.getInputFile()))
            .apply("TRANSFORM", ParDo.of(new WikiParDo()))
            .apply("WRITE", BigQueryIO.writeTableRows()
                    .to(String.format("%s:dotc_2018.wiki_demo", options.getProject()))
                    .withCreateDisposition(CREATE_IF_NEEDED)
                    .withWriteDisposition(WRITE_APPEND)
                    .withSchema(getTableSchema()));
    pipeline.run();
}
 
Example #26
Source File: WindowingTest.java    From beam with Apache License 2.0
@Test
@Category(NeedsRunner.class)
public void testTextIoInput() throws Exception {
  File tmpFile = tmpFolder.newFile("file.txt");
  String filename = tmpFile.getPath();

  try (PrintStream writer = new PrintStream(new FileOutputStream(tmpFile))) {
    writer.println("a 1");
    writer.println("b 2");
    writer.println("b 3");
    writer.println("c 11");
    writer.println("d 11");
  }

  PCollection<String> output =
      p.begin()
          .apply("ReadLines", TextIO.read().from(filename))
          .apply(ParDo.of(new ExtractWordsWithTimestampsFn()))
          .apply(new WindowedCount(FixedWindows.of(Duration.millis(10))));

  PAssert.that(output)
      .containsInAnyOrder(
          output("a", 1, 1, 0, 10),
          output("b", 2, 2, 0, 10),
          output("c", 1, 11, 10, 20),
          output("d", 1, 11, 10, 20));

  p.run();
}
 
Example #27
Source File: NameUtilsTest.java    From beam with Apache License 2.0
@Test
public void testPTransformName() {
  EmbeddedPTransform transform = new EmbeddedPTransform();
  assertEquals(
      "NameUtilsTest.EmbeddedPTransform",
      NameUtils.approximatePTransformName(transform.getClass()));
  assertEquals(
      "NameUtilsTest.EmbeddedPTransform",
      NameUtils.approximatePTransformName(transform.getBound().getClass()));
  assertEquals(
      "NameUtilsTest.SomeTransform",
      NameUtils.approximatePTransformName(AutoValue_NameUtilsTest_SomeTransform.class));
  assertEquals("TextIO.Write", NameUtils.approximatePTransformName(TextIO.Write.class));
}
 
Example #28
Source File: GcsKmsKeyIT.java    From beam with Apache License 2.0
/**
 * Tests writing to tempLocation with --dataflowKmsKey set on the command line. Verifies that
 * the resulting output uses the specified key and is readable. Does not verify any temporary files.
 *
 * <p>This test verifies that GCS file copies work with CMEK-enabled files.
 */
@Test
public void testGcsWriteWithKmsKey() {
  TestPipelineOptions options =
      TestPipeline.testingPipelineOptions().as(TestPipelineOptions.class);
  assertNotNull(options.getTempRoot());
  options.setTempLocation(options.getTempRoot() + "/testGcsWriteWithKmsKey");
  GcsOptions gcsOptions = options.as(GcsOptions.class);

  ResourceId filenamePrefix =
      FileSystems.matchNewResource(gcsOptions.getGcpTempLocation(), true)
          .resolve(
              String.format("GcsKmsKeyIT-%tF-%<tH-%<tM-%<tS-%<tL.output", new Date()),
              StandardResolveOptions.RESOLVE_FILE);

  Pipeline p = Pipeline.create(options);
  p.apply("ReadLines", TextIO.read().from(INPUT_FILE))
      .apply("WriteLines", TextIO.write().to(filenamePrefix));

  PipelineResult result = p.run();
  State state = result.waitUntilFinish();
  assertThat(state, equalTo(State.DONE));

  String filePattern = filenamePrefix + "*-of-*";
  assertThat(new NumberedShardedFile(filePattern), fileContentsHaveChecksum(EXPECTED_CHECKSUM));

  // Verify objects have KMS key set.
  try {
    MatchResult matchResult =
        Iterables.getOnlyElement(FileSystems.match(Collections.singletonList(filePattern)));
    GcsUtil gcsUtil = gcsOptions.getGcsUtil();
    for (Metadata metadata : matchResult.metadata()) {
      String kmsKey =
          gcsUtil.getObject(GcsPath.fromUri(metadata.resourceId().toString())).getKmsKeyName();
      assertNotNull(kmsKey);
    }
  } catch (IOException e) {
    throw new AssertionError(e);
  }
}
 
Example #29
Source File: BeamInputTransform.java    From hop with Apache License 2.0
@Override public PCollection<HopRow> expand( PBegin input ) {

    try {
      // Only initialize once on this node/vm
      //
      BeamHop.init(transformPluginClasses, xpPluginClasses);

      // System.out.println("-------------- TextIO.Read from "+inputLocation+" (UNCOMPRESSED)");

      TextIO.Read ioRead = TextIO.read()
        .from( inputLocation )
        .withCompression( Compression.UNCOMPRESSED )
        ;

      StringToHopFn stringToHopFn = new StringToHopFn( transformName, rowMetaJson, separator, transformPluginClasses, xpPluginClasses );

      PCollection<HopRow> output = input

        // We read a bunch of Strings, one per line basically
        //
        .apply( transformName + " READ FILE",  ioRead )

        // We need to transform these lines into Hop fields
        //
        .apply( transformName, ParDo.of( stringToHopFn ) );

      return output;

    } catch ( Exception e ) {
      numErrors.inc();
      LOG.error( "Error in beam input transform", e );
      throw new RuntimeException( "Error in beam input transform", e );
    }

  }
 
Example #30
Source File: WordCount.java    From beam-starter with Apache License 2.0
public static void main(String[] args) {

    Options options = PipelineOptionsFactory.fromArgs(args).withValidation()
        .as(Options.class);
    options.setRunner(FlinkRunner.class);

    Pipeline p = Pipeline.create(options);

    p.apply("ReadLines", TextIO.Read.from(options.getInput()))
        .apply(new CountWords())
        .apply(MapElements.via(new FormatAsTextFn()))
        .apply("WriteCounts", TextIO.Write.to(options.getOutput()));

    p.run();
  }