Java Code Examples for org.apache.beam.sdk.io.TextIO

The following examples show how to use org.apache.beam.sdk.io.TextIO. These examples are extracted from open source projects; each one notes its source project, source file, and license.
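For orientation before the individual examples, a minimal read-transform-write pipeline built around TextIO looks roughly like the following sketch (the file paths and the upper-casing step are placeholders, not taken from any project below):

import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.io.TextIO;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.transforms.MapElements;
import org.apache.beam.sdk.values.TypeDescriptors;

public class MinimalTextIOExample {
  public static void main(String[] args) {
    Pipeline pipeline = Pipeline.create(PipelineOptionsFactory.fromArgs(args).create());

    pipeline
        // Read every line of the matched files into a PCollection<String>.
        .apply("ReadLines", TextIO.read().from("gs://my-bucket/input/*.txt"))
        // Any per-element transform goes here; upper-casing is just an illustration.
        .apply("UpperCase",
            MapElements.into(TypeDescriptors.strings()).via((String line) -> line.toUpperCase()))
        // Write sharded text files using the given prefix.
        .apply("WriteLines", TextIO.write().to("gs://my-bucket/output/result"));

    pipeline.run().waitUntilFinish();
  }
}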
Example 1
Source Project: java-docs-samples   Source File: WordCount.java    License: Apache License 2.0
public static void main(String[] args) {
  WordCountOptions options = PipelineOptionsFactory.fromArgs(args)
      .withValidation().as(WordCountOptions.class);

  Pipeline pipeline = Pipeline.create(options);
  pipeline
      .apply("Read lines", TextIO.read().from(options.getInputFile()))
      // [END value_provider]
      .apply("Find words", FlatMapElements.into(TypeDescriptors.strings())
          .via((String line) -> Arrays.asList(line.split("[^\\p{L}]+"))))
      .apply("Filter empty words", Filter.by((String word) -> !word.isEmpty()))
      .apply("Filter with substring", ParDo.of(new FilterWithSubstring(
          options.getWithSubstring(), options.getIsCaseSensitive())))
      .apply("Count words", Count.perElement())
      .apply("Format results", MapElements.into(TypeDescriptors.strings())
          .via((KV<String, Long> wordCount) -> wordCount.getKey() + ": " + wordCount.getValue()))
      // [START nested_value_provider]
      .apply("Write results", TextIO.write().to(NestedValueProvider.of(
          options.getOutputBucket(),
          (String bucket) -> String.format("gs://%s/samples/dataflow/wordcount/outputs", bucket)
      )));
      // [END nested_value_provider]
  pipeline.run();
}
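The WordCountOptions interface itself is not part of the snippet above; a minimal sketch of what the pipeline appears to assume is shown below (field names match the getters used above, but the exact types and annotations in the real sample may differ; getOutputBucket() must return a ValueProvider for NestedValueProvider.of to work):

public interface WordCountOptions extends PipelineOptions {
  @Description("Path of the file to read from")
  String getInputFile();
  void setInputFile(String value);

  @Description("Only count words containing this substring")
  String getWithSubstring();
  void setWithSubstring(String value);

  @Description("Whether the substring match is case sensitive")
  Boolean getIsCaseSensitive();
  void setIsCaseSensitive(Boolean value);

  @Description("Bucket used to build the output path when the template runs")
  ValueProvider<String> getOutputBucket();
  void setOutputBucket(ValueProvider<String> value);
}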
 
Example 2
Source Project: deployment-examples   Source File: UserScore.java    License: MIT License
/** Run a batch pipeline. */
// [START DocInclude_USMain]
public static void main(String[] args) throws Exception {
  // Begin constructing a pipeline configured by commandline flags.
  Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
  Pipeline pipeline = Pipeline.create(options);

  // Read events from a text file and parse them.
  pipeline
      .apply(TextIO.read().from(options.getInput()))
      .apply("ParseGameEvent", ParDo.of(new ParseEventFn()))
      // Extract and sum username/score pairs from the event data.
      .apply("ExtractUserScore", new ExtractAndSumScore("user"))
      .apply(
          "WriteUserScoreSums", new WriteToText<>(options.getOutput(), configureOutput(), false));

  // Run the batch pipeline.
  pipeline.run().waitUntilFinish();
}
 
Example 3
Source Project: deployment-examples   Source File: DebuggingWordCount.java    License: MIT License
static void runDebuggingWordCount(WordCountOptions options) {
  Pipeline p = Pipeline.create(options);

  PCollection<KV<String, Long>> filteredWords =
      p.apply("ReadLines", TextIO.read().from(options.getInputFile()))
          .apply(new WordCount.CountWords())
          .apply(ParDo.of(new FilterTextFn(options.getFilterPattern())));

  /*
   * Concept #3: PAssert is a set of convenient PTransforms in the style of
   * Hamcrest's collection matchers that can be used when writing Pipeline level tests
   * to validate the contents of PCollections. PAssert is best used in unit tests
   * with small data sets but is demonstrated here as a teaching tool.
   *
   * <p>Below we verify that the set of filtered words matches our expected counts. Note
   * that PAssert does not provide any output and that successful completion of the
   * Pipeline implies that the expectations were met. Learn more at
   * https://beam.apache.org/documentation/pipelines/test-your-pipeline/ on how to test
   * your Pipeline and see {@link DebuggingWordCountTest} for an example unit test.
   */
  List<KV<String, Long>> expectedResults =
      Arrays.asList(KV.of("Flourish", 3L), KV.of("stomach", 1L));
  PAssert.that(filteredWords).containsInAnyOrder(expectedResults);

  p.run().waitUntilFinish();
}
 
Example 4
Source Project: DataflowTemplates   Source File: CsvConverters.java    License: Apache License 2.0
@Override
public PCollectionTuple expand(PBegin input) {

  if (hasHeaders()) {
    return input
        .apply("MatchFilePattern", FileIO.match().filepattern(inputFileSpec()))
        .apply("ReadMatches", FileIO.readMatches())
        .apply(
            "ReadCsvWithHeaders",
            ParDo.of(new GetCsvHeadersFn(headerTag(), lineTag(), csvFormat(), delimiter()))
                .withOutputTags(headerTag(), TupleTagList.of(lineTag())));
  }

  return PCollectionTuple.of(
      lineTag(), input.apply("ReadCsvWithoutHeaders", TextIO.read().from(inputFileSpec())));
}
 
Example 5
Source Project: DataflowTemplates   Source File: TextToPubsubStream.java    License: Apache License 2.0
/**
 * Executes the pipeline with the provided execution
 * parameters.
 *
 * @param options The execution parameters.
 */
public static PipelineResult run(Options options) {
  // Create the pipeline.
  Pipeline pipeline = Pipeline.create(options);

  /*
   * Steps:
   *  1) Read from the text source.
   *  2) Write each text record to Pub/Sub
   */
  pipeline
    .apply(
      "Read Text Data",
      TextIO.read()
        .from(options.getInputFilePattern())
        .watchForNewFiles(DEFAULT_POLL_INTERVAL, Watch.Growth.never()))
    .apply("Write to PubSub", PubsubIO.writeStrings().to(options.getOutputTopic()));

  return pipeline.run();
}
 
Example 6
/**
 * Runs a pipeline which reads Entities from Datastore, parses each Entity's schema,
 * and counts the number of unique schemas.
 * @param args arguments to the pipeline
 */
public static void main(String[] args) {
  DatastoreSchemaCountToTextOptions options = PipelineOptionsFactory.fromArgs(args)
      .withValidation()
      .as(DatastoreSchemaCountToTextOptions.class);

  Pipeline pipeline = Pipeline.create(options);

  pipeline
      .apply(DatastoreReadSchemaCount.newBuilder()
          .setGqlQuery(options.getDatastoreReadGqlQuery())
          .setProjectId(options.getDatastoreReadProjectId())
          .setNamespace(options.getDatastoreReadNamespace())
          .build())
      .apply(TextIO.write()
          .to(options.getTextWritePrefix())
          .withSuffix(".json"));

  pipeline.run();
}
 
Example 7
Source Project: DataflowTemplates   Source File: TextToPubsub.java    License: Apache License 2.0
/**
 * Executes the pipeline with the provided execution
 * parameters.
 *
 * @param options The execution parameters.
 */
public static PipelineResult run(Options options) {
  // Create the pipeline.
  Pipeline pipeline = Pipeline.create(options);

  /*
   * Steps:
   *  1) Read from the text source.
   *  2) Write each text record to Pub/Sub
   */
  pipeline
      .apply("Read Text Data", TextIO.read().from(options.getInputFilePattern()))
      .apply("Write to PubSub", PubsubIO.writeStrings().to(options.getOutputTopic()));

  return pipeline.run();
}
 
Example 8
Source Project: DataflowTemplates   Source File: DatastoreToText.java    License: Apache License 2.0
/**
 * Runs a pipeline which reads Entities from Datastore, passes the JSON-encoded Entities
 * to a JavaScript UDF, and writes the resulting JSON to a TextIO sink.
 *
 * @param args arguments to the pipeline
 */
public static void main(String[] args) {
  DatastoreToTextOptions options = PipelineOptionsFactory.fromArgs(args)
      .withValidation()
      .as(DatastoreToTextOptions.class);

  Pipeline pipeline = Pipeline.create(options);

  pipeline
      .apply(ReadJsonEntities.newBuilder()
          .setGqlQuery(options.getDatastoreReadGqlQuery())
          .setProjectId(options.getDatastoreReadProjectId())
          .setNamespace(options.getDatastoreReadNamespace())
          .build())
      .apply(TransformTextViaJavascript.newBuilder()
          .setFileSystemPath(options.getJavascriptTextTransformGcsPath())
          .setFunctionName(options.getJavascriptTextTransformFunctionName())
          .build())
      .apply(TextIO.write()
          .to(options.getTextWritePrefix())
          .withSuffix(".json"));

  pipeline.run();
}
 
Example 9
Source Project: DataflowTemplates   Source File: BulkCompressorTest.java    License: Apache License 2.0
/** Tests that the {@link BulkCompressor.Compressor} performs compression properly. */
@Test
public void testCompressFile() throws Exception {
  // Setup test
  final Compression compression = Compression.GZIP;

  final ValueProvider<String> outputDirectoryProvider =
      pipeline.newProvider(tempFolderCompressedPath.toString());

  final ValueProvider<Compression> compressionProvider = StaticValueProvider.of(compression);

  final Metadata metadata = FileSystems.matchSingleFileSpec(textFile.toString());

  // Execute the compressor
  PCollection<String> lines = pipeline
      .apply("Create File Input", Create.of(metadata))
      .apply("Compress", ParDo.of(new Compressor(outputDirectoryProvider, compressionProvider)))
      .apply("Read the Files", TextIO.readAll().withCompression(Compression.AUTO));

  // Test the result
  PAssert.that(lines).containsInAnyOrder(FILE_CONTENT);
  pipeline.run();
}
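Note that TextIO.readAll(), used in the final step above, has been deprecated in more recent Beam releases. A roughly equivalent read, sketched here under the assumption that the compressed file paths arrive as a PCollection<String> named compressedFiles, would be:

PCollection<String> lines = compressedFiles
    // Expand each path or pattern into file metadata.
    .apply("MatchAll", FileIO.matchAll())
    // Turn the matches into readable files, auto-detecting compression.
    .apply("ReadMatches", FileIO.readMatches().withCompression(Compression.AUTO))
    // Read each file line by line.
    .apply("ReadLines", TextIO.readFiles());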
 
Example 10
Source Project: nomulus   Source File: InvoicingPipeline.java    License: Apache License 2.0
/** Returns an IO transform that writes the overall invoice to a single CSV file. */
private TextIO.Write writeInvoice(ValueProvider<String> yearMonthProvider) {
  return TextIO.write()
      .to(
          NestedValueProvider.of(
              yearMonthProvider,
              yearMonth ->
                  String.format(
                      "%s/%s/%s/%s-%s",
                      billingBucketUrl,
                      BillingModule.INVOICES_DIRECTORY,
                      yearMonth,
                      invoiceFilePrefix,
                      yearMonth)))
      .withHeader(InvoiceGroupingKey.invoiceHeader())
      .withoutSharding()
      .withSuffix(".csv");
}
 
Example 11
Source Project: hazelcast-jet-demos   Source File: MyBeamJob.java    License: Apache License 2.0
public static Pipeline build(PipelineOptions pipelineOptions) {

    Pipeline pipeline = Pipeline.create(pipelineOptions);

    pipeline
        .apply("unbounded-source",
            Read.from(new MyUnboundedSource("beam-input")))
        .apply("reformat-and-timestamp",
            ParDo.of(new MyEnrichAndReformatFn()))
        .apply("window",
            Window.<String>into(FixedWindows.of(ONE_SECOND))
                .triggering(Repeatedly.forever(AfterProcessingTime.pastFirstElementInPane()))
                .discardingFiredPanes()
                .withAllowedLateness(ONE_SECOND))
        .apply("sink",
            FileIO.<String>write()
                .via(TextIO.sink())
                .to(".")
                .withPrefix("beam-output")
                .withNumShards(1));

    return pipeline;
}
 
Example 12
Source Project: beam   Source File: DataflowPipelineTranslatorTest.java    License: Apache License 2.0
@Test
public void testInaccessibleProvider() throws Exception {
  DataflowPipelineOptions options = buildPipelineOptions();
  Pipeline pipeline = Pipeline.create(options);
  DataflowPipelineTranslator t = DataflowPipelineTranslator.fromOptions(options);

  pipeline.apply(TextIO.read().from(new TestValueProvider()));

  // Check that translation does not fail.
  SdkComponents sdkComponents = createSdkComponents(options);
  RunnerApi.Pipeline pipelineProto = PipelineTranslation.toProto(pipeline, sdkComponents, true);
  t.translate(
      pipeline,
      pipelineProto,
      sdkComponents,
      DataflowRunner.fromOptions(options),
      Collections.emptyList());
}
 
Example 13
Source Project: component-runtime   Source File: Main.java    License: Apache License 2.0
public static void main(final String[] args) throws IOException {
    final Config options = PipelineOptionsFactory.fromArgs(args).as(Config.class);
    final Pipeline pipeline = Pipeline.create(options);
    try (final FileWriter writer = new FileWriter(options.getInputFile())) {
        writer.write("normal;6\nmarilyn;36");
    }

    final ComponentManager manager = ComponentManager.instance();
    pipeline.apply(TalendIO.read(manager.findMapper("sample", "reader", 1, new HashMap<String, String>() {

        {
            put("old_file", options.getInputFile()); // will be migrated to "file" with the migration handler
        }
    }).orElseThrow(() -> new IllegalArgumentException("No reader sample#reader, existing: " + manager.availablePlugins()))))
            .apply(new ViewsMappingTransform(emptyMap(), "sample"))
            .apply(TalendFn.asFn(manager.findProcessor("sample", "mapper", 1, emptyMap())
                    .orElseThrow(() -> new IllegalStateException("didn't find the processor"))))
            .apply(ParDo.of(new ToStringFn()))
            .apply(TextIO.write().to(ValueProvider.StaticValueProvider.of(options.getOutputFile())));
    final PipelineResult.State state = pipeline.run().waitUntilFinish();
    System.out.println(state);
}
 
Example 14
Source Project: beam   Source File: UserScore.java    License: Apache License 2.0
/** Run a batch pipeline. */
// [START DocInclude_USMain]
public static void main(String[] args) throws Exception {
  // Begin constructing a pipeline configured by commandline flags.
  Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
  Pipeline pipeline = Pipeline.create(options);

  // Read events from a text file and parse them.
  pipeline
      .apply(TextIO.read().from(options.getInput()))
      .apply("ParseGameEvent", ParDo.of(new ParseEventFn()))
      // Extract and sum username/score pairs from the event data.
      .apply("ExtractUserScore", new ExtractAndSumScore("user"))
      .apply(
          "WriteUserScoreSums", new WriteToText<>(options.getOutput(), configureOutput(), false));

  // Run the batch pipeline.
  pipeline.run().waitUntilFinish();
}
 
Example 15
Source Project: java-docs-samples   Source File: CsvToAvro.java    License: Apache License 2.0
public static void runCsvToAvro(SampleOptions options)
    throws IOException, IllegalArgumentException {
  FileSystems.setDefaultPipelineOptions(options);

  // Get Avro Schema
  String schemaJson = getSchema(options.getAvroSchema());
  Schema schema = new Schema.Parser().parse(schemaJson);

  // Check schema field types before starting the Dataflow job
  checkFieldTypes(schema);

  // Create the Pipeline object with the options we defined above.
  Pipeline pipeline = Pipeline.create(options);

  // Convert CSV to Avro
  pipeline.apply("Read CSV files", TextIO.read().from(options.getInputFile()))
      .apply("Convert CSV to Avro formatted data",
          ParDo.of(new ConvertCsvToAvro(schemaJson, options.getCsvDelimiter())))
      .setCoder(AvroCoder.of(GenericRecord.class, schema))
      .apply("Write Avro formatted data", AvroIO.writeGenericRecords(schemaJson)
          .to(options.getOutput()).withCodec(CodecFactory.snappyCodec()).withSuffix(".avro"));

  // Run the pipeline.
  pipeline.run().waitUntilFinish();
}
 
Example 16
Source Project: beam   Source File: DataflowRunnerTest.java    License: Apache License 2.0
/** Tests that all reads are consumed by at least one {@link PTransform}. */
@Test
public void testUnconsumedReads() throws IOException {
  DataflowPipelineOptions dataflowOptions = buildPipelineOptions();
  RuntimeTestOptions options = dataflowOptions.as(RuntimeTestOptions.class);
  Pipeline p = buildDataflowPipeline(dataflowOptions);
  p.apply(TextIO.read().from(options.getInput()));
  DataflowRunner.fromOptions(dataflowOptions).replaceTransforms(p);
  final AtomicBoolean unconsumedSeenAsInput = new AtomicBoolean();
  p.traverseTopologically(
      new PipelineVisitor.Defaults() {
        @Override
        public void visitPrimitiveTransform(Node node) {
          unconsumedSeenAsInput.set(true);
        }
      });
  assertThat(unconsumedSeenAsInput.get(), is(true));
}
 
Example 17
Source Project: beam   Source File: TaskTest.java    License: Apache License 2.0
@Test
public void textIO() {
  PCollection<String> countries =
      testPipeline.apply(TextIO.read().from("countries.txt"));

  PCollection<String> results = Task.applyTransform(countries);

  PAssert.that(results)
      .containsInAnyOrder(
          "AUSTRALIA",
          "CHINA",
          "ENGLAND",
          "FRANCE",
          "GERMANY",
          "INDONESIA",
          "JAPAN",
          "MEXICO",
          "SINGAPORE",
          "UNITED STATES"
      );

  testPipeline.run().waitUntilFinish();
}
 
Example 18
Source Project: beam   Source File: BeamSqlDataCatalogExample.java    License: Apache License 2.0
public static void main(String[] args) throws Exception {
  LOG.info("Args: {}", Arrays.asList(args));
  DCExamplePipelineOptions options =
      PipelineOptionsFactory.fromArgs(args).as(DCExamplePipelineOptions.class);
  LOG.info("Query: {}\nOutput: {}", options.getQueryString(), options.getOutputFilePrefix());

  Pipeline pipeline = Pipeline.create(options);

  validateArgs(options);

  try (DataCatalogTableProvider tableProvider =
      DataCatalogTableProvider.create(options.as(DataCatalogPipelineOptions.class))) {
    pipeline
        .apply(
            "SQL Query",
            SqlTransform.query(options.getQueryString())
                .withDefaultTableProvider("datacatalog", tableProvider))
        .apply("Convert to Strings", rowsToStrings())
        .apply("Write output", TextIO.write().to(options.getOutputFilePrefix()));

    pipeline.run().waitUntilFinish();
  }
}
 
Example 19
Source Project: beam   Source File: NumShardsTest.java    License: Apache License 2.0
@Test
public void testText() throws Exception {
  PCollection<String> inputWords = p.apply(Create.of(WORDS).withCoder(StringUtf8Coder.of()));
  PCollection<String> output =
      inputWords
          .apply(new WordCount.CountWords())
          .apply(MapElements.via(new WordCount.FormatAsTextFn()));
  output.apply(
      TextIO.write().to(outputDir.getAbsolutePath()).withNumShards(3).withSuffix(".txt"));
  p.run().waitUntilFinish();

  int count = 0;
  Set<String> expected = Sets.newHashSet("hi: 5", "there: 1", "sue: 2", "bob: 2");
  for (File f :
      tmpDir.getRoot().listFiles(pathname -> pathname.getName().matches("out-.*\\.txt"))) {
    count++;
    for (String line : Files.readLines(f, StandardCharsets.UTF_8)) {
      assertTrue(line + " not found", expected.remove(line));
    }
  }
  assertEquals(3, count);
  assertTrue(expected.toString(), expected.isEmpty());
}
 
Example 20
Source Project: hop   Source File: BeamInputTransform.java    License: Apache License 2.0
@Override public PCollection<HopRow> expand( PBegin input ) {

    try {
      // Only initialize once on this node/vm
      //
      BeamHop.init(transformPluginClasses, xpPluginClasses);

      // System.out.println("-------------- TextIO.Read from "+inputLocation+" (UNCOMPRESSED)");

      TextIO.Read ioRead = TextIO.read()
        .from( inputLocation )
        .withCompression( Compression.UNCOMPRESSED )
        ;

      StringToHopFn stringToHopFn = new StringToHopFn( transformName, rowMetaJson, separator, transformPluginClasses, xpPluginClasses );

      PCollection<HopRow> output = input

        // We read a bunch of Strings, one per line basically
        //
        .apply( transformName + " READ FILE",  ioRead )

        // We need to transform these lines into Hop fields
        //
        .apply( transformName, ParDo.of( stringToHopFn ) );

      return output;

    } catch ( Exception e ) {
      numErrors.inc();
      LOG.error( "Error in beam input transform", e );
      throw new RuntimeException( "Error in beam input transform", e );
    }

  }
 
Example 21
@Override
public PDone expand(PCollection<KV<String, String>> input) {

  PCollection<String> contents =
      input.apply(
          ParDo.of(
              new DoFn<KV<String, String>, String>() {
                @ProcessElement
                public void processElement(ProcessContext c) {
                  filenamePrefix = String.format("%s%s", filenamePrefix, c.element().getKey());
                  LOG.info("File Prefix {}", filenamePrefix);

                  c.output(c.element().getValue());
                }
              }));

  ResourceId resource = FileBasedSink.convertToFileResourceIfPossible(filenamePrefix);
  TextIO.Write write =
      TextIO.write()
          .to(new PerWindowFiles(resource))
          .withTempDirectory(resource.getCurrentDirectory())
          .withWindowedWrites();

  if (numShards != null) {
    write = write.withNumShards(numShards);
  }

  return contents.apply(write);
}
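The PerWindowFiles policy used here (and in several of the later windowed-write examples) is not included in this listing. A simplified sketch of such a FileBasedSink.FilenamePolicy, with an illustrative naming scheme rather than any project's actual one, might look like:

static class PerWindowFiles extends FileBasedSink.FilenamePolicy {
  private final ResourceId baseFilename;

  PerWindowFiles(ResourceId baseFilename) {
    this.baseFilename = baseFilename;
  }

  @Override
  public ResourceId windowedFilename(int shardNumber, int numShards, BoundedWindow window,
      PaneInfo paneInfo, FileBasedSink.OutputFileHints outputFileHints) {
    IntervalWindow intervalWindow = (IntervalWindow) window;
    // Encode the window bounds and the shard index into the file name.
    String filename = String.format("%s-%s-%s-%s-of-%s%s",
        baseFilename.getFilename(),
        intervalWindow.start().getMillis(),
        intervalWindow.end().getMillis(),
        shardNumber, numShards,
        outputFileHints.getSuggestedFilenameSuffix());
    return baseFilename.getCurrentDirectory()
        .resolve(filename, ResolveOptions.StandardResolveOptions.RESOLVE_FILE);
  }

  @Override
  public ResourceId unwindowedFilename(int shardNumber, int numShards,
      FileBasedSink.OutputFileHints outputFileHints) {
    throw new UnsupportedOperationException("This sketch only supports windowed writes.");
  }
}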
 
Example 22
@Override
public PDone expand(PCollection<String> input) {

  ResourceId resource = FileBasedSink.convertToFileResourceIfPossible(filenamePrefix);
  TextIO.Write write =
      TextIO.write()
          .to(new PerWindowFiles(resource))
          .withTempDirectory(resource.getCurrentDirectory())
          .withWindowedWrites();

  if (numShards != null) {
    write = write.withNumShards(numShards);
  }

  return input.apply(write);
}
 
Example 23
Source Project: streamingbook   Source File: Example2_1.java    License: Apache License 2.0
public static void main(String[] args) {
  Options options = PipelineOptionsFactory
      .fromArgs(args)
      .withValidation()
      .as(Options.class);
  Pipeline pipeline = Pipeline.create(options);

  pipeline
      .apply("Read", TextIO.read().from(options.getInputFile()))
      .apply("Parse", ParDo.of(new ParseFn()))
      .apply("Example 2-1", new BeamModel.Example2_1())
      .apply("Write", TextIO.write().to(options.getOutput()));

  pipeline.run().waitUntilFinish();
}
 
Example 24
Source Project: java-docs-samples   Source File: CsvToAvroTest.java    License: Apache License 2.0
@Test
public void testCsvToAvro() throws Exception {
  SampleOptions options = TestPipeline.testingPipelineOptions().as(SampleOptions.class);
  options.setAvroSchema("gs://cloud-samples-data/storage/transformations/user.avsc");
  options.setInputFile("gs://cloud-samples-data/storage/transformations/input.csv");

  String schemaJson = getSchema(options.getAvroSchema());
  Schema schema = new Schema.Parser().parse(schemaJson);

  final List<GenericRecord> expectedResult = new ArrayList<>();
  GenericRecord genericRecordOne = new GenericData.Record(schema);
  genericRecordOne.put("first_name", "frank");
  genericRecordOne.put("last_name", "natividad");
  genericRecordOne.put("age", 1);
  expectedResult.add(genericRecordOne);
  GenericRecord genericRecordTwo = new GenericData.Record(schema);
  genericRecordTwo.put("first_name", "Karthi");
  genericRecordTwo.put("last_name", "thyagarajan");
  genericRecordTwo.put("age", 3);
  expectedResult.add(genericRecordTwo);

  final PCollection<GenericRecord> avroDataCollection = pipeline.apply("Read CSV files",
      TextIO.read().from(options.getInputFile()))
      .apply("Convert CSV to Avro formatted data", ParDo.of(
          new CsvToAvro.ConvertCsvToAvro(schemaJson, options.getCsvDelimiter())))
      .setCoder(AvroCoder.of(GenericRecord.class, schema));

  PAssert.that(avroDataCollection).containsInAnyOrder(expectedResult);

  pipeline.run().waitUntilFinish();
}
 
Example 25
Source Project: incubator-nemo   Source File: WriteOneFilePerWindow.java    License: Apache License 2.0
@Override
public PDone expand(final PCollection<String> input) {
  final ResourceId resource = FileBasedSink.convertToFileResourceIfPossible(filenamePrefix);
  TextIO.Write write =
    TextIO.write()
      .to(new PerWindowFiles(resource))
      .withTempDirectory(resource.getCurrentDirectory())
      .withWindowedWrites();
  if (numShards != null) {
    write = write.withNumShards(numShards);
  }
  return input.apply(write);
}
 
Example 26
Source Project: incubator-nemo   Source File: GenericSourceSink.java    License: Apache License 2.0
/**
 * Write data.
 * NEMO-365: This method could later be replaced using the HadoopFormatIO class.
 *
 * @param dataToWrite data to write
 * @param path        path to write data
 * @return returns {@link PDone}
 */
public static PDone write(final PCollection<String> dataToWrite,
                          final String path) {
  if (isHDFSPath(path)) {
    dataToWrite.apply(ParDo.of(new HDFSWrite(path)));
    return PDone.in(dataToWrite.getPipeline());
  } else {
    return dataToWrite.apply(TextIO.write().to(path));
  }
}
 
Example 27
Source Project: beam   Source File: WordCount.java    License: Apache License 2.0
public static void main(String[] args) {
  WordCountOptions options =
      PipelineOptionsFactory.fromArgs(args).withValidation().as(WordCountOptions.class);
  Pipeline p = Pipeline.create(options);

  // Concepts #2 and #3: Our pipeline applies the composite CountWords transform, and passes the
  // static FormatAsTextFn() to the ParDo transform.
  p.apply("ReadLines", TextIO.read().from(options.getInputFile()))
      .apply(new CountWords())
      .apply(MapElements.via(new FormatAsTextFn()))
      .apply("WriteCounts", TextIO.write().to(options.getOutput()));

  p.run().waitUntilFinish();
}
 
Example 28
Source Project: gcp-ingestion   Source File: Write.java    License: Mozilla Public License 2.0
@Override
public WithFailures.Result<PDone, PubsubMessage> expand(PCollection<PubsubMessage> input) {
  ValueProvider<DynamicPathTemplate> pathTemplate = NestedValueProvider.of(outputPrefix,
      DynamicPathTemplate::new);
  ValueProvider<String> staticPrefix = NestedValueProvider.of(pathTemplate,
      value -> value.staticPrefix);

  FileIO.Write<List<String>, PubsubMessage> write = FileIO
      .<List<String>, PubsubMessage>writeDynamic()
      // We can't pass the attribute map to by() directly since MapCoder isn't
      // deterministic;
      // instead, we extract an ordered list of the needed placeholder values.
      // That list is later available to withNaming() to determine output location.
      .by(message -> pathTemplate.get()
          .extractValuesFrom(DerivedAttributesMap.of(message.getAttributeMap())))
      .withDestinationCoder(ListCoder.of(StringUtf8Coder.of())) //
      .withCompression(compression) //
      .via(Contextful.fn(format::encodeSingleMessage), TextIO.sink()) //
      .to(staticPrefix) //
      .withNaming(placeholderValues -> NoColonFileNaming.defaultNaming(
          pathTemplate.get().replaceDynamicPart(placeholderValues), format.suffix()));

  if (inputType == InputType.pubsub) {
    // Passing a ValueProvider to withNumShards disables runner-determined sharding, so we
    // need to be careful to pass this only for streaming input (where runner-determined
    // sharding is not an option).
    write = write.withNumShards(numShards);
  }

  input //
      .apply(Window.<PubsubMessage>into(FixedWindows.of(windowDuration))
          // We allow lateness up to the maximum Cloud Pub/Sub retention of 7 days documented in
          // https://cloud.google.com/pubsub/docs/subscriber
          .withAllowedLateness(Duration.standardDays(7)) //
          .discardingFiredPanes())
      .apply(write);
  return WithFailures.Result.of(PDone.in(input.getPipeline()),
      EmptyErrors.in(input.getPipeline()));
}
 
Example 29
Source Project: deployment-examples   Source File: WriteToText.java    License: MIT License
@Override
public PDone expand(PCollection<String> input) {
  // Verify that the input has a compatible window type.
  checkArgument(
      input.getWindowingStrategy().getWindowFn().windowCoder() == IntervalWindow.getCoder());

  ResourceId resource = FileBasedSink.convertToFileResourceIfPossible(filenamePrefix);

  return input.apply(
      TextIO.write()
          .to(new PerWindowFiles(resource))
          .withTempDirectory(resource.getCurrentDirectory())
          .withWindowedWrites()
          .withNumShards(3));
}
 
Example 30
Source Project: deployment-examples   Source File: WriteToText.java    License: MIT License
@Override
public PDone expand(PCollection<InputT> teamAndScore) {
  if (windowed) {
    teamAndScore
        .apply("ConvertToRow", ParDo.of(new BuildRowFn()))
        .apply(new WriteToText.WriteOneFilePerWindow(filenamePrefix));
  } else {
    teamAndScore
        .apply("ConvertToRow", ParDo.of(new BuildRowFn()))
        .apply(TextIO.write().to(filenamePrefix));
  }
  return PDone.in(teamAndScore.getPipeline());
}