Java Code Examples for org.apache.beam.sdk.io.FileIO

The following examples show how to use org.apache.beam.sdk.io.FileIO. They are extracted from open source projects; where known, the originating project and source file are noted above each example.
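Two patterns recur throughout these examples: matching files for reading (FileIO.match() or FileIO.matchAll(), followed by FileIO.readMatches()) and writing records through a sink (FileIO.write().via(...)). The snippet below is a minimal, self-contained sketch of both patterns; the input/output paths and the FileIOQuickstart class name are placeholders for illustration, not taken from any of the projects listed below.

import java.io.IOException;

import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.io.Compression;
import org.apache.beam.sdk.io.FileIO;
import org.apache.beam.sdk.io.TextIO;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.transforms.MapElements;
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.values.TypeDescriptors;

public class FileIOQuickstart {
  public static void main(String[] args) {
    Pipeline p = Pipeline.create(PipelineOptionsFactory.fromArgs(args).create());

    // Read pattern: match a filepattern, expand the matches into ReadableFiles, then read each file.
    PCollection<String> contents =
        p.apply("Match", FileIO.match().filepattern("/tmp/input/*.txt")) // placeholder path
            .apply("ReadMatches", FileIO.readMatches().withCompression(Compression.UNCOMPRESSED))
            .apply(
                "FileToString",
                MapElements.into(TypeDescriptors.strings())
                    .via(
                        (FileIO.ReadableFile f) -> {
                          try {
                            return f.readFullyAsUTF8String();
                          } catch (IOException e) {
                            throw new RuntimeException(e);
                          }
                        }));

    // Write pattern: hand each element to a FileIO.Sink (here TextIO.sink()) and choose the output location.
    contents.apply(
        "Write",
        FileIO.<String>write()
            .via(TextIO.sink())
            .to("/tmp/output/") // placeholder directory
            .withPrefix("part")
            .withSuffix(".txt"));

    p.run().waitUntilFinish();
  }
}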
Example 1
Source Project: DataflowTemplates   Source File: WriteToGCSParquet.java    License: Apache License 2.0
@Override
public WriteFilesResult<Void> expand(PCollection<KV<String, String>> kafkaRecords) {
  return kafkaRecords
      /*
       * Converting KV<String, String> records to GenericRecord using DoFn and {@link
       * KeyValueToGenericRecordFn} class.
       */
      .apply("Create GenericRecord(s)", ParDo.of(new KeyValueToGenericRecordFn()))
      .setCoder(AvroCoder.of(GenericRecord.class, KeyValueToGenericRecordFn.SCHEMA))
      /*
       * Writing as parquet file using {@link FileIO} and {@link ParquetIO}.
       *
       * The {@link WindowedFilenamePolicy} class specifies the file path for writing the file.
       * The {@link withNumShards} option specifies the number of shards passed by the user.
       */
      .apply(
          "Writing as Parquet",
          FileIO.<GenericRecord>write()
              .via(ParquetIO.sink(KeyValueToGenericRecordFn.SCHEMA))
              .to(outputDirectory())
              .withPrefix(outputFilenamePrefix())
              .withSuffix(
                  WriteToGCSUtility.FILE_SUFFIX_MAP.get(WriteToGCSUtility.FileFormat.PARQUET))
              .withNumShards(numShards()));
}
 
Example 2
Source Project: DataflowTemplates   Source File: CsvConverters.java    License: Apache License 2.0
@Override
public PCollectionTuple expand(PBegin input) {

  if (hasHeaders()) {
    return input
        .apply("MatchFilePattern", FileIO.match().filepattern(inputFileSpec()))
        .apply("ReadMatches", FileIO.readMatches())
        .apply(
            "ReadCsvWithHeaders",
            ParDo.of(new GetCsvHeadersFn(headerTag(), lineTag(), csvFormat(), delimiter()))
                .withOutputTags(headerTag(), TupleTagList.of(lineTag())));
  }

  return PCollectionTuple.of(
      lineTag(), input.apply("ReadCsvWithoutHeaders", TextIO.read().from(inputFileSpec())));
}
 
Example 3
@Test
public void testAllFilesAreConsumed() throws IOException {
  TestStream<String> inputFiles = TestStream.create(StringUtf8Coder.of())
      .addElements(
          createJsonFile("dlqFile1.json", JSON_FILE_CONTENTS_1),
          createJsonFile("dlqFile2.json", JSON_FILE_CONTENTS_1))
      .addElements(createJsonFile("dlqFile3.json", JSON_FILE_CONTENTS_1))
      .advanceWatermarkToInfinity();

  PCollection<String> jsonData = p.apply(inputFiles)
      .apply(FileIO.matchAll())
      .apply(FileBasedDeadLetterQueueReconsumer.moveAndConsumeMatches());

  PAssert.that(jsonData)
      .containsInAnyOrder(
          Stream.of(JSON_FILE_CONTENTS_1)
              .flatMap(line -> Stream.of(line, line, line))
              .collect(Collectors.toList()));

  p.run().waitUntilFinish();
}
 
Example 4
private PCollection<FileShard> runFileShardingPipeline(Metadata fileMetadata, int splitSize) {

  PCollectionView<Map<String, String>> filenamesToTableNamesMapView =
      p.apply(
              "Create File/Table names Map",
              Create.of(
                  ImmutableMap.<String, String>of(
                      fileMetadata.resourceId().toString(), "testtable")))
          .apply(View.asMap());

  return p.apply("Create Metadata", Create.of(fileMetadata))
      .apply(FileIO.readMatches())
      // PCollection<FileIO.ReadableFile>
      .apply(
          "Split into ranges",
          ParDo.of(new SplitIntoRangesFn(splitSize, filenamesToTableNamesMapView))
              .withSideInputs(filenamesToTableNamesMapView))
      .setCoder(FileShard.Coder.of());
}
 
Example 5
Source Project: hazelcast-jet-demos   Source File: MyBeamJob.java    License: Apache License 2.0
public static Pipeline build(PipelineOptions pipelineOptions) {

  Pipeline pipeline = Pipeline.create(pipelineOptions);

  pipeline
      .apply("unbounded-source",
          Read.from(new MyUnboundedSource("beam-input")))
      .apply("reformat-and-timestamp",
          ParDo.of(new MyEnrichAndReformatFn()))
      .apply("window",
          Window.<String>into(FixedWindows.of(ONE_SECOND))
              .triggering(Repeatedly.forever(AfterProcessingTime.pastFirstElementInPane()))
              .discardingFiredPanes()
              .withAllowedLateness(ONE_SECOND))
      .apply("sink",
          FileIO.<String>write()
              .via(TextIO.sink())
              .to(".")
              .withPrefix("beam-output")
              .withNumShards(1));

  return pipeline;
}
 
Example 6
Source Project: beam   Source File: TestExpansionService.java    License: Apache License 2.0
@Override
public PTransform<PCollection<GenericRecord>, PCollection<String>> buildExternal(
    StringConfiguration configuration) {
  return new PTransform<PCollection<GenericRecord>, PCollection<String>>() {
    @Override
    public PCollection<String> expand(PCollection<GenericRecord> input) {
      return input
          .apply(
              FileIO.<GenericRecord>write()
                  .via(ParquetIO.sink(schema))
                  .to(configuration.data))
          .getPerDestinationOutputFilenames()
          .apply(Values.create());
    }
  };
}
 
Example 7
Source Project: beam   Source File: XmlIO.java    License: Apache License 2.0
@Override
public PDone expand(PCollection<T> input) {
  checkArgument(getRecordClass() != null, "withRecordClass() is required");
  checkArgument(getRootElement() != null, "withRootElement() is required");
  checkArgument(getFilenamePrefix() != null, "to() is required");
  checkArgument(getCharset() != null, "withCharset() is required");
  try {
    JAXBContext.newInstance(getRecordClass());
  } catch (JAXBException e) {
    throw new RuntimeException("Error binding classes to a JAXB Context.", e);
  }

  ResourceId prefix =
      FileSystems.matchNewResource(getFilenamePrefix(), false /* isDirectory */);
  input.apply(
      FileIO.<T>write()
          .via(
              sink(getRecordClass())
                  .withCharset(Charset.forName(getCharset()))
                  .withRootElement(getRootElement()))
          .to(prefix.getCurrentDirectory().toString())
          .withPrefix(prefix.getFilename())
          .withSuffix(".xml")
          .withIgnoreWindowing());
  return PDone.in(input.getPipeline());
}
 
Example 8
Source Project: beam   Source File: TikaIOTest.java    License: Apache License 2.0
@Test
public void testParseAndParseFiles() throws IOException {
  Path root =
      Paths.get(getClass().getResource("/valid/apache-beam-tika.odt").getPath()).getParent();

  List<ParseResult> expected =
      Arrays.asList(
          ParseResult.success(
              root.resolve("apache-beam-tika.odt").toString(), ODT_FILE, getOdtMetadata()),
          ParseResult.success(root.resolve("apache-beam-tika-pdf.zip").toString(), PDF_ZIP_FILE));

  PCollection<ParseResult> parse =
      p.apply("Parse", TikaIO.parse().filepattern(root.resolve("*").toString()))
          .apply("FilterParse", ParDo.of(new FilterMetadataFn()));
  PAssert.that(parse).containsInAnyOrder(expected);

  PCollection<ParseResult> parseFiles =
      p.apply("ParseFiles", FileIO.match().filepattern(root.resolve("*").toString()))
          .apply(FileIO.readMatches().withCompression(Compression.UNCOMPRESSED))
          .apply(TikaIO.parseFiles())
          .apply("FilterParseFiles", ParDo.of(new FilterMetadataFn()));
  PAssert.that(parseFiles).containsInAnyOrder(expected);
  p.run();
}
 
Example 9
Source Project: beam   Source File: ThriftIO.java    License: Apache License 2.0
@ProcessElement
public void processElement(@Element FileIO.ReadableFile file, OutputReceiver<T> out) {
  try {
    InputStream inputStream = Channels.newInputStream(file.open());
    TIOStreamTransport streamTransport =
        new TIOStreamTransport(new BufferedInputStream(inputStream));
    AutoExpandingBufferReadTransport readTransport =
        new AutoExpandingBufferReadTransport(262_144_000);
    readTransport.fill(streamTransport, inputStream.available());
    TProtocol protocol = tProtocol.getProtocol(readTransport);
    while (protocol.getTransport().getBytesRemainingInBuffer() > 0) {
      TBase<?, ?> tb = (TBase<?, ?>) tBaseType.getDeclaredConstructor().newInstance();
      tb.read(protocol);
      out.output((T) tb);
    }
  } catch (Exception ioe) {
    String filename = file.getMetadata().resourceId().toString();
    LOG.error(String.format("Error in reading file: %1$s%n%2$s", filename, ioe));
    throw new RuntimeException(ioe);
  }
}
 
Example 10
Source Project: beam   Source File: ThriftIOTest.java    License: Apache License 2.0
/** Tests {@link ThriftIO#readFiles(Class)} with {@link TBinaryProtocol}. */
@Test
public void testReadFilesBinaryProtocol() {

  PCollection<TestThriftStruct> testThriftDoc =
      mainPipeline
          .apply(Create.of(THRIFT_DIR + "data").withCoder(StringUtf8Coder.of()))
          .apply(FileIO.matchAll())
          .apply(FileIO.readMatches())
          .apply(ThriftIO.readFiles(TestThriftStruct.class).withProtocol(tBinaryProtoFactory));

  // Assert
  PAssert.that(testThriftDoc).containsInAnyOrder(TEST_THRIFT_STRUCT);

  // Execute pipeline
  mainPipeline.run();
}
 
Example 11
Source Project: beam   Source File: ParquetIO.java    License: Apache License 2.0
@ProcessElement
public void processElement(ProcessContext processContext) throws Exception {
  FileIO.ReadableFile file = processContext.element();

  if (!file.getMetadata().isReadSeekEfficient()) {
    ResourceId filename = file.getMetadata().resourceId();
    throw new RuntimeException(String.format("File has to be seekable: %s", filename));
  }

  SeekableByteChannel seekableByteChannel = file.openSeekable();

  AvroParquetReader.Builder builder =
      AvroParquetReader.<GenericRecord>builder(new BeamParquetInputFile(seekableByteChannel));
  if (modelClass != null) {
    // all GenericData implementations have a static get method
    builder = builder.withDataModel((GenericData) modelClass.getMethod("get").invoke(null));
  }

  try (ParquetReader<GenericRecord> reader = builder.build()) {
    GenericRecord read;
    while ((read = reader.read()) != null) {
      processContext.output(read);
    }
  }
}
 
Example 12
Source Project: beam   Source File: ParquetIOTest.java    License: Apache License 2.0
@Test
public void testWriteAndRead() {
  List<GenericRecord> records = generateGenericRecords(1000);

  mainPipeline
      .apply(Create.of(records).withCoder(AvroCoder.of(SCHEMA)))
      .apply(
          FileIO.<GenericRecord>write()
              .via(ParquetIO.sink(SCHEMA))
              .to(temporaryFolder.getRoot().getAbsolutePath()));
  mainPipeline.run().waitUntilFinish();

  PCollection<GenericRecord> readBack =
      readPipeline.apply(
          ParquetIO.read(SCHEMA).from(temporaryFolder.getRoot().getAbsolutePath() + "/*"));

  PAssert.that(readBack).containsInAnyOrder(records);
  readPipeline.run().waitUntilFinish();
}
 
Example 13
Source Project: beam   Source File: ParquetIOTest.java    License: Apache License 2.0
@Test
public void testWriteAndReadFiles() {
  List<GenericRecord> records = generateGenericRecords(1000);

  PCollection<GenericRecord> writeThenRead =
      mainPipeline
          .apply(Create.of(records).withCoder(AvroCoder.of(SCHEMA)))
          .apply(
              FileIO.<GenericRecord>write()
                  .via(ParquetIO.sink(SCHEMA))
                  .to(temporaryFolder.getRoot().getAbsolutePath()))
          .getPerDestinationOutputFilenames()
          .apply(Values.create())
          .apply(FileIO.matchAll())
          .apply(FileIO.readMatches())
          .apply(ParquetIO.readFiles(SCHEMA));

  PAssert.that(writeThenRead).containsInAnyOrder(records);

  mainPipeline.run().waitUntilFinish();
}
 
Example 14
Source Project: beam   Source File: ParquetIOTest.java    License: Apache License 2.0
@Test(expected = org.apache.beam.sdk.Pipeline.PipelineExecutionException.class)
public void testWriteAndReadUsingReflectDataSchemaWithoutDataModelThrowsException() {
  Schema testRecordSchema = ReflectData.get().getSchema(TestRecord.class);

  List<GenericRecord> records = generateGenericRecords(1000);
  mainPipeline
      .apply(Create.of(records).withCoder(AvroCoder.of(testRecordSchema)))
      .apply(
          FileIO.<GenericRecord>write()
              .via(ParquetIO.sink(testRecordSchema))
              .to(temporaryFolder.getRoot().getAbsolutePath()));
  mainPipeline.run().waitUntilFinish();

  PCollection<GenericRecord> readBack =
      readPipeline.apply(
          ParquetIO.read(testRecordSchema)
              .from(temporaryFolder.getRoot().getAbsolutePath() + "/*"));

  PAssert.that(readBack).containsInAnyOrder(records);
  readPipeline.run().waitUntilFinish();
}
 
Example 15
Source Project: beam   Source File: ParquetIOTest.java    License: Apache License 2.0
@Test
public void testWriteAndReadUsingReflectDataSchemaWithDataModel() {
  Schema testRecordSchema = ReflectData.get().getSchema(TestRecord.class);

  List<GenericRecord> records = generateGenericRecords(1000);
  mainPipeline
      .apply(Create.of(records).withCoder(AvroCoder.of(testRecordSchema)))
      .apply(
          FileIO.<GenericRecord>write()
              .via(ParquetIO.sink(testRecordSchema))
              .to(temporaryFolder.getRoot().getAbsolutePath()));
  mainPipeline.run().waitUntilFinish();

  PCollection<GenericRecord> readBack =
      readPipeline.apply(
          ParquetIO.read(testRecordSchema)
              .withAvroDataModel(GenericData.get())
              .from(temporaryFolder.getRoot().getAbsolutePath() + "/*"));

  PAssert.that(readBack).containsInAnyOrder(records);
  readPipeline.run().waitUntilFinish();
}
 
Example 16
public static void main(String[] args) throws IOException, GeneralSecurityException {

  TokenizePipelineOptions options =
      PipelineOptionsFactory.fromArgs(args).withValidation().as(TokenizePipelineOptions.class);

  Pipeline p = Pipeline.create(options);
  p.apply(
          FileIO.match()
              .filepattern(options.getInputFile())
              .continuously(
                  Duration.standardSeconds(options.getPollingInterval()), Watch.Growth.never()))
      .apply(FileIO.readMatches().withCompression(Compression.UNCOMPRESSED))
      .apply(
          "Text File Reader",
          ParDo.of(
              new TextFileReader(
                  options.as(GcpOptions.class).getProject(),
                  options.getFileDecryptKeyName(),
                  options.getFileDecryptKey(),
                  options.getBatchSize(),
                  options.getCsek(),
                  options.getCsekhash())))
      .apply(
          "Tokenize Data",
          ParDo.of(
              new TokenizeData(
                  options.as(GcpOptions.class).getProject(),
                  options.getDeidentifyTemplateName(),
                  options.getInspectTemplateName())))
      .apply(
          Window.<String>into(FixedWindows.of(Duration.standardMinutes(options.getInterval()))))
      .apply(new WriteOneFilePerWindow(options.getOutputFile(), 1));

  p.run();
}
 
Example 17
Source Project: dlp-dataflow-deidentification   Source File: UtilTest.java    License: Apache License 2.0
@Test
public void testGetReader() throws IOException {
  Path firstPath = tmpFolder.newFile("first").toPath();
  int firstSize = 37;
  Files.write(firstPath, new byte[firstSize]);

  ValueProvider<String> testValueProvider = null;
  PCollection<String> br =
      p.apply(FileIO.match().filepattern(tmpFolder.getRoot().getAbsolutePath() + "/*"))
          .apply(FileIO.readMatches().withCompression(Compression.UNCOMPRESSED))
          .apply(
              ParDo.of(
                  new DoFn<FileIO.ReadableFile, String>() {
                    @ProcessElement
                    public void processElement(
                        @Element FileIO.ReadableFile f, OutputReceiver<String> out)
                        throws IOException {
                      out.output(
                          Util.getReader(
                                  false,
                                  "object_name",
                                  "bucket_name",
                                  f,
                                  "key_name",
                                  testValueProvider)
                              .readLine());
                    }
                  }));
  p.run();
  assertNotNull(br);
}
 
Example 18
Source Project: gcp-ingestion   Source File: Write.java    License: Mozilla Public License 2.0
@Override
public WithFailures.Result<PDone, PubsubMessage> expand(PCollection<PubsubMessage> input) {
  ValueProvider<DynamicPathTemplate> pathTemplate = NestedValueProvider.of(outputPrefix,
      DynamicPathTemplate::new);
  ValueProvider<String> staticPrefix = NestedValueProvider.of(pathTemplate,
      value -> value.staticPrefix);

  FileIO.Write<List<String>, PubsubMessage> write = FileIO
      .<List<String>, PubsubMessage>writeDynamic()
      // We can't pass the attribute map to by() directly since MapCoder isn't deterministic;
      // instead, we extract an ordered list of the needed placeholder values.
      // That list is later available to withNaming() to determine output location.
      .by(message -> pathTemplate.get()
          .extractValuesFrom(DerivedAttributesMap.of(message.getAttributeMap())))
      .withDestinationCoder(ListCoder.of(StringUtf8Coder.of())) //
      .withCompression(compression) //
      .via(Contextful.fn(format::encodeSingleMessage), TextIO.sink()) //
      .to(staticPrefix) //
      .withNaming(placeholderValues -> NoColonFileNaming.defaultNaming(
          pathTemplate.get().replaceDynamicPart(placeholderValues), format.suffix()));

  if (inputType == InputType.pubsub) {
    // Passing a ValueProvider to withNumShards disables runner-determined sharding, so we
    // need to be careful to pass this only for streaming input (where runner-determined
    // sharding is not an option).
    write = write.withNumShards(numShards);
  }

  input //
      .apply(Window.<PubsubMessage>into(FixedWindows.of(windowDuration))
          // We allow lateness up to the maximum Cloud Pub/Sub retention of 7 days documented in
          // https://cloud.google.com/pubsub/docs/subscriber
          .withAllowedLateness(Duration.standardDays(7)) //
          .discardingFiredPanes())
      .apply(write);
  return WithFailures.Result.of(PDone.in(input.getPipeline()),
      EmptyErrors.in(input.getPipeline()));
}
 
Example 19
Source Project: DataflowTemplates   Source File: ParquetConverters.java    License: Apache License 2.0
@Override
public POutput expand(PCollection<GenericRecord> input) {
  return input.apply(
      "WriteParquetFile(s)",
      FileIO.<GenericRecord>write()
          .via(ParquetIO.sink(SchemaUtils.getAvroSchema(schema())))
          .to(outputFile())
          .withNumShards(numShards())
          .withPrefix(outputFilePrefix())
          .withSuffix(PARQUET_SUFFIX));
}
 
Example 20
public PCollection<String> expand(PBegin in) {
  // We want to match all the files in this directory (but not the directories).
  String filePattern = Paths.get(dlqDirectory).resolve("*").toString();
  return in.getPipeline()
      .apply(FileIO.match()
          .filepattern(filePattern)
          .continuously(recheckPeriod, Growth.never()))
      .apply(moveAndConsumeMatches());

}
 
Example 21
@Test
public void testFilesAreConsumed() throws IOException {
  String fileName = createJsonFile("dlqFile1.json", JSON_FILE_CONTENTS_1);
  folder.newFolder("tmp");

  String folderPath = Paths.get(folder.getRoot().getAbsolutePath()).resolve("*").toString();
  PCollection<String> jsonData = p
      .apply(FileIO.match()
          .filepattern(folderPath))
      .apply(FileBasedDeadLetterQueueReconsumer.moveAndConsumeMatches());
  PAssert.that(jsonData).containsInAnyOrder(JSON_FILE_CONTENTS_1);
  p.run().waitUntilFinish();

  assertFalse(new File(fileName).exists());
}
 
Example 22
Source Project: DataflowTemplates   Source File: BigtableToParquet.java    License: Apache License 2.0
/**
 * Runs a pipeline to export data from a Cloud Bigtable table to Parquet file(s) in GCS.
 *
 * @param options arguments to the pipeline
 */
public static PipelineResult run(Options options) {
  Pipeline pipeline = Pipeline.create(options);
  BigtableIO.Read read =
      BigtableIO.read()
          .withProjectId(options.getBigtableProjectId())
          .withInstanceId(options.getBigtableInstanceId())
          .withTableId(options.getBigtableTableId());

  // Do not validate input fields if it is running as a template.
  if (options.as(DataflowPipelineOptions.class).getTemplateLocation() != null) {
    read = read.withoutValidation();
  }

  /**
   * Steps:
   * 1) Read records from Bigtable.
   * 2) Convert a Bigtable Row to a GenericRecord.
   * 3) Write GenericRecord(s) to GCS in parquet format.
   */
  pipeline
      .apply("Read from Bigtable", read)
      .apply("Transform to Parquet", MapElements.via(new BigtableToParquetFn()))
      .setCoder(AvroCoder.of(GenericRecord.class, BigtableRow.getClassSchema()))
      .apply(
          "Write to Parquet in GCS",
          FileIO.<GenericRecord>write()
              .via(ParquetIO.sink(BigtableRow.getClassSchema()))
              .to(options.getOutputDirectory())
              .withPrefix(options.getFilenamePrefix())
              .withSuffix(".parquet")
              .withNumShards(options.getNumShards()));

  return pipeline.run();
}
 
Example 23
Source Project: DataflowTemplates   Source File: BulkCompressor.java    License: Apache License 2.0
/**
 * Runs the pipeline to completion with the specified options. This method does not wait until the
 * pipeline is finished before returning. Invoke {@code result.waitUntilFinish()} on the result
 * object to block until the pipeline is finished running if blocking programmatic execution is
 * required.
 *
 * @param options The execution options.
 * @return The pipeline result.
 */
public static PipelineResult run(Options options) {

  // Create the pipeline
  Pipeline pipeline = Pipeline.create(options);

  /*
   * Steps:
   *   1) Find all files matching the input pattern
   *   2) Compress the files found and output them to the output directory
   *   3) Write any errors to the failure output file
   */
  PCollectionTuple compressOut =
      pipeline
          .apply("Match File(s)", FileIO.match().filepattern(options.getInputFilePattern()))
          .apply(
              "Compress File(s)",
              ParDo.of(new Compressor(options.getOutputDirectory(), options.getCompression()))
                  .withOutputTags(COMPRESSOR_MAIN_OUT, TupleTagList.of(DEADLETTER_TAG)));

  compressOut
      .get(DEADLETTER_TAG)
      .apply(
          "Format Errors",
          MapElements.into(TypeDescriptors.strings())
              .via(kv -> String.format("%s,%s", kv.getKey(), kv.getValue())))
      .apply(
          "Write Error File",
          TextIO.write()
              .to(options.getOutputFailureFile())
              .withHeader("Filename,Error")
              .withoutSharding());

  return pipeline.run();
}
 
Example 24
Source Project: DataflowTemplates   Source File: TextImportTransform.java    License: Apache License 2.0
@Override
public PCollection<ImportManifest> expand(PBegin input) {
  return input
      .apply("Read manifest", FileIO.match().filepattern(importManifest))
      .apply(
          "Resource id",
          MapElements.into(TypeDescriptor.of(ResourceId.class))
              .via((MatchResult.Metadata::resourceId)))
      .apply(
          "Read manifest json",
          MapElements.into(TypeDescriptor.of(ImportManifest.class))
              .via(ReadImportManifest::readManifest));
}
 
Example 25
@Override
public PCollection<Mutation> expand(PCollection<KV<String, String>> filesToTables) {

  // Map<filename,tablename>
  PCollectionView<Map<String, String>> filenamesToTableNamesMapView =
      filesToTables.apply("asView", View.asMap());

  return filesToTables
      .apply("Get Filenames", Keys.create())
      // PCollection<String>
      .apply(FileIO.matchAll().withEmptyMatchTreatment(EmptyMatchTreatment.DISALLOW))
      // PCollection<MatchResult.Metadata>
      .apply(FileIO.readMatches())
      // PCollection<FileIO.ReadableFile>
      .apply(
          "Split into ranges",
          ParDo.of(
                  new SplitIntoRangesFn(
                      SplitIntoRangesFn.DEFAULT_BUNDLE_SIZE, filenamesToTableNamesMapView))
              .withSideInputs(filenamesToTableNamesMapView))
      .setCoder(FileShard.Coder.of())
      // PCollection<FileShard>
      .apply("Reshuffle", Reshuffle.viaRandomKey())
      // PCollection<FileShard>

      .apply("Read ranges", ParDo.of(new ReadFileRangesFn(ddlView)).withSideInputs(ddlView));
}
 
Example 26
Source Project: DataflowTemplates   Source File: ImportTransform.java    License: Apache License 2.0
@Override
public PCollection<Export> expand(PBegin input) {
  NestedValueProvider<String, String> manifestFile =
      NestedValueProvider.of(importDirectory, s -> GcsUtil.joinPath(s, "spanner-export.json"));
  return input
      .apply("Read manifest", FileIO.match().filepattern(manifestFile))
      .apply(
          "Resource id",
          MapElements.into(TypeDescriptor.of(ResourceId.class))
              .via((MatchResult.Metadata::resourceId)))
      .apply(
          "Read manifest json",
          MapElements.into(TypeDescriptor.of(Export.class))
              .via(ReadExportManifestFile::readManifest));
}
 
Example 27
Source Project: DataflowTemplates   Source File: TextSourceTest.java    License: Apache License 2.0
@Override
public PCollection<String> expand(PCollection<String> files) {
  return files
      // PCollection<String>
      .apply(FileIO.matchAll().withEmptyMatchTreatment(EmptyMatchTreatment.DISALLOW))
      // PCollection<MatchResult.Metadata>
      .apply(FileIO.readMatches())
      // PCollection<FileIO.ReadableFile>
      .apply("Read lines", ParDo.of(new FileReadDoFn()));
  // PCollection<String>: line
}
 
Example 28
Source Project: beam   Source File: TestExpansionService.java    License: Apache License 2.0
@Override
public PTransform<PBegin, PCollection<GenericRecord>> buildExternal(
    StringConfiguration configuration) {
  return new PTransform<PBegin, PCollection<GenericRecord>>() {
    @Override
    public PCollection<GenericRecord> expand(PBegin input) {
      return input
          .apply(FileIO.match().filepattern(configuration.data))
          .apply(FileIO.readMatches())
          .apply(ParquetIO.readFiles(schema))
          .setCoder(AvroCoder.of(schema));
    }
  };
}
 
Example 29
Source Project: beam   Source File: XmlIOTest.java    License: Apache License 2.0
@Test
public void testWriteThenReadLarger() {
  List<Bird> birds = Lists.newArrayList();
  for (int i = 0; i < 100; ++i) {
    birds.add(new Bird("Testing", "Bird number " + i));
  }
  mainPipeline
      .apply(Create.of(birds))
      .apply(
          FileIO.<Bird>write()
              .via(XmlIO.sink(Bird.class).withRootElement("birds"))
              .to(tmpFolder.getRoot().getAbsolutePath())
              .withPrefix("birds")
              .withSuffix(".xml")
              .withNumShards(1));
  mainPipeline.run();

  PCollection<Bird> readBack =
      readPipeline.apply(
          XmlIO.<Bird>read()
              .from(new File(tmpFolder.getRoot(), "birds").getAbsolutePath() + "*")
              .withRecordClass(Bird.class)
              .withRootElement("birds")
              .withRecordElement("bird")
              .withMinBundleSize(100));

  PAssert.that(readBack).containsInAnyOrder(birds);

  readPipeline.run();
}
 
Example 30
Source Project: beam   Source File: TikaIO.java    License: Apache License 2.0
@Override
public PCollection<ParseResult> expand(PBegin input) {
  return input
      .apply(FileIO.match().filepattern(getFilepattern()))
      .apply(FileIO.readMatches().withCompression(Compression.UNCOMPRESSED))
      .apply(parseFiles());
}