org.apache.beam.sdk.io.FileIO Java Examples

The following examples show how to use org.apache.beam.sdk.io.FileIO. They are drawn from open source projects; the source file and project are noted above each example.
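Before the individual examples, here is a minimal, self-contained sketch of the two FileIO patterns that most of them combine: expanding a filepattern into readable files for a format-specific read, and writing a PCollection through FileIO.write() with a sink. The input pattern "/tmp/input/*.txt", the output directory "/tmp/output", and the class name FileIOSketch are placeholder assumptions for illustration, not taken from any of the projects below.

import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.io.FileIO;
import org.apache.beam.sdk.io.TextIO;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.values.PCollection;

public class FileIOSketch {
  public static void main(String[] args) {
    Pipeline p = Pipeline.create(PipelineOptionsFactory.fromArgs(args).create());

    // Read side: match a filepattern, convert the matches to readable files,
    // then hand the files to a format-specific transform (plain text here).
    PCollection<String> lines =
        p.apply(FileIO.match().filepattern("/tmp/input/*.txt"))  // placeholder pattern
            .apply(FileIO.readMatches())
            .apply(TextIO.readFiles());

    // Write side: FileIO.write() plus a sink controls the directory, naming and sharding.
    lines.apply(
        FileIO.<String>write()
            .via(TextIO.sink())
            .to("/tmp/output")  // placeholder output directory
            .withPrefix("beam-output")
            .withSuffix(".txt")
            .withNumShards(1));

    p.run().waitUntilFinish();
  }
}

The same match/readMatches pair appears below in front of ParquetIO.readFiles, ThriftIO.readFiles, and TikaIO.parseFiles, and the same write().via(sink) shape appears with ParquetIO.sink, XmlIO.sink, and ThriftIO.sink.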
Example #1
Source File: FileBasedDeadLetterQueueReconsumerTest.java    From DataflowTemplates with Apache License 2.0
@Test
public void testAllFilesAreConsumed() throws IOException {
  TestStream<String> inputFiles = TestStream.create(StringUtf8Coder.of())
      .addElements(
          createJsonFile("dlqFile1.json", JSON_FILE_CONTENTS_1),
          createJsonFile("dlqFile2.json", JSON_FILE_CONTENTS_1))
      .addElements(createJsonFile("dlqFile3.json", JSON_FILE_CONTENTS_1))
      .advanceWatermarkToInfinity();

  PCollection<String> jsonData = p.apply(inputFiles)
      .apply(FileIO.matchAll())
      .apply(FileBasedDeadLetterQueueReconsumer.moveAndConsumeMatches());

  PAssert.that(jsonData)
      .containsInAnyOrder(
          Stream.of(JSON_FILE_CONTENTS_1)
              .flatMap(line -> Stream.of(line, line, line))
              .collect(Collectors.toList()));

  p.run().waitUntilFinish();
}
 
Example #2
Source File: TestExpansionService.java    From beam with Apache License 2.0
@Override
public PTransform<PCollection<GenericRecord>, PCollection<String>> buildExternal(
    StringConfiguration configuration) {
  return new PTransform<PCollection<GenericRecord>, PCollection<String>>() {
    @Override
    public PCollection<String> expand(PCollection<GenericRecord> input) {
      return input
          .apply(
              FileIO.<GenericRecord>write()
                  .via(ParquetIO.sink(schema))
                  .to(configuration.data))
          .getPerDestinationOutputFilenames()
          .apply(Values.create());
    }
  };
}
 
Example #3
Source File: MyBeamJob.java    From hazelcast-jet-demos with Apache License 2.0
public static Pipeline build(PipelineOptions pipelineOptions) {

  Pipeline pipeline = Pipeline.create(pipelineOptions);

  pipeline
      .apply("unbounded-source",
          Read.from(new MyUnboundedSource("beam-input")))
      .apply("reformat-and-timestamp",
          ParDo.of(new MyEnrichAndReformatFn()))
      .apply("window",
          Window.<String>into(FixedWindows.of(ONE_SECOND))
              .triggering(Repeatedly.forever(AfterProcessingTime.pastFirstElementInPane()))
              .discardingFiredPanes()
              .withAllowedLateness(ONE_SECOND))
      .apply("sink",
          FileIO.<String>write()
              .via(TextIO.sink())
              .to(".")
              .withPrefix("beam-output")
              .withNumShards(1));

  return pipeline;
}
 
Example #4
Source File: AvroTableFileAsMutationsTest.java    From DataflowTemplates with Apache License 2.0
private PCollection<FileShard> runFileShardingPipeline(Metadata fileMetadata, int splitSize) {

    PCollectionView<Map<String, String>> filenamesToTableNamesMapView =
        p.apply(
                "Create File/Table names Map",
                Create.of(
                    ImmutableMap.<String, String>of(
                        fileMetadata.resourceId().toString(), "testtable")))
            .apply(View.asMap());

    return p.apply("Create Metadata", Create.of(fileMetadata))
        .apply(FileIO.readMatches())
        // PCollection<FileIO.ReadableFile>
        .apply(
            "Split into ranges",
            ParDo.of(new SplitIntoRangesFn(splitSize, filenamesToTableNamesMapView))
                .withSideInputs(filenamesToTableNamesMapView))
        .setCoder(FileShard.Coder.of());
  }
 
Example #5
Source File: XmlIO.java    From beam with Apache License 2.0
@Override
public PDone expand(PCollection<T> input) {
  checkArgument(getRecordClass() != null, "withRecordClass() is required");
  checkArgument(getRootElement() != null, "withRootElement() is required");
  checkArgument(getFilenamePrefix() != null, "to() is required");
  checkArgument(getCharset() != null, "withCharset() is required");
  try {
    JAXBContext.newInstance(getRecordClass());
  } catch (JAXBException e) {
    throw new RuntimeException("Error binding classes to a JAXB Context.", e);
  }

  ResourceId prefix =
      FileSystems.matchNewResource(getFilenamePrefix(), false /* isDirectory */);
  input.apply(
      FileIO.<T>write()
          .via(
              sink(getRecordClass())
                  .withCharset(Charset.forName(getCharset()))
                  .withRootElement(getRootElement()))
          .to(prefix.getCurrentDirectory().toString())
          .withPrefix(prefix.getFilename())
          .withSuffix(".xml")
          .withIgnoreWindowing());
  return PDone.in(input.getPipeline());
}
 
Example #6
Source File: TikaIOTest.java    From beam with Apache License 2.0
@Test
public void testParseAndParseFiles() throws IOException {
  Path root =
      Paths.get(getClass().getResource("/valid/apache-beam-tika.odt").getPath()).getParent();

  List<ParseResult> expected =
      Arrays.asList(
          ParseResult.success(
              root.resolve("apache-beam-tika.odt").toString(), ODT_FILE, getOdtMetadata()),
          ParseResult.success(root.resolve("apache-beam-tika-pdf.zip").toString(), PDF_ZIP_FILE));

  PCollection<ParseResult> parse =
      p.apply("Parse", TikaIO.parse().filepattern(root.resolve("*").toString()))
          .apply("FilterParse", ParDo.of(new FilterMetadataFn()));
  PAssert.that(parse).containsInAnyOrder(expected);

  PCollection<ParseResult> parseFiles =
      p.apply("ParseFiles", FileIO.match().filepattern(root.resolve("*").toString()))
          .apply(FileIO.readMatches().withCompression(Compression.UNCOMPRESSED))
          .apply(TikaIO.parseFiles())
          .apply("FilterParseFiles", ParDo.of(new FilterMetadataFn()));
  PAssert.that(parseFiles).containsInAnyOrder(expected);
  p.run();
}
 
Example #7
Source File: ThriftIO.java    From beam with Apache License 2.0
@ProcessElement
public void processElement(@Element FileIO.ReadableFile file, OutputReceiver<T> out) {
  try {
    InputStream inputStream = Channels.newInputStream(file.open());
    TIOStreamTransport streamTransport =
        new TIOStreamTransport(new BufferedInputStream(inputStream));
    AutoExpandingBufferReadTransport readTransport =
        new AutoExpandingBufferReadTransport(262_144_000);
    readTransport.fill(streamTransport, inputStream.available());
    TProtocol protocol = tProtocol.getProtocol(readTransport);
    while (protocol.getTransport().getBytesRemainingInBuffer() > 0) {
      TBase<?, ?> tb = (TBase<?, ?>) tBaseType.getDeclaredConstructor().newInstance();
      tb.read(protocol);
      out.output((T) tb);
    }
  } catch (Exception ioe) {
    String filename = file.getMetadata().resourceId().toString();
    LOG.error(String.format("Error in reading file: %1$s%n%2$s", filename, ioe));
    throw new RuntimeException(ioe);
  }
}
 
Example #8
Source File: ThriftIOTest.java    From beam with Apache License 2.0
/** Tests {@link ThriftIO#readFiles(Class)} with {@link TBinaryProtocol}. */
@Test
public void testReadFilesBinaryProtocol() {

  PCollection<TestThriftStruct> testThriftDoc =
      mainPipeline
          .apply(Create.of(THRIFT_DIR + "data").withCoder(StringUtf8Coder.of()))
          .apply(FileIO.matchAll())
          .apply(FileIO.readMatches())
          .apply(ThriftIO.readFiles(TestThriftStruct.class).withProtocol(tBinaryProtoFactory));

  // Assert
  PAssert.that(testThriftDoc).containsInAnyOrder(TEST_THRIFT_STRUCT);

  // Execute pipeline
  mainPipeline.run();
}
 
Example #9
Source File: CsvConverters.java    From DataflowTemplates with Apache License 2.0
@Override
public PCollectionTuple expand(PBegin input) {

  if (hasHeaders()) {
    return input
        .apply("MatchFilePattern", FileIO.match().filepattern(inputFileSpec()))
        .apply("ReadMatches", FileIO.readMatches())
        .apply(
            "ReadCsvWithHeaders",
            ParDo.of(new GetCsvHeadersFn(headerTag(), lineTag(), csvFormat(), delimiter()))
                .withOutputTags(headerTag(), TupleTagList.of(lineTag())));
  }

  return PCollectionTuple.of(
      lineTag(), input.apply("ReadCsvWithoutHeaders", TextIO.read().from(inputFileSpec())));
}
 
Example #10
Source File: WriteToGCSParquet.java    From DataflowTemplates with Apache License 2.0
@Override
public WriteFilesResult<Void> expand(PCollection<KV<String, String>> kafkaRecords) {
  return kafkaRecords
      /*
       * Converting KV<String, String> records to GenericRecord using DoFn and {@link
       * KeyValueToGenericRecordFn} class.
       */
      .apply("Create GenericRecord(s)", ParDo.of(new KeyValueToGenericRecordFn()))
      .setCoder(AvroCoder.of(GenericRecord.class, KeyValueToGenericRecordFn.SCHEMA))
      /*
       * Writing as parquet file using {@link FileIO} and {@link ParquetIO}.
       *
       * The {@link WindowedFilenamePolicy} class specifies the file path for writing the file.
       * The {@link withNumShards} option specifies the number of shards passed by the user.
       */
      .apply(
          "Writing as Parquet",
          FileIO.<GenericRecord>write()
              .via(ParquetIO.sink(KeyValueToGenericRecordFn.SCHEMA))
              .to(outputDirectory())
              .withPrefix(outputFilenamePrefix())
              .withSuffix(
                  WriteToGCSUtility.FILE_SUFFIX_MAP.get(WriteToGCSUtility.FileFormat.PARQUET))
              .withNumShards(numShards()));
}
 
Example #11
Source File: ParquetIOTest.java    From beam with Apache License 2.0
@Test
public void testWriteAndReadUsingReflectDataSchemaWithDataModel() {
  Schema testRecordSchema = ReflectData.get().getSchema(TestRecord.class);

  List<GenericRecord> records = generateGenericRecords(1000);
  mainPipeline
      .apply(Create.of(records).withCoder(AvroCoder.of(testRecordSchema)))
      .apply(
          FileIO.<GenericRecord>write()
              .via(ParquetIO.sink(testRecordSchema))
              .to(temporaryFolder.getRoot().getAbsolutePath()));
  mainPipeline.run().waitUntilFinish();

  PCollection<GenericRecord> readBack =
      readPipeline.apply(
          ParquetIO.read(testRecordSchema)
              .withAvroDataModel(GenericData.get())
              .from(temporaryFolder.getRoot().getAbsolutePath() + "/*"));

  PAssert.that(readBack).containsInAnyOrder(records);
  readPipeline.run().waitUntilFinish();
}
 
Example #12
Source File: ParquetIOTest.java    From beam with Apache License 2.0
@Test(expected = org.apache.beam.sdk.Pipeline.PipelineExecutionException.class)
public void testWriteAndReadUsingReflectDataSchemaWithoutDataModelThrowsException() {
  Schema testRecordSchema = ReflectData.get().getSchema(TestRecord.class);

  List<GenericRecord> records = generateGenericRecords(1000);
  mainPipeline
      .apply(Create.of(records).withCoder(AvroCoder.of(testRecordSchema)))
      .apply(
          FileIO.<GenericRecord>write()
              .via(ParquetIO.sink(testRecordSchema))
              .to(temporaryFolder.getRoot().getAbsolutePath()));
  mainPipeline.run().waitUntilFinish();

  PCollection<GenericRecord> readBack =
      readPipeline.apply(
          ParquetIO.read(testRecordSchema)
              .from(temporaryFolder.getRoot().getAbsolutePath() + "/*"));

  PAssert.that(readBack).containsInAnyOrder(records);
  readPipeline.run().waitUntilFinish();
}
 
Example #13
Source File: ParquetIO.java    From beam with Apache License 2.0
@ProcessElement
public void processElement(ProcessContext processContext) throws Exception {
  FileIO.ReadableFile file = processContext.element();

  if (!file.getMetadata().isReadSeekEfficient()) {
    ResourceId filename = file.getMetadata().resourceId();
    throw new RuntimeException(String.format("File has to be seekable: %s", filename));
  }

  SeekableByteChannel seekableByteChannel = file.openSeekable();

  AvroParquetReader.Builder builder =
      AvroParquetReader.<GenericRecord>builder(new BeamParquetInputFile(seekableByteChannel));
  if (modelClass != null) {
    // all GenericData implementations have a static get method
    builder = builder.withDataModel((GenericData) modelClass.getMethod("get").invoke(null));
  }

  try (ParquetReader<GenericRecord> reader = builder.build()) {
    GenericRecord read;
    while ((read = reader.read()) != null) {
      processContext.output(read);
    }
  }
}
 
Example #14
Source File: ParquetIOTest.java    From beam with Apache License 2.0
@Test
public void testWriteAndRead() {
  List<GenericRecord> records = generateGenericRecords(1000);

  mainPipeline
      .apply(Create.of(records).withCoder(AvroCoder.of(SCHEMA)))
      .apply(
          FileIO.<GenericRecord>write()
              .via(ParquetIO.sink(SCHEMA))
              .to(temporaryFolder.getRoot().getAbsolutePath()));
  mainPipeline.run().waitUntilFinish();

  PCollection<GenericRecord> readBack =
      readPipeline.apply(
          ParquetIO.read(SCHEMA).from(temporaryFolder.getRoot().getAbsolutePath() + "/*"));

  PAssert.that(readBack).containsInAnyOrder(records);
  readPipeline.run().waitUntilFinish();
}
 
Example #15
Source File: ParquetIOTest.java    From beam with Apache License 2.0
@Test
public void testWriteAndReadFiles() {
  List<GenericRecord> records = generateGenericRecords(1000);

  PCollection<GenericRecord> writeThenRead =
      mainPipeline
          .apply(Create.of(records).withCoder(AvroCoder.of(SCHEMA)))
          .apply(
              FileIO.<GenericRecord>write()
                  .via(ParquetIO.sink(SCHEMA))
                  .to(temporaryFolder.getRoot().getAbsolutePath()))
          .getPerDestinationOutputFilenames()
          .apply(Values.create())
          .apply(FileIO.matchAll())
          .apply(FileIO.readMatches())
          .apply(ParquetIO.readFiles(SCHEMA));

  PAssert.that(writeThenRead).containsInAnyOrder(records);

  mainPipeline.run().waitUntilFinish();
}
 
Example #16
Source File: TextStreamingPipeline.java    From dlp-dataflow-deidentification with Apache License 2.0
public static void main(String[] args) throws IOException, GeneralSecurityException {

    TokenizePipelineOptions options =
        PipelineOptionsFactory.fromArgs(args).withValidation().as(TokenizePipelineOptions.class);

    Pipeline p = Pipeline.create(options);
    p.apply(
            FileIO.match()
                .filepattern(options.getInputFile())
                .continuously(
                    Duration.standardSeconds(options.getPollingInterval()), Watch.Growth.never()))
        .apply(FileIO.readMatches().withCompression(Compression.UNCOMPRESSED))
        .apply(
            "Text File Reader",
            ParDo.of(
                new TextFileReader(
                    options.as(GcpOptions.class).getProject(),
                    options.getFileDecryptKeyName(),
                    options.getFileDecryptKey(),
                    options.getBatchSize(),
                    options.getCsek(),
                    options.getCsekhash())))
        .apply(
            "Tokenize Data",
            ParDo.of(
                new TokenizeData(
                    options.as(GcpOptions.class).getProject(),
                    options.getDeidentifyTemplateName(),
                    options.getInspectTemplateName())))
        .apply(
            Window.<String>into(FixedWindows.of(Duration.standardMinutes(options.getInterval()))))
        .apply(new WriteOneFilePerWindow(options.getOutputFile(), 1));

    p.run();
  }
 
Example #17
Source File: ParquetIO.java    From beam with Apache License 2.0
@Override
public PCollection<GenericRecord> expand(PBegin input) {
  checkNotNull(getFilepattern(), "Filepattern cannot be null.");

  return input
      .apply("Create filepattern", Create.ofProvider(getFilepattern(), StringUtf8Coder.of()))
      .apply(FileIO.matchAll())
      .apply(FileIO.readMatches())
      .apply(readFiles(getSchema()).withAvroDataModel(getAvroDataModel()));
}
 
Example #18
Source File: TestExpansionService.java    From beam with Apache License 2.0
@Override
public PTransform<PBegin, PCollection<GenericRecord>> buildExternal(
    StringConfiguration configuration) {
  return new PTransform<PBegin, PCollection<GenericRecord>>() {
    @Override
    public PCollection<GenericRecord> expand(PBegin input) {
      return input
          .apply(FileIO.match().filepattern(configuration.data))
          .apply(FileIO.readMatches())
          .apply(ParquetIO.readFiles(schema))
          .setCoder(AvroCoder.of(schema));
    }
  };
}
 
Example #19
Source File: Transforms.java    From nomulus with Apache License 2.0
/**
 * Returns a {@link PTransform} from file name patterns to file {@link Metadata Metadata records}.
 */
public static PTransform<PCollection<String>, PCollection<Metadata>> getFilesByPatterns() {
  return new PTransform<PCollection<String>, PCollection<Metadata>>() {
    @Override
    public PCollection<Metadata> expand(PCollection<String> input) {
      return input.apply(FileIO.matchAll().withEmptyMatchTreatment(EmptyMatchTreatment.DISALLOW));
    }
  };
}
 
Example #20
Source File: XmlIOTest.java    From beam with Apache License 2.0
@Test
public void testWriteThenReadLarger() {
  List<Bird> birds = Lists.newArrayList();
  for (int i = 0; i < 100; ++i) {
    birds.add(new Bird("Testing", "Bird number " + i));
  }
  mainPipeline
      .apply(Create.of(birds))
      .apply(
          FileIO.<Bird>write()
              .via(XmlIO.sink(Bird.class).withRootElement("birds"))
              .to(tmpFolder.getRoot().getAbsolutePath())
              .withPrefix("birds")
              .withSuffix(".xml")
              .withNumShards(1));
  mainPipeline.run();

  PCollection<Bird> readBack =
      readPipeline.apply(
          XmlIO.<Bird>read()
              .from(new File(tmpFolder.getRoot(), "birds").getAbsolutePath() + "*")
              .withRecordClass(Bird.class)
              .withRootElement("birds")
              .withRecordElement("bird")
              .withMinBundleSize(100));

  PAssert.that(readBack).containsInAnyOrder(birds);

  readPipeline.run();
}
 
Example #21
Source File: TikaIO.java    From beam with Apache License 2.0
@Override
public PCollection<ParseResult> expand(PBegin input) {
  return input
      .apply(FileIO.match().filepattern(getFilepattern()))
      .apply(FileIO.readMatches().withCompression(Compression.UNCOMPRESSED))
      .apply(parseFiles());
}
 
Example #22
Source File: ThriftIO.java    From beam with Apache License 2.0
@Override
public PCollection<T> expand(PCollection<FileIO.ReadableFile> input) {
  checkNotNull(getRecordClass(), "Record class cannot be null");
  checkNotNull(getTProtocolFactory(), "Thrift protocol cannot be null");

  return input
      .apply(ParDo.of(new ReadFn<>(getRecordClass(), getTProtocolFactory())))
      .setCoder(ThriftCoder.of(getRecordClass(), getTProtocolFactory()));
}
 
Example #23
Source File: ThriftIOTest.java    From beam with Apache License 2.0
/**
 * Tests {@link ThriftIO#sink(TProtocolFactory)} and {@link ThriftIO#readFiles(Class)} with {@link
 * TBinaryProtocol}.
 */
@Test
public void testReadWriteBinaryProtocol() {

  mainPipeline
      .apply(
          Create.of(testThriftStructs)
              .withCoder(ThriftCoder.of(TestThriftStruct.class, tBinaryProtoFactory)))
      .apply(
          FileIO.<TestThriftStruct>write()
              .via(ThriftIO.sink(tBinaryProtoFactory))
              .to(temporaryFolder.getRoot().getAbsolutePath()));

  // Execute write pipeline
  mainPipeline.run().waitUntilFinish();

  // Read written files
  PCollection<TestThriftStruct> readDocs =
      readPipeline
          .apply(
              Create.of(temporaryFolder.getRoot().getAbsolutePath() + "/*")
                  .withCoder(StringUtf8Coder.of()))
          .apply(FileIO.matchAll())
          .apply(FileIO.readMatches())
          .apply(ThriftIO.readFiles(TestThriftStruct.class).withProtocol(tBinaryProtoFactory));

  // Assert
  PAssert.that(readDocs).containsInAnyOrder(testThriftStructs);

  // Execute read pipeline
  readPipeline.run().waitUntilFinish();
}
 
Example #24
Source File: ThriftIOTest.java    From beam with Apache License 2.0
/**
 * Tests {@link ThriftIO#sink(TProtocolFactory)} and {@link ThriftIO#readFiles(Class)} with {@link
 * TJSONProtocol}.
 */
@Test
public void testReadWriteJsonProtocol() {

  mainPipeline
      .apply(
          Create.of(testThriftStructs)
              .withCoder(ThriftCoder.of(TestThriftStruct.class, tJsonProtocolFactory)))
      .apply(
          FileIO.<TestThriftStruct>write()
              .via(ThriftIO.sink(tJsonProtocolFactory))
              .to(temporaryFolder.getRoot().getAbsolutePath()));

  // Execute write pipeline
  mainPipeline.run().waitUntilFinish();

  // Read written files
  PCollection<TestThriftStruct> readDocs =
      readPipeline
          .apply(
              Create.of(temporaryFolder.getRoot().getAbsolutePath() + "/*")
                  .withCoder(StringUtf8Coder.of()))
          .apply(FileIO.matchAll())
          .apply(FileIO.readMatches())
          .apply(ThriftIO.readFiles(TestThriftStruct.class).withProtocol(tJsonProtocolFactory));

  // Assert
  PAssert.that(readDocs).containsInAnyOrder(testThriftStructs);

  // Execute read pipeline
  readPipeline.run().waitUntilFinish();
}
 
Example #25
Source File: ThriftIOTest.java    From beam with Apache License 2.0
/**
 * Tests {@link ThriftIO#sink(TProtocolFactory)} and {@link ThriftIO#readFiles(Class)} with {@link
 * TCompactProtocol}.
 */
@Test
public void testReadWriteCompactProtocol() {

  mainPipeline
      .apply(
          Create.of(testThriftStructs)
              .withCoder(ThriftCoder.of(TestThriftStruct.class, tCompactProtocolFactory)))
      .apply(
          FileIO.<TestThriftStruct>write()
              .via(ThriftIO.sink(tCompactProtocolFactory))
              .to(temporaryFolder.getRoot().getAbsolutePath()));

  // Execute write pipeline
  mainPipeline.run().waitUntilFinish();

  // Read written files
  PCollection<TestThriftStruct> readDocs =
      readPipeline
          .apply(
              Create.of(temporaryFolder.getRoot().getAbsolutePath() + "/*")
                  .withCoder(StringUtf8Coder.of()))
          .apply(FileIO.matchAll())
          .apply(FileIO.readMatches())
          .apply(
              ThriftIO.readFiles(TestThriftStruct.class).withProtocol(tCompactProtocolFactory));

  // Assert
  PAssert.that(readDocs).containsInAnyOrder(testThriftStructs);

  // Execute read pipeline
  readPipeline.run().waitUntilFinish();
}
 
Example #26
Source File: Transforms.java    From nomulus with Apache License 2.0
/**
 * Returns a {@link PTransform} from file {@link Metadata} to {@link VersionedEntity} using
 * caller-provided {@code transformer}.
 */
static PTransform<PCollection<Metadata>, PCollection<VersionedEntity>> processFiles(
    DoFn<ReadableFile, VersionedEntity> transformer) {
  return new PTransform<PCollection<Metadata>, PCollection<VersionedEntity>>() {
    @Override
    public PCollection<VersionedEntity> expand(PCollection<Metadata> input) {
      return input
          .apply(FileIO.readMatches().withCompression(Compression.UNCOMPRESSED))
          .apply(transformer.getClass().getSimpleName(), ParDo.of(transformer));
      // TODO(weiminyu): reshuffle to enable dynamic work rebalance per beam dev guide
    }
  };
}
 
Example #27
Source File: SnowflakeIO.java    From beam with Apache License 2.0
@Override
public PCollection<T> expand(PBegin input) {
  checkArguments();

  String tmpDirName = makeTmpDirName();
  String stagingBucketDir = String.format("%s/%s/", getStagingBucketName(), tmpDirName);

  PCollection<Void> emptyCollection = input.apply(Create.of((Void) null));

  PCollection<T> output =
      emptyCollection
          .apply(
              ParDo.of(
                  new CopyIntoStageFn(
                      getDataSourceProviderFn(),
                      getQuery(),
                      getTable(),
                      getStorageIntegrationName(),
                      stagingBucketDir,
                      getSnowflakeService())))
          .apply(Reshuffle.viaRandomKey())
          .apply(FileIO.matchAll())
          .apply(FileIO.readMatches())
          .apply(readFiles())
          .apply(ParDo.of(new MapCsvToStringArrayFn()))
          .apply(ParDo.of(new MapStringArrayToUserDataFn<>(getCsvMapper())));

  output.setCoder(getCoder());

  emptyCollection
      .apply(Wait.on(output))
      .apply(ParDo.of(new CleanTmpFilesFromGcsFn(stagingBucketDir)));
  return output;
}
 
Example #28
Source File: SnowflakeIO.java    From beam with Apache License 2.0
private PCollection<String> writeFiles(PCollection<T> input, String stagingBucketDir) {

  PCollection<String> mappedUserData =
      input
          .apply(
              MapElements.via(
                  new SimpleFunction<T, Object[]>() {
                    @Override
                    public Object[] apply(T element) {
                      return getUserDataMapper().mapRow(element);
                    }
                  }))
          .apply("Map Objects array to CSV lines", ParDo.of(new MapObjectsArrayToCsvFn()))
          .setCoder(StringUtf8Coder.of());

  WriteFilesResult filesResult =
      mappedUserData.apply(
          "Write files to specified location",
          FileIO.<String>write()
              .via(TextIO.sink())
              .to(stagingBucketDir)
              .withPrefix(getFileNameTemplate())
              .withSuffix(".csv")
              .withCompression(Compression.GZIP));

  return (PCollection)
      filesResult
          .getPerDestinationOutputFilenames()
          .apply("Parse KV filenames to Strings", Values.<String>create());
}
 
Example #29
Source File: ParquetIO.java    From beam with Apache License 2.0
@Override
public PCollection<GenericRecord> expand(PCollection<FileIO.ReadableFile> input) {
  checkNotNull(getSchema(), "Schema can not be null");
  return input
      .apply(ParDo.of(new ReadFn(getAvroDataModel())))
      .setCoder(AvroCoder.of(getSchema()));
}
 
Example #30
Source File: ImportTransform.java    From DataflowTemplates with Apache License 2.0
@Override
public PCollection<Export> expand(PBegin input) {
  NestedValueProvider<String, String> manifestFile =
      NestedValueProvider.of(importDirectory, s -> GcsUtil.joinPath(s, "spanner-export.json"));
  return input
      .apply("Read manifest", FileIO.match().filepattern(manifestFile))
      .apply(
          "Resource id",
          MapElements.into(TypeDescriptor.of(ResourceId.class))
              .via((MatchResult.Metadata::resourceId)))
      .apply(
          "Read manifest json",
          MapElements.into(TypeDescriptor.of(Export.class))
              .via(ReadExportManifestFile::readManifest));
}