Java Code Examples for org.apache.beam.sdk.io.FileSystems#matchSingleFileSpec()

The following examples show how to use org.apache.beam.sdk.io.FileSystems#matchSingleFileSpec(). They are extracted from open source projects; the project and source file are noted above each example.
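All of the examples follow the same basic pattern: match the spec to exactly one file, open a ReadableByteChannel for its resource ID, then wrap the channel in a standard InputStream. Here is a minimal self-contained sketch of that pattern (the class name and the UTF-8 decoding are illustrative assumptions; Guava's CharStreams is assumed on the classpath, as it is in the projects below):

import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.channels.Channels;
import java.nio.charset.StandardCharsets;

import org.apache.beam.sdk.io.FileSystems;
import org.apache.beam.sdk.io.fs.MatchResult.Metadata;

import com.google.common.io.CharStreams;

public class MatchSingleFileSpecSketch {

  /** Reads the single file matching {@code fileSpec} into a UTF-8 string. */
  static String readFileSpec(String fileSpec) throws IOException {
    // matchSingleFileSpec throws an IOException unless the spec resolves to
    // exactly one existing file.
    Metadata metadata = FileSystems.matchSingleFileSpec(fileSpec);
    // FileSystems.open returns a ReadableByteChannel; wrapping it in an
    // InputStream lets ordinary java.io utilities consume it. Closing the
    // stream also closes the underlying channel.
    try (InputStream inputStream =
        Channels.newInputStream(FileSystems.open(metadata.resourceId()))) {
      return CharStreams.toString(new InputStreamReader(inputStream, StandardCharsets.UTF_8));
    }
  }
}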
Example 1
Source File: GeoCityLookup.java    From gcp-ingestion with Mozilla Public License 2.0
/**
 * Returns a singleton object for reading from the GeoCity database.
 *
 * <p>We copy the configured database file to a static temp location so that the MaxMind API can
 * save on heap usage by using memory mapping. The reader is threadsafe and this singleton pattern
 * allows multiple worker threads on the same machine to share a single reader instance.
 *
 * <p>Note that we do not clean up the temp mmdb file, but it's a static path, so running locally
 * will overwrite the existing path every time rather than creating an unbounded number of copies.
 * This also assumes that only one JVM per machine is running this code. In the production case
 * where this is running on Cloud Dataflow, we should always have a clean environment and the temp
 * state will be cleaned up along with the workers once the job finishes. However, behavior is
 * undefined if you run multiple local jobs concurrently.
 *
 * @throws IOException if the configured file path is not a valid .mmdb file
 */
private static synchronized DatabaseReader getOrCreateSingletonGeoCityReader(
    ValueProvider<String> geoCityDatabase) throws IOException {
  if (singletonGeoCityReader == null) {
    File mmdb;
    try {
      Metadata metadata = FileSystems.matchSingleFileSpec(geoCityDatabase.get());
      // Close the stream (and the underlying channel) once the copy completes.
      try (InputStream inputStream =
          Channels.newInputStream(FileSystems.open(metadata.resourceId()))) {
        Path mmdbPath = Paths.get(System.getProperty("java.io.tmpdir"), "GeoCityLookup.mmdb");
        Files.copy(inputStream, mmdbPath, StandardCopyOption.REPLACE_EXISTING);
        mmdb = mmdbPath.toFile();
      }
    } catch (IOException e) {
      throw new IOException("Exception thrown while fetching configured geoCityDatabase", e);
    }
    singletonGeoCityReader = new DatabaseReader.Builder(mmdb).withCache(new CHMCache()).build();
  }
  return singletonGeoCityReader;
}
 
Example 2
Source File: StreamingDataGenerator.java    From DataflowTemplates with Apache License 2.0
@Setup
public void setup() throws IOException {
  dataGenerator = new JsonDataGeneratorImpl();

  Metadata metadata = FileSystems.matchSingleFileSpec(schemaLocation);

  // Copy the schema file into a string which can be used for generation.
  try (ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream()) {
    try (ReadableByteChannel readerChannel = FileSystems.open(metadata.resourceId())) {
      try (WritableByteChannel writerChannel = Channels.newChannel(byteArrayOutputStream)) {
        ByteStreams.copy(readerChannel, writerChannel);
      }
    }

    schema = byteArrayOutputStream.toString();
  }
}
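Note that byteArrayOutputStream.toString() decodes the bytes with the platform default charset. If the schema file should always be treated as UTF-8 (an assumption about the template's intent, not something the original code states), byteArrayOutputStream.toString(StandardCharsets.UTF_8.name()) would make the decoding explicit.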
 
Example 3
Source File: BulkCompressorTest.java    From DataflowTemplates with Apache License 2.0
/** Tests that the {@link BulkCompressor.Compressor} performs compression properly. */
@Test
public void testCompressFile() throws Exception {
  // Setup test
  final Compression compression = Compression.GZIP;

  final ValueProvider<String> outputDirectoryProvider =
      pipeline.newProvider(tempFolderCompressedPath.toString());

  final ValueProvider<Compression> compressionProvider = StaticValueProvider.of(compression);

  final Metadata metadata = FileSystems.matchSingleFileSpec(textFile.toString());

  // Execute the compressor
  PCollection<String> lines = pipeline
      .apply("Create File Input", Create.of(metadata))
      .apply("Compress", ParDo.of(new Compressor(outputDirectoryProvider, compressionProvider)))
      .apply("Read the Files", TextIO.readAll().withCompression(Compression.AUTO));

  // Test the result
  PAssert.that(lines).containsInAnyOrder(FILE_CONTENT);
  pipeline.run();
}
 
Example 4
Source File: HashClientInfo.java    From gcp-ingestion with Mozilla Public License 2.0
@VisibleForTesting
byte[] readBytes(String uri) throws IOException {
  Metadata metadata = FileSystems.matchSingleFileSpec(uri);
  ReadableByteChannel inputChannel = FileSystems.open(metadata.resourceId());
  try (InputStream inputStream = Channels.newInputStream(inputChannel)) {
    byte[] key = new byte[32];
    int bytesRead = inputStream.read(key);
    if (bytesRead != 32) {
      throw new KeyLengthMismatchException(bytesRead);
    }
    return key;
  }
}
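A caveat worth noting about the example above: InputStream.read(byte[]) may return fewer bytes than requested even when the stream is not at end-of-file, so the length check could fail spuriously on a short read. A more defensive sketch (reusing the project's KeyLengthMismatchException, and Guava's ByteStreams.read, which blocks until the buffer is full or the stream ends):

@VisibleForTesting
byte[] readBytes(String uri) throws IOException {
  Metadata metadata = FileSystems.matchSingleFileSpec(uri);
  try (InputStream inputStream =
      Channels.newInputStream(FileSystems.open(metadata.resourceId()))) {
    byte[] key = new byte[32];
    // ByteStreams.read keeps reading until 32 bytes arrive or the stream ends,
    // so a short count here genuinely means the key file is too small.
    int bytesRead = ByteStreams.read(inputStream, key, 0, key.length);
    if (bytesRead != key.length) {
      throw new KeyLengthMismatchException(bytesRead);
    }
    return key;
  }
}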
 
Example 5
Source File: GeoCityLookup.java    From gcp-ingestion with Mozilla Public License 2.0
/**
 * Returns a singleton object describing allowed cities.
 *
 * @throws IOException if the configured file path does not exist or is in a bad format
 */
private static synchronized Set<Integer> getOrCreateSingletonAllowedCities(
    ValueProvider<String> geoCityFilter) throws IOException {
  if (singletonAllowedCities == null) {
    InputStream inputStream;
    try {
      Metadata metadata = FileSystems.matchSingleFileSpec(geoCityFilter.get());
      ReadableByteChannel channel = FileSystems.open(metadata.resourceId());
      inputStream = Channels.newInputStream(channel);
    } catch (IOException e) {
      throw new IOException("Exception thrown while fetching configured geoCityFilter", e);
    }
    singletonAllowedCities = new HashSet<>();
    // readLine() returns null at end of stream; BufferedReader.ready() only
    // reports whether a read would block, so it is not a reliable EOF check.
    try (BufferedReader reader = new BufferedReader(new InputStreamReader(inputStream))) {
      String line;
      while ((line = reader.readLine()) != null) {
        Matcher matcher = GEO_NAME_PATTERN.matcher(line);
        if (matcher.find()) {
          Integer geoNameId = Integer.valueOf(matcher.group(1));
          singletonAllowedCities.add(geoNameId);
        } else {
          throw new IllegalStateException(
              "Line of geoCityFilter file does not begin with a geoName integer ID: " + line);
        }
      }
    }
  }
  return singletonAllowedCities;
}
 
Example 6
Source File: BulkDecompressorTest.java    From DataflowTemplates with Apache License 2.0
/** Tests that the {@link BulkDecompressor.Decompress} performs decompression properly. */
@Test
public void testDecompressCompressedFile() throws Exception {
  // Arrange
  //
  final ValueProvider<String> outputDirectory =
      pipeline.newProvider(tempFolderOutputPath.toString());

  final Metadata compressedFile1Metadata =
      FileSystems.matchSingleFileSpec(compressedFile.toString());

  final Metadata compressedFile2Metadata =
      FileSystems.matchSingleFileSpec(wrongCompressionExtFile.toString());

  final String expectedOutputFilename = Files.getNameWithoutExtension(compressedFile.toString());

  final String expectedOutputFilePath =
      tempFolderOutputPath.resolve(expectedOutputFilename).normalize().toString();

  // Act
  //
  PCollectionTuple decompressOut =
      pipeline
          .apply("CreateWorkItems", Create.of(compressedFile1Metadata, compressedFile2Metadata))
          .apply(
              "Decompress",
              ParDo.of(new Decompress(outputDirectory))
                  .withOutputTags(DECOMPRESS_MAIN_OUT_TAG, TupleTagList.of(DEADLETTER_TAG)));

  // Assert
  //
  PAssert.that(decompressOut.get(DECOMPRESS_MAIN_OUT_TAG))
      .containsInAnyOrder(expectedOutputFilePath);

  PAssert.that(decompressOut.get(DEADLETTER_TAG))
      .satisfies(
          collection -> {
            KV<String, String> kv = collection.iterator().next();
            assertThat(kv.getKey(), is(equalTo(compressedFile2Metadata.resourceId().toString())));
            assertThat(kv.getValue(), is(notNullValue()));
            return null;
          });

  PipelineResult result = pipeline.run();
  result.waitUntilFinish();

  // Validate the uncompressed file written has the expected file content.
  PCollection<String> validatorOut =
      validatorPipeline.apply("ReadOutputFile", TextIO.read().from(expectedOutputFilePath));

  PAssert.that(validatorOut).containsInAnyOrder(FILE_CONTENT);

  validatorPipeline.run();
}
 
Example 7
Source File: BulkDecompressorTest.java    From DataflowTemplates with Apache License 2.0
/** Tests {@link BulkDecompressor.Decompress} behavior when a matched file is uncompressed. */
@Test
public void testDecompressUncompressedFile() throws Exception {
  // Arrange
  //
  final ValueProvider<String> outputDirectory =
      pipeline.newProvider(tempFolderOutputPath.toString());

  final Metadata uncompressedFileMetadata =
      FileSystems.matchSingleFileSpec(uncompressedFile.toString());

  // Act
  //
  PCollectionTuple decompressOut =
      pipeline
          .apply("CreateWorkItems", Create.of(uncompressedFileMetadata))
          .apply(
              "Decompress",
              ParDo.of(new Decompress(outputDirectory))
                  .withOutputTags(DECOMPRESS_MAIN_OUT_TAG, TupleTagList.of(DEADLETTER_TAG)));

  // Assert
  //
  PAssert.that(decompressOut.get(DECOMPRESS_MAIN_OUT_TAG)).empty();
  PAssert.that(decompressOut.get(DEADLETTER_TAG))
      .satisfies(
          collection -> {
            KV<String, String> kv = collection.iterator().next();
            assertThat(
                kv.getKey(), is(equalTo(uncompressedFileMetadata.resourceId().toString())));
            assertThat(
                kv.getValue(),
                containsString(
                    String.format(
                        "The file resource %s is malformed or not in %s compressed format.",
                        uncompressedFile.toString(), Compression.BZIP2)));
            return null;
          });

  pipeline.run();
}
 
Example 8
Source File: BulkDecompressorTest.java    From DataflowTemplates with Apache License 2.0
/**
 * Tests {@link BulkDecompressor.Decompress} behavior when a matched file does not match a known
 * compression format.
 */
@Test
public void testDecompressUnknownCompressionFile() throws Exception {
  // Arrange
  //
  final ValueProvider<String> outputDirectory =
      pipeline.newProvider(tempFolderOutputPath.toString());

  final Metadata unknownCompressionFileMetadata =
      FileSystems.matchSingleFileSpec(unknownCompressionFile.toString());

  // Act
  //
  PCollectionTuple decompressOut =
      pipeline
          .apply("CreateWorkItems", Create.of(unknownCompressionFileMetadata))
          .apply(
              "Decompress",
              ParDo.of(new Decompress(outputDirectory))
                  .withOutputTags(DECOMPRESS_MAIN_OUT_TAG, TupleTagList.of(DEADLETTER_TAG)));

  // Assert
  //
  PAssert.that(decompressOut.get(DECOMPRESS_MAIN_OUT_TAG)).empty();
  PAssert.that(decompressOut.get(DEADLETTER_TAG))
      .satisfies(
          collection -> {
            KV<String, String> kv = collection.iterator().next();
            assertThat(
                kv.getKey(), is(equalTo(unknownCompressionFileMetadata.resourceId().toString())));
            assertThat(
                kv.getValue(),
                containsString(
                    String.format(
                        BulkDecompressor.UNCOMPRESSED_ERROR_MSG,
                        unknownCompressionFile.toString(),
                        BulkDecompressor.SUPPORTED_COMPRESSIONS)));
            return null;
          });

  pipeline.run();
}
 
Example 9
Source File: BeamJdbcAvroSchema.java    From dbeam with Apache License 2.0
public static Schema parseInputAvroSchemaFile(final String filename) throws IOException {
  MatchResult.Metadata m = FileSystems.matchSingleFileSpec(filename);
  // Closing the stream closes the underlying channel as well.
  try (InputStream inputStream = Channels.newInputStream(FileSystems.open(m.resourceId()))) {
    return new Schema.Parser().parse(inputStream);
  }
}
 
Example 10
Source File: BeamHelper.java    From dbeam with Apache License 2.0
public static String readFromFile(final String fileSpec) throws IOException {
  MatchResult.Metadata m = FileSystems.matchSingleFileSpec(fileSpec);
  // Closing the stream closes the underlying channel as well.
  try (InputStream inputStream = Channels.newInputStream(FileSystems.open(m.resourceId()))) {
    return CharStreams.toString(new InputStreamReader(inputStream, Charsets.UTF_8));
  }
}