org.apache.beam.sdk.io.fs.MatchResult.Metadata Java Examples

The following examples show how to use org.apache.beam.sdk.io.fs.MatchResult.Metadata. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: CompressedSource.java    From beam with Apache License 2.0 6 votes vote down vote up
/**
 * Builds a {@code CompressedSource} that reads one individual file. Invoked from {@link
 * CompressedSource#createForSubrangeOfFile}.
 *
 * <p>A non-zero {@code startOffset} is rejected unless the source reports itself as splittable.
 *
 * @throws RuntimeException if the splittability check itself fails
 * @throws IllegalArgumentException if the source is not splittable and {@code startOffset != 0}
 */
private CompressedSource(
    FileBasedSource<T> sourceDelegate,
    DecompressingChannelFactory channelFactory,
    Metadata metadata,
    long minBundleSize,
    long startOffset,
    long endOffset) {
  super(metadata, minBundleSize, startOffset, endOffset);
  this.sourceDelegate = sourceDelegate;
  this.channelFactory = channelFactory;

  final boolean canSplit;
  try {
    canSplit = isSplittable();
  } catch (Exception e) {
    throw new RuntimeException("Failed to determine if the source is splittable", e);
  }

  // Only a splittable source is allowed to begin anywhere other than offset 0.
  boolean offsetAllowed = canSplit || startOffset == 0;
  checkArgument(
      offsetAllowed,
      "CompressedSources must start reading at offset 0. Requested offset: %s",
      startOffset);
}
 
Example #2
Source File: SplitIntoRangesFn.java    From DataflowTemplates with Apache License 2.0 6 votes vote down vote up
/**
 * Emits one or more {@link FileShard}s for the incoming file.
 *
 * <p>The table name is looked up in the side-input map by the file's resource id string. A file
 * that is not read-seek-efficient is emitted as a single shard covering the whole file; any other
 * file is split into ranges of {@code desiredBundleSize}.
 *
 * @throws FileNotFoundException if the side-input map has no table name for the file
 */
@ProcessElement
public void processElement(ProcessContext c) throws FileNotFoundException {
  Map<String, String> tableNamesByFile = c.sideInput(filenamesToTableNamesMapView);
  Metadata fileMetadata = c.element().getMetadata();
  String path = fileMetadata.resourceId().toString();

  String table = tableNamesByFile.get(path);
  if (table == null) {
    throw new FileNotFoundException(
        "Unknown table name for file:" + path + " in map " + tableNamesByFile);
  }

  OffsetRange wholeFile = new OffsetRange(0, fileMetadata.sizeBytes());
  if (fileMetadata.isReadSeekEfficient()) {
    // Seeking is cheap, so fan the file out into several shards.
    for (OffsetRange shardRange : wholeFile.split(desiredBundleSize, 0)) {
      c.output(FileShard.create(table, c.element(), shardRange));
    }
    return;
  }

  // Seeking is expensive: keep the file as one shard.
  c.output(FileShard.create(table, c.element(), wholeFile));
}
 
Example #3
Source File: FileBasedSourceTest.java    From beam with Apache License 2.0 6 votes vote down vote up
/**
 * Reads one file through two adjacent sources, [0, 162) and [162, Long.MAX_VALUE), and checks that
 * together they yield exactly the records that were written.
 */
@Test
public void testReadRangeAtEnd() throws IOException {
  PipelineOptions options = PipelineOptionsFactory.create();
  List<String> data = createStringDataset(3, 50);
  File file = createFileWithData("file", data);

  Metadata metadata = FileSystems.matchSingleFileSpec(file.getPath());
  TestFileBasedSource head = new TestFileBasedSource(metadata, 64, 0, 162, null);
  TestFileBasedSource tail = new TestFileBasedSource(metadata, 1024, 162, Long.MAX_VALUE, null);

  // Gather everything read from the first range, then append the second range.
  List<String> results = new ArrayList<>(readFromSource(head, options));
  results.addAll(readFromSource(tail, options));

  assertThat(data, containsInAnyOrder(results.toArray()));
}
 
Example #4
Source File: FileBasedSourceTest.java    From beam with Apache License 2.0 6 votes vote down vote up
/**
 * Checks which split-at-fraction requests succeed and which fail on a source that covers a whole
 * file, for several combinations of the numeric position argument and the split fraction.
 */
@Test
public void testSplitAtFraction() throws Exception {
  PipelineOptions options = PipelineOptionsFactory.create();
  List<String> records = createStringDataset(3, 100);
  File dataFile = createFileWithData("file", records);

  Metadata metadata = FileSystems.matchSingleFileSpec(dataFile.getPath());
  long fileSize = dataFile.length();
  TestFileBasedSource source = new TestFileBasedSource(metadata, 1, 0, fileSize, null);

  // Shouldn't be able to split while unstarted.
  assertSplitAtFractionFails(source, 0, 0.7, options);

  assertSplitAtFractionSucceedsAndConsistent(source, 1, 0.7, options);
  assertSplitAtFractionSucceedsAndConsistent(source, 30, 0.7, options);

  assertSplitAtFractionFails(source, 0, 0.0, options);
  assertSplitAtFractionFails(source, 70, 0.3, options);
  assertSplitAtFractionFails(source, 100, 1.0, options);
  assertSplitAtFractionFails(source, 100, 0.99, options);

  assertSplitAtFractionSucceedsAndConsistent(source, 100, 0.995, options);
}
 
Example #5
Source File: StreamingDataGenerator.java    From DataflowTemplates with Apache License 2.0 6 votes vote down vote up
/**
 * Creates the data generator and loads the schema file at {@code schemaLocation} into the
 * {@code schema} string.
 *
 * @throws IOException if the schema location cannot be matched to a single file, or if the file
 *     cannot be opened, read or decoded
 */
@Setup
public void setup() throws IOException {
  dataGenerator = new JsonDataGeneratorImpl();

  Metadata metadata = FileSystems.matchSingleFileSpec(schemaLocation);

  // Copy the schema file into a string which can be used for generation.
  try (ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream()) {
    try (ReadableByteChannel readerChannel = FileSystems.open(metadata.resourceId());
        WritableByteChannel writerChannel = Channels.newChannel(byteArrayOutputStream)) {
      ByteStreams.copy(readerChannel, writerChannel);
    }

    // Decode explicitly as UTF-8 instead of the JVM's platform default charset, so the schema
    // text does not depend on how each worker happens to be configured.
    // NOTE(review): assumes the schema file is UTF-8 encoded; confirm against producers.
    schema = byteArrayOutputStream.toString("UTF-8");
  }
}
 
Example #6
Source File: GeoCityLookup.java    From gcp-ingestion with Mozilla Public License 2.0 6 votes vote down vote up
/**
 * Returns a singleton object for reading from the GeoCity database.
 *
 * <p>We copy the configured database file to a static temp location so that the MaxMind API can
 * save on heap usage by using memory mapping. The reader is threadsafe and this singleton pattern
 * allows multiple worker threads on the same machine to share a single reader instance.
 *
 * <p>Note that we do not clean up the temp mmdb file, but it's a static path, so running locally
 * will overwrite the existing path every time rather than creating an unbounded number of copies.
 * This also assumes that only one JVM per machine is running this code. In the production case
 * where this is running on Cloud Dataflow, we should always have a clean environment and the temp
 * state will be cleaned up along with the workers once the job finishes. However, behavior is
 * undefined if you run multiple local jobs concurrently.
 *
 * <p>The channel and stream opened on the configured database file are closed as soon as the copy
 * completes, whether or not it succeeds.
 *
 * @param geoCityDatabase provider of the location of the database file
 * @return the shared reader, created on the first call
 * @throws IOException if the configured file path is not a valid .mmdb file
 */
private static synchronized DatabaseReader getOrCreateSingletonGeoCityReader(
    ValueProvider<String> geoCityDatabase) throws IOException {
  if (singletonGeoCityReader == null) {
    File mmdb;
    try {
      Metadata metadata = FileSystems.matchSingleFileSpec(geoCityDatabase.get());
      Path mmdbPath = Paths.get(System.getProperty("java.io.tmpdir"), "GeoCityLookup.mmdb");
      // try-with-resources guarantees the source channel and stream are released; previously
      // they were left open after the copy.
      try (ReadableByteChannel channel = FileSystems.open(metadata.resourceId());
          InputStream inputStream = Channels.newInputStream(channel)) {
        Files.copy(inputStream, mmdbPath, StandardCopyOption.REPLACE_EXISTING);
      }
      mmdb = mmdbPath.toFile();
    } catch (IOException e) {
      throw new IOException("Exception thrown while fetching configured geoCityDatabase", e);
    }
    singletonGeoCityReader = new DatabaseReader.Builder(mmdb).withCache(new CHMCache()).build();
  }
  return singletonGeoCityReader;
}
 
Example #7
Source File: BulkCompressorTest.java    From DataflowTemplates with Apache License 2.0 6 votes vote down vote up
/**
 * Runs {@link BulkCompressor.Compressor} with GZIP over one text file and verifies that reading
 * the compressed output back yields the original file content.
 */
@Test
public void testCompressFile() throws Exception {
  // Arrange: output directory, compression setting and the single input file.
  final ValueProvider<String> outputDirectoryProvider =
      pipeline.newProvider(tempFolderCompressedPath.toString());
  final ValueProvider<Compression> compressionProvider =
      StaticValueProvider.of(Compression.GZIP);
  final Metadata inputFile = FileSystems.matchSingleFileSpec(textFile.toString());

  // Act: compress the file, then read the produced files back with auto-detected compression.
  PCollection<String> lines =
      pipeline
          .apply("Create File Input", Create.of(inputFile))
          .apply(
              "Compress", ParDo.of(new Compressor(outputDirectoryProvider, compressionProvider)))
          .apply("Read the Files", TextIO.readAll().withCompression(Compression.AUTO));

  // Assert: the round trip preserves the content.
  PAssert.that(lines).containsInAnyOrder(FILE_CONTENT);
  pipeline.run();
}
 
Example #8
Source File: HadoopFileSystemTest.java    From beam with Apache License 2.0 6 votes vote down vote up
/**
 * Matching the path of a directory returns an OK result holding a single metadata entry for the
 * directory itself, with size 0.
 */
@Test
public void testMatchDirectory() throws Exception {
  create("dir/file", "data".getBytes(StandardCharsets.UTF_8));

  List<String> specs = Collections.singletonList(testPath("dir").toString());
  final MatchResult actual = Iterables.getOnlyElement(fileSystem.match(specs));

  Metadata expectedDirMetadata =
      Metadata.builder()
          .setResourceId(testPath("dir"))
          .setIsReadSeekEfficient(true)
          .setSizeBytes(0L)
          .setLastModifiedMillis(lastModified("dir"))
          .build();
  MatchResult expected = MatchResult.create(Status.OK, ImmutableList.of(expectedDirMetadata));

  assertThat(actual, equalTo(expected));
}
 
Example #9
Source File: FileBasedSourceTest.java    From beam with Apache License 2.0 6 votes vote down vote up
/**
 * Reads one file through two adjacent sources, [0, 25) and [25, Long.MAX_VALUE), and checks that
 * together they yield exactly the records that were written.
 */
@Test
public void testReadRangeAtStart() throws IOException {
  PipelineOptions options = PipelineOptionsFactory.create();
  List<String> data = createStringDataset(3, 50);
  File file = createFileWithData("file", data);

  Metadata metadata = FileSystems.matchSingleFileSpec(file.getPath());
  TestFileBasedSource leading = new TestFileBasedSource(metadata, 64, 0, 25, null);
  TestFileBasedSource remainder = new TestFileBasedSource(metadata, 64, 25, Long.MAX_VALUE, null);

  // Gather everything read from the first range, then append the second range.
  List<String> results = new ArrayList<>(readFromSource(leading, options));
  results.addAll(readFromSource(remainder, options));

  assertThat(data, containsInAnyOrder(results.toArray()));
}
 
Example #10
Source File: FilePatternMatchingShardedFile.java    From beam with Apache License 2.0 6 votes vote down vote up
/**
 * Reads every line of every given file into one list, decoding the files as UTF-8.
 *
 * <p>Intended for tests on small data only: everything is read serially by the calling thread and
 * held in memory, which is not reasonable for large inputs.
 *
 * @param files metadata of the files to read
 * @return the lines of all files, in iteration order of {@code files}
 * @throws IOException if a file cannot be opened or read
 */
@VisibleForTesting
List<String> readLines(Collection<Metadata> files) throws IOException {
  String utf8 = StandardCharsets.UTF_8.name();
  List<String> collected = Lists.newArrayList();
  int position = 0;
  for (Metadata file : files) {
    position++;
    try (Reader reader = Channels.newReader(FileSystems.open(file.resourceId()), utf8)) {
      List<String> lines = CharStreams.readLines(reader);
      collected.addAll(lines);
      LOG.debug(
          "[{} of {}] Read {} lines from file: {}", position, files.size(), lines.size(), file);
    }
  }
  return collected;
}
 
Example #11
Source File: NumberedShardedFile.java    From beam with Apache License 2.0 6 votes vote down vote up
/**
 * Concatenates the lines of all the given files, read as UTF-8 text.
 *
 * <p>Only suitable for testing with small data: the files are processed one after another on a
 * single thread and their full contents are kept in memory.
 *
 * @param files metadata of the files to read
 * @return all lines, file by file, in iteration order of {@code files}
 * @throws IOException if a file cannot be opened or read
 */
@VisibleForTesting
List<String> readLines(Collection<Metadata> files) throws IOException {
  List<String> result = Lists.newArrayList();
  int fileNumber = 1;
  for (Metadata file : files) {
    List<String> fileLines;
    try (Reader reader =
        Channels.newReader(FileSystems.open(file.resourceId()), StandardCharsets.UTF_8.name())) {
      fileLines = CharStreams.readLines(reader);
      result.addAll(fileLines);
      LOG.debug(
          "[{} of {}] Read {} lines from file: {}",
          fileNumber,
          files.size(),
          fileLines.size(),
          file);
    }
    fileNumber += 1;
  }
  return result;
}
 
Example #12
Source File: NumberedShardedFile.java    From beam with Apache License 2.0 6 votes vote down vote up
/**
 * Verifies the number of files against the shard count encoded in a shard name.
 *
 * <p>The first file whose name matches the shard template decides the outcome: its
 * {@code numshards} group is parsed and compared with the number of given files. Files without a
 * file name, or whose name does not match the template, are skipped. If no template is specified,
 * "SSSS-of-NNNN" will be used as default, and "NNNN" will be the expected total number of files.
 *
 * @return {@code true} if at least one shard name matches template and total number of given
 *     files equals the number that is parsed from shard name.
 */
@VisibleForTesting
boolean checkTotalNumOfFiles(Collection<Metadata> files) {
  for (Metadata fileMetadata : files) {
    String name = fileMetadata.resourceId().getFilename();
    // A null name means the path has zero elements; there is nothing to match.
    if (name != null) {
      Matcher shardMatcher = shardTemplate.matcher(name);
      if (shardMatcher.matches()) {
        // First matching shard name wins: compare its declared total with the file list.
        int declaredShards = Integer.parseInt(shardMatcher.group("numshards"));
        return files.size() == declaredShards;
      }
      // Otherwise fall through and try the next shard.
    }
  }
  return false;
}
 
Example #13
Source File: AvroSourceTest.java    From beam with Apache License 2.0 6 votes vote down vote up
/**
 * An {@code AvroSource} built from file {@link Metadata}, and the sources derived from it via
 * {@code withSchema} and {@code withMinBundleSize}, all report single-file mode.
 */
@Test
public void testCreateFromMetadata() throws Exception {
  List<Bird> expected = createRandomRecords(DEFAULT_RECORD_COUNT);
  String codec = DataFileConstants.NULL_CODEC;
  String filename =
      generateTestFile(
          codec, expected, SyncBehavior.SYNC_DEFAULT, 0, AvroCoder.of(Bird.class), codec);
  Metadata fileMeta = FileSystems.matchSingleFileSpec(filename);

  AvroSource<GenericRecord> genericSource = AvroSource.from(fileMeta);
  AvroSource<Bird> typedSource = genericSource.withSchema(Bird.class);
  AvroSource<Bird> typedSourceWithMinBundle = typedSource.withMinBundleSize(1234);

  FileBasedSource.Mode singleFile = FileBasedSource.Mode.SINGLE_FILE_OR_SUBRANGE;
  assertEquals(singleFile, genericSource.getMode());
  assertEquals(singleFile, typedSource.getMode());
  assertEquals(singleFile, typedSourceWithMinBundle.getMode());
}
 
Example #14
Source File: AvroSourceTest.java    From beam with Apache License 2.0 6 votes vote down vote up
/**
 * For each supported codec, writes a test file with that codec and checks that the metadata read
 * back from the file reports the same codec.
 */
@Test
public void testReadMetadataWithCodecs() throws Exception {
  // Test reading files generated using all codecs.
  String[] allCodecs = {
    DataFileConstants.NULL_CODEC,
    DataFileConstants.BZIP2_CODEC,
    DataFileConstants.DEFLATE_CODEC,
    DataFileConstants.SNAPPY_CODEC,
    DataFileConstants.XZ_CODEC
  };
  List<Bird> records = createRandomRecords(DEFAULT_RECORD_COUNT);

  for (String codec : allCodecs) {
    String path =
        generateTestFile(
            codec, records, SyncBehavior.SYNC_DEFAULT, 0, AvroCoder.of(Bird.class), codec);
    Metadata fileMeta = FileSystems.matchSingleFileSpec(path);

    AvroMetadata avroMetadata = AvroSource.readMetadataFromFile(fileMeta.resourceId());

    assertEquals(codec, avroMetadata.getCodec());
  }
}
 
Example #15
Source File: FileSystems.java    From beam with Apache License 2.0 6 votes vote down vote up
/**
 * Resolves {@code spec} to the {@link Metadata} of the one file resource it identifies.
 *
 * @param spec a resource specification that matches exactly one result.
 * @return the {@link Metadata} for the specified resource.
 * @throws FileNotFoundException if the match status is {@code NOT_FOUND}.
 * @throws IOException in the event of an error in the inner call to {@link #match}, if the match
 *     status is anything other than {@code OK}, or if the given spec does not match exactly 1
 *     result.
 */
public static Metadata matchSingleFileSpec(String spec) throws IOException {
  MatchResult matchResult =
      Iterables.getOnlyElement(FileSystems.match(Collections.singletonList(spec)));

  Status status = matchResult.status();
  if (status == Status.NOT_FOUND) {
    throw new FileNotFoundException(String.format("File spec %s not found", spec));
  }
  if (status != Status.OK) {
    throw new IOException(
        String.format("Error matching file spec %s: status %s", spec, status));
  }

  List<Metadata> matched = matchResult.metadata();
  if (matched.size() == 1) {
    return matched.get(0);
  }
  throw new IOException(
      String.format(
          "Expecting spec %s to match exactly one file, but matched %s: %s",
          spec, matched.size(), matched));
}
 
Example #16
Source File: TFRecordIOTest.java    From beam with Apache License 2.0 6 votes vote down vote up
/**
 * Checks the name of the output produced by {@code TFRecordIO.readFiles()}, both when applied
 * without an explicit name and when applied as "MyRead".
 */
@Test
public void testReadFilesNamed() {
  readPipeline.enableAbandonedNodeEnforcement(false);

  Metadata fileMetadata =
      Metadata.builder()
          .setResourceId(FileSystems.matchNewResource("file", false /* isDirectory */))
          .setIsReadSeekEfficient(true)
          .setSizeBytes(1024)
          .build();
  Create.Values<ReadableFile> create =
      Create.of(new ReadableFile(fileMetadata, Compression.AUTO));

  // Applied without a name: the output name starts with the transform's default name.
  String defaultName = readPipeline.apply(create).apply(TFRecordIO.readFiles()).getName();
  assertEquals(
      "TFRecordIO.ReadFiles/Read all via FileBasedSource/Read ranges/ParMultiDo(ReadFileRanges).output",
      defaultName);

  // Applied with an explicit name: that name replaces the default prefix.
  String customName =
      readPipeline.apply(create).apply("MyRead", TFRecordIO.readFiles()).getName();
  assertEquals(
      "MyRead/Read all via FileBasedSource/Read ranges/ParMultiDo(ReadFileRanges).output",
      customName);
}
 
Example #17
Source File: AvroTableFileAsMutationsTest.java    From DataflowTemplates with Apache License 2.0 6 votes vote down vote up
/**
 * Builds the pipeline fragment that turns one file into {@link FileShard}s: the file is mapped to
 * the table name "testtable" through a map side input and passed through {@link
 * SplitIntoRangesFn} with the given split size.
 *
 * @param fileMetadata metadata of the file to shard
 * @param splitSize split size handed to {@link SplitIntoRangesFn}
 * @return the resulting shards, with the {@link FileShard} coder set
 */
private PCollection<FileShard> runFileShardingPipeline(Metadata fileMetadata, int splitSize) {
  ImmutableMap<String, String> tableNamesByFile =
      ImmutableMap.<String, String>of(fileMetadata.resourceId().toString(), "testtable");
  PCollectionView<Map<String, String>> filenamesToTableNamesMapView =
      p.apply("Create File/Table names Map", Create.of(tableNamesByFile)).apply(View.asMap());

  SplitIntoRangesFn splitFn = new SplitIntoRangesFn(splitSize, filenamesToTableNamesMapView);

  return p.apply("Create Metadata", Create.of(fileMetadata))
      // PCollection<Metadata> -> PCollection<FileIO.ReadableFile>
      .apply(FileIO.readMatches())
      .apply("Split into ranges", ParDo.of(splitFn).withSideInputs(filenamesToTableNamesMapView))
      .setCoder(FileShard.Coder.of());
}
 
Example #18
Source File: HadoopFileSystemTest.java    From beam with Apache License 2.0 5 votes vote down vote up
/**
 * Renames two files and verifies that only the new names are matched afterwards, with metadata
 * reflecting the original sizes, and that the contents are readable under the new names.
 */
@Test
public void testRename() throws Exception {
  byte[] dataA = "testDataA".getBytes(StandardCharsets.UTF_8);
  byte[] dataB = "testDataB".getBytes(StandardCharsets.UTF_8);
  create("testFileA", dataA);
  create("testFileB", dataB);

  // ensure files exist
  assertArrayEquals("testDataA".getBytes(StandardCharsets.UTF_8), read("testFileA", 0));
  assertArrayEquals("testDataB".getBytes(StandardCharsets.UTF_8), read("testFileB", 0));

  fileSystem.rename(
      ImmutableList.of(testPath("testFileA"), testPath("testFileB")),
      ImmutableList.of(testPath("renameFileA"), testPath("renameFileB")));

  List<MatchResult> results = fileSystem.match(ImmutableList.of(testPath("*").toString()));
  assertEquals(Status.OK, Iterables.getOnlyElement(results).status());

  Metadata expectedA =
      Metadata.builder()
          .setResourceId(testPath("renameFileA"))
          .setIsReadSeekEfficient(true)
          .setSizeBytes("testDataA".getBytes(StandardCharsets.UTF_8).length)
          .setLastModifiedMillis(lastModified("renameFileA"))
          .build();
  Metadata expectedB =
      Metadata.builder()
          .setResourceId(testPath("renameFileB"))
          .setIsReadSeekEfficient(true)
          .setSizeBytes("testDataB".getBytes(StandardCharsets.UTF_8).length)
          .setLastModifiedMillis(lastModified("renameFileB"))
          .build();
  assertThat(
      Iterables.getOnlyElement(results).metadata(), containsInAnyOrder(expectedA, expectedB));

  // ensure the renamed files carry the original contents
  assertArrayEquals("testDataA".getBytes(StandardCharsets.UTF_8), read("renameFileA", 0));
  assertArrayEquals("testDataB".getBytes(StandardCharsets.UTF_8), read("renameFileB", 0));
}
 
Example #19
Source File: MetadataCoderTest.java    From beam with Apache License 2.0 5 votes vote down vote up
/**
 * Round-tripping a {@link Metadata} that carries an explicit last-modified time through {@code
 * MetadataCoder} is expected to fail the equality check with an {@link AssertionError}.
 */
@Test(expected = AssertionError.class)
public void testEncodeDecodeWithCustomLastModifiedMills() throws Exception {
  String location = tmpFolder.newFile("somefile").toPath().toString();
  Metadata withTimestamp =
      Metadata.builder()
          .setResourceId(FileSystems.matchNewResource(location, false /* isDirectory */))
          .setIsReadSeekEfficient(true)
          .setSizeBytes(1024)
          .setLastModifiedMillis(1541097000L)
          .build();

  // This should throw because the decoded Metadata has default lastModifiedMills.
  CoderProperties.coderDecodeEncodeEqual(MetadataCoder.of(), withTimestamp);
}
 
Example #20
Source File: MetadataCoderV2Test.java    From beam with Apache License 2.0 5 votes vote down vote up
/**
 * A {@link Metadata} built without an explicit last-modified time survives an encode/decode
 * round trip through {@code MetadataCoderV2} unchanged.
 */
@Test
public void testEncodeDecodeWithDefaultLastModifiedMills() throws Exception {
  String location = tmpFolder.newFile("somefile").toPath().toString();
  Metadata original =
      Metadata.builder()
          .setResourceId(FileSystems.matchNewResource(location, false /* isDirectory */))
          .setIsReadSeekEfficient(true)
          .setSizeBytes(1024)
          .build();

  CoderProperties.coderDecodeEncodeEqual(MetadataCoderV2.of(), original);
}
 
Example #21
Source File: MetadataCoderTest.java    From beam with Apache License 2.0 5 votes vote down vote up
/**
 * A {@link Metadata} built without an explicit last-modified time survives an encode/decode
 * round trip through {@code MetadataCoder} unchanged.
 */
@Test
public void testEncodeDecodeWithDefaultLastModifiedMills() throws Exception {
  Path tempFile = tmpFolder.newFile("somefile").toPath();
  String tempFileSpec = tempFile.toString();

  Metadata metadataWithoutTimestamp =
      Metadata.builder()
          .setResourceId(FileSystems.matchNewResource(tempFileSpec, false /* isDirectory */))
          .setIsReadSeekEfficient(true)
          .setSizeBytes(1024)
          .build();

  CoderProperties.coderDecodeEncodeEqual(MetadataCoder.of(), metadataWithoutTimestamp);
}
 
Example #22
Source File: Transforms.java    From nomulus with Apache License 2.0 5 votes vote down vote up
/**
 * Returns a {@link PTransform} that expands file name patterns into the {@link Metadata Metadata
 * records} of the matching files. Empty matches are disallowed.
 */
public static PTransform<PCollection<String>, PCollection<Metadata>> getFilesByPatterns() {
  return new PTransform<PCollection<String>, PCollection<Metadata>>() {
    @Override
    public PCollection<Metadata> expand(PCollection<String> input) {
      FileIO.MatchAll strictMatchAll =
          FileIO.matchAll().withEmptyMatchTreatment(EmptyMatchTreatment.DISALLOW);
      return input.apply(strictMatchAll);
    }
  };
}
 
Example #23
Source File: FileBasedSource.java    From beam with Apache License 2.0 5 votes vote down vote up
/**
 * Returns the size in bytes of the single file this source reads, which serves as its maximum
 * end offset.
 *
 * @throws IllegalArgumentException if this source is in {@link Mode#FILEPATTERN} mode
 */
@Override
public final long getMaxEndOffset(PipelineOptions options) throws IOException {
  boolean notAPattern = mode != Mode.FILEPATTERN;
  checkArgument(notAPattern, "Cannot determine the exact end offset of a file pattern");
  return getSingleFileMetadata().sizeBytes();
}
 
Example #24
Source File: Transforms.java    From nomulus with Apache License 2.0 5 votes vote down vote up
/**
 * Returns a {@link PTransform} from file {@link Metadata} to {@link VersionedEntity}. Each file is
 * opened and read through {@code LevelDbLogReader}, and every record is converted with the
 * timestamp {@code EXPORT_ENTITY_TIME_STAMP}.
 */
public static PTransform<PCollection<Metadata>, PCollection<VersionedEntity>>
    loadExportDataFromFiles() {
  BackupFileReader exportFileReader =
      new BackupFileReader(
          file ->
              Iterators.transform(
                  LevelDbLogReader.from(file.open()),
                  (byte[] bytes) -> VersionedEntity.from(EXPORT_ENTITY_TIME_STAMP, bytes)));
  return processFiles(exportFileReader);
}
 
Example #25
Source File: FileBasedSourceTest.java    From beam with Apache License 2.0 5 votes vote down vote up
/**
 * Creates a test source from file metadata and an offset range.
 *
 * @param fileOrPattern metadata passed through to the superclass constructor
 * @param minBundleSize passed through to the superclass constructor
 * @param startOffset passed through to the superclass constructor
 * @param endOffset passed through to the superclass constructor
 * @param splitHeader value stored in {@code splitHeader}; may be {@code null}
 */
public TestFileBasedSource(
    Metadata fileOrPattern,
    long minBundleSize,
    long startOffset,
    long endOffset,
    @Nullable String splitHeader) {
  super(fileOrPattern, minBundleSize, startOffset, endOffset);
  this.splitHeader = splitHeader;
}
 
Example #26
Source File: FileBasedSource.java    From beam with Apache License 2.0 5 votes vote down vote up
/**
 * Returns the metadata of the one file this source reads from.
 *
 * @throws IllegalArgumentException if this source is not in {@link
 *     Mode#SINGLE_FILE_OR_SUBRANGE} mode (for example {@link Mode#FILEPATTERN}).
 * @throws IllegalStateException if the source is in single-file mode but holds no metadata.
 */
protected final MatchResult.Metadata getSingleFileMetadata() {
  boolean singleFileMode = mode == Mode.SINGLE_FILE_OR_SUBRANGE;
  checkArgument(
      singleFileMode, "This function should only be called for a single file, not %s", this);

  boolean hasMetadata = singleFileMetadata != null;
  checkState(
      hasMetadata,
      "It should not be possible to construct a %s in mode %s with null metadata: %s",
      FileBasedSource.class,
      mode,
      this);

  return singleFileMetadata;
}
 
Example #27
Source File: CompressedSource.java    From beam with Apache License 2.0 5 votes vote down vote up
/**
 * Creates a {@code CompressedSource} covering {@code [start, end)} of a single file. Called by
 * the superclass to create a source for a single file.
 *
 * <p>The delegate source is narrowed to the same file and range; the channel factory and the
 * delegate's minimum bundle size are carried over.
 */
@Override
protected FileBasedSource<T> createForSubrangeOfFile(Metadata metadata, long start, long end) {
  FileBasedSource<T> delegateForRange =
      sourceDelegate.createForSubrangeOfFile(metadata, start, end);
  long minBundleSize = sourceDelegate.getMinBundleSize();
  return new CompressedSource<>(
      delegateForRange, channelFactory, metadata, minBundleSize, start, end);
}
 
Example #28
Source File: FileBasedSourceTest.java    From beam with Apache License 2.0 5 votes vote down vote up
/** Runs the exhaustive split-at-fraction check against a source that covers a whole file. */
@Test
public void testSplitAtFractionExhaustive() throws Exception {
  PipelineOptions options = PipelineOptionsFactory.create();

  // Smaller file for exhaustive testing.
  List<String> records = createStringDataset(3, 20);
  File smallFile = createFileWithData("file", records);

  Metadata metadata = FileSystems.matchSingleFileSpec(smallFile.getPath());
  long fileSize = smallFile.length();
  TestFileBasedSource wholeFileSource = new TestFileBasedSource(metadata, 1, 0, fileSize, null);

  assertSplitAtFractionExhaustive(wholeFileSource, options);
}
 
Example #29
Source File: FileBasedSourceTest.java    From beam with Apache License 2.0 5 votes vote down vote up
/**
 * Writes a file in which a header line precedes each group of records, reads it through two
 * adjacent sources, [0, 60) and [60, Long.MAX_VALUE), and checks that together they return every
 * non-header record.
 */
@Test
public void testReadRangeFromFileWithSplitsFromStart() throws IOException {
  PipelineOptions options = PipelineOptionsFactory.create();
  String header = "<h>";

  // Ten groups, each introduced by the header line.
  List<String> data = new ArrayList<>();
  int group = 0;
  while (group < 10) {
    data.add(header);
    data.addAll(createStringDataset(3, 9));
    group++;
  }
  File file = createFileWithData("file", data);

  Metadata metadata = FileSystems.matchSingleFileSpec(file.getPath());
  TestFileBasedSource first = new TestFileBasedSource(metadata, 64, 0, 60, header);
  TestFileBasedSource second = new TestFileBasedSource(metadata, 64, 60, Long.MAX_VALUE, header);

  // Remove all occurrences of header from expected results.
  List<String> expectedResults = new ArrayList<>(data);
  expectedResults.removeAll(Arrays.asList(header));

  List<String> results = new ArrayList<>(readFromSource(first, options));
  results.addAll(readFromSource(second, options));

  assertThat(expectedResults, containsInAnyOrder(results.toArray()));
}
 
Example #30
Source File: FileBasedSourceTest.java    From beam with Apache License 2.0 5 votes vote down vote up
/**
 * Writes a file in which a header line precedes each group of records, reads it through three
 * adjacent sources, [0, 42), [42, 112) and [112, Long.MAX_VALUE), and checks that together they
 * return every non-header record.
 */
@Test
public void testReadRangeFromFileWithSplitsFromMiddle() throws IOException {
  PipelineOptions options = PipelineOptionsFactory.create();
  String header = "<h>";

  // Ten groups, each introduced by the header line.
  List<String> data = new ArrayList<>();
  int group = 0;
  while (group < 10) {
    data.add(header);
    data.addAll(createStringDataset(3, 9));
    group++;
  }
  File file = createFileWithData("file", data);

  Metadata metadata = FileSystems.matchSingleFileSpec(file.getPath());
  TestFileBasedSource first = new TestFileBasedSource(metadata, 64, 0, 42, header);
  TestFileBasedSource middle = new TestFileBasedSource(metadata, 64, 42, 112, header);
  TestFileBasedSource last = new TestFileBasedSource(metadata, 64, 112, Long.MAX_VALUE, header);

  // Remove all occurrences of header from expected results.
  List<String> expectedResults = new ArrayList<>(data);
  expectedResults.removeAll(Collections.singletonList(header));

  List<String> results = new ArrayList<>(readFromSource(first, options));
  results.addAll(readFromSource(middle, options));
  results.addAll(readFromSource(last, options));

  assertThat(expectedResults, containsInAnyOrder(results.toArray()));
}