org.apache.beam.sdk.io.fs.MatchResult.Metadata Java Examples

Source File: CompressedSource.java From beam with Apache License 2.0

6 votes

/**
 * Builds a {@code CompressedSource} over one concrete file (or a byte range of it). Invoked from
 * {@link CompressedSource#createForSubrangeOfFile}.
 *
 * <p>A non-zero {@code startOffset} is accepted only when {@code isSplittable()} reports that the
 * source can be split; otherwise the argument check below rejects it.
 *
 * @throws RuntimeException if the splittability check itself throws
 */
private CompressedSource(
    FileBasedSource<T> sourceDelegate,
    DecompressingChannelFactory channelFactory,
    Metadata metadata,
    long minBundleSize,
    long startOffset,
    long endOffset) {
  super(metadata, minBundleSize, startOffset, endOffset);
  this.sourceDelegate = sourceDelegate;
  this.channelFactory = channelFactory;

  final boolean canSplit;
  try {
    canSplit = isSplittable();
  } catch (Exception cause) {
    throw new RuntimeException("Failed to determine if the source is splittable", cause);
  }

  // Only a splittable source may begin somewhere other than the first byte.
  final boolean startOffsetAllowed = canSplit || startOffset == 0;
  checkArgument(
      startOffsetAllowed,
      "CompressedSources must start reading at offset 0. Requested offset: %s",
      startOffset);
}

Source File: SplitIntoRangesFn.java From DataflowTemplates with Apache License 2.0

6 votes

/**
 * Emits the {@link FileShard}s for the incoming file: a single shard spanning the whole file when
 * the file is not seek-efficient, otherwise one shard per sub-range produced by splitting the
 * file's byte range with {@code desiredBundleSize}.
 *
 * @throws FileNotFoundException if the side-input map has no table name for the file
 */
@ProcessElement
public void processElement(ProcessContext c) throws FileNotFoundException {
  Map<String, String> tableByFilename = c.sideInput(filenamesToTableNamesMapView);
  Metadata fileMetadata = c.element().getMetadata();
  String path = fileMetadata.resourceId().toString();
  String table = tableByFilename.get(path);

  if (table == null) {
    throw new FileNotFoundException(
        "Unknown table name for file:" + path + " in map " + tableByFilename);
  }

  OffsetRange wholeFile = new OffsetRange(0, fileMetadata.sizeBytes());

  if (fileMetadata.isReadSeekEfficient()) {
    // Seeking is cheap, so hand out one shard per sub-range.
    for (OffsetRange shardRange : wholeFile.split(desiredBundleSize, 0)) {
      c.output(FileShard.create(table, c.element(), shardRange));
    }
    return;
  }

  // Seeking is not efficient: keep the file as one shard.
  c.output(FileShard.create(table, c.element(), wholeFile));
}

Source File: FileBasedSourceTest.java From beam with Apache License 2.0

6 votes

/**
 * Reads one file through two sources that meet at offset 162 and checks that their combined
 * output holds the same items as the written dataset, in any order.
 */
@Test
public void testReadRangeAtEnd() throws IOException {
  PipelineOptions options = PipelineOptionsFactory.create();
  List<String> data = createStringDataset(3, 50);
  File file = createFileWithData("file", data);
  Metadata fileMetadata = FileSystems.matchSingleFileSpec(file.getPath());

  TestFileBasedSource leading = new TestFileBasedSource(fileMetadata, 64, 0, 162, null);
  TestFileBasedSource trailing =
      new TestFileBasedSource(fileMetadata, 1024, 162, Long.MAX_VALUE, null);

  List<String> combined = new ArrayList<>(readFromSource(leading, options));
  combined.addAll(readFromSource(trailing, options));

  assertThat(data, containsInAnyOrder(combined.toArray()));
}

Source File: FileBasedSourceTest.java From beam with Apache License 2.0

6 votes

/**
 * Runs a fixed series of split-at-fraction attempts against a source covering the whole test file,
 * each of which is expected either to fail or to succeed consistently.
 */
@Test
public void testSplitAtFraction() throws Exception {
  PipelineOptions options = PipelineOptionsFactory.create();
  File input = createFileWithData("file", createStringDataset(3, 100));
  Metadata inputMetadata = FileSystems.matchSingleFileSpec(input.getPath());
  TestFileBasedSource wholeFile =
      new TestFileBasedSource(inputMetadata, 1, 0, input.length(), null);

  // Shouldn't be able to split while unstarted.
  assertSplitAtFractionFails(wholeFile, 0, 0.7, options);

  assertSplitAtFractionSucceedsAndConsistent(wholeFile, 1, 0.7, options);
  assertSplitAtFractionSucceedsAndConsistent(wholeFile, 30, 0.7, options);

  assertSplitAtFractionFails(wholeFile, 0, 0.0, options);
  assertSplitAtFractionFails(wholeFile, 70, 0.3, options);
  assertSplitAtFractionFails(wholeFile, 100, 1.0, options);
  assertSplitAtFractionFails(wholeFile, 100, 0.99, options);

  assertSplitAtFractionSucceedsAndConsistent(wholeFile, 100, 0.995, options);
}

Source File: StreamingDataGenerator.java From DataflowTemplates with Apache License 2.0

6 votes

/**
 * Creates the data generator and loads the schema file at {@code schemaLocation} into the
 * {@code schema} string.
 *
 * <p>The schema bytes are decoded explicitly as UTF-8 instead of with the JVM's default charset,
 * so the loaded text does not depend on the locale settings of the machine running this code.
 *
 * @throws IOException if the schema file cannot be matched, opened or read
 */
@Setup
public void setup() throws IOException {
  dataGenerator = new JsonDataGeneratorImpl();

  Metadata metadata = FileSystems.matchSingleFileSpec(schemaLocation);

  // Copy the schema file into a string which can be used for generation.
  try (ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream()) {
    // Both channels are closed (writer first, then reader) before the buffer is decoded.
    try (ReadableByteChannel readerChannel = FileSystems.open(metadata.resourceId());
        WritableByteChannel writerChannel = Channels.newChannel(byteArrayOutputStream)) {
      ByteStreams.copy(readerChannel, writerChannel);
    }

    // Fully qualified so this block does not rely on an import that may not be present.
    schema = byteArrayOutputStream.toString(java.nio.charset.StandardCharsets.UTF_8.name());
  }
}

Source File: GeoCityLookup.java From gcp-ingestion with Mozilla Public License 2.0

6 votes

/**
 * Returns a singleton object for reading from the GeoCity database.
 *
 * <p>We copy the configured database file to a static temp location so that the MaxMind API can
 * save on heap usage by using memory mapping. The reader is threadsafe and this singleton pattern
 * allows multiple worker threads on the same machine to share a single reader instance.
 *
 * <p>Note that we do not clean up the temp mmdb file, but it's a static path, so running locally
 * will overwrite the existing path every time rather than creating an unbounded number of copies.
 * This also assumes that only one JVM per machine is running this code. In the production case
 * where this is running on Cloud Dataflow, we should always have a clean environment and the temp
 * state will be cleaned up along with the workers once the job finishes. However, behavior is
 * undefined if you run multiple local jobs concurrently.
 *
 * <p>The channel and stream opened on the configured database are closed as soon as the copy to
 * the temp location finishes, whether or not the copy succeeds.
 *
 * @param geoCityDatabase provider of the file spec of the database to copy and open
 * @return the shared reader, created on the first call and reused afterwards
 * @throws IOException if the configured file path is not a valid .mmdb file
 */
private static synchronized DatabaseReader getOrCreateSingletonGeoCityReader(
    ValueProvider<String> geoCityDatabase) throws IOException {
  if (singletonGeoCityReader == null) {
    File mmdb;
    try {
      Metadata metadata = FileSystems.matchSingleFileSpec(geoCityDatabase.get());
      // try-with-resources releases the source channel and stream once the copy is done;
      // previously neither was ever closed.
      try (ReadableByteChannel channel = FileSystems.open(metadata.resourceId());
          InputStream inputStream = Channels.newInputStream(channel)) {
        Path mmdbPath = Paths.get(System.getProperty("java.io.tmpdir"), "GeoCityLookup.mmdb");
        Files.copy(inputStream, mmdbPath, StandardCopyOption.REPLACE_EXISTING);
        mmdb = mmdbPath.toFile();
      }
    } catch (IOException e) {
      throw new IOException("Exception thrown while fetching configured geoCityDatabase", e);
    }
    singletonGeoCityReader = new DatabaseReader.Builder(mmdb).withCache(new CHMCache()).build();
  }
  return singletonGeoCityReader;
}

Source File: BulkCompressorTest.java From DataflowTemplates with Apache License 2.0

6 votes

/**
 * Runs {@link BulkCompressor.Compressor} with GZIP over the test text file and checks that
 * reading the produced files back yields {@code FILE_CONTENT}.
 */
@Test
public void testCompressFile() throws Exception {
  // Arrange: output folder, compression setting and the input file's metadata.
  final ValueProvider<String> outputDir =
      pipeline.newProvider(tempFolderCompressedPath.toString());
  final ValueProvider<Compression> gzip = StaticValueProvider.of(Compression.GZIP);
  final Metadata inputFile = FileSystems.matchSingleFileSpec(textFile.toString());

  // Act: compress the file, then read whatever was written.
  PCollection<String> readBack =
      pipeline
          .apply("Create File Input", Create.of(inputFile))
          .apply("Compress", ParDo.of(new Compressor(outputDir, gzip)))
          .apply("Read the Files", TextIO.readAll().withCompression(Compression.AUTO));

  // Assert
  PAssert.that(readBack).containsInAnyOrder(FILE_CONTENT);
  pipeline.run();
}

Source File: HadoopFileSystemTest.java From beam with Apache License 2.0

6 votes

/**
 * Matches the path of a directory that contains one file and expects a single OK result whose
 * only metadata entry describes the directory itself with size zero.
 */
@Test
public void testMatchDirectory() throws Exception {
  create("dir/file", "data".getBytes(StandardCharsets.UTF_8));

  final MatchResult actual =
      Iterables.getOnlyElement(
          fileSystem.match(Collections.singletonList(testPath("dir").toString())));

  final Metadata directoryMetadata =
      Metadata.builder()
          .setResourceId(testPath("dir"))
          .setIsReadSeekEfficient(true)
          .setSizeBytes(0L)
          .setLastModifiedMillis(lastModified("dir"))
          .build();
  final MatchResult expected = MatchResult.create(Status.OK, ImmutableList.of(directoryMetadata));

  assertThat(actual, equalTo(expected));
}

Source File: FileBasedSourceTest.java From beam with Apache License 2.0

6 votes

/**
 * Reads one file through two sources that meet at offset 25 and checks that their combined output
 * holds the same items as the written dataset, in any order.
 */
@Test
public void testReadRangeAtStart() throws IOException {
  PipelineOptions options = PipelineOptionsFactory.create();
  List<String> data = createStringDataset(3, 50);
  File file = createFileWithData("file", data);
  Metadata fileMetadata = FileSystems.matchSingleFileSpec(file.getPath());

  TestFileBasedSource leading = new TestFileBasedSource(fileMetadata, 64, 0, 25, null);
  TestFileBasedSource trailing =
      new TestFileBasedSource(fileMetadata, 64, 25, Long.MAX_VALUE, null);

  List<String> combined = new ArrayList<>(readFromSource(leading, options));
  combined.addAll(readFromSource(trailing, options));

  assertThat(data, containsInAnyOrder(combined.toArray()));
}

Source File: FilePatternMatchingShardedFile.java From beam with Apache License 2.0

6 votes

/**
 * Reads every given file as UTF-8 text and returns all of their lines in one list, in the
 * iteration order of {@code files}.
 *
 * <p>Intended only for tests over small data: everything is read serially by the calling thread
 * and held in memory.
 *
 * @throws IOException if a file cannot be opened or read
 */
@VisibleForTesting
List<String> readLines(Collection<Metadata> files) throws IOException {
  List<String> collected = Lists.newArrayList();
  int position = 0;
  for (Metadata current : files) {
    position++;
    try (Reader in =
        Channels.newReader(
            FileSystems.open(current.resourceId()), StandardCharsets.UTF_8.name())) {
      List<String> fileLines = CharStreams.readLines(in);
      collected.addAll(fileLines);
      LOG.debug(
          "[{} of {}] Read {} lines from file: {}",
          position,
          files.size(),
          fileLines.size(),
          current);
    }
  }
  return collected;
}

Source File: NumberedShardedFile.java From beam with Apache License 2.0

6 votes

/**
 * Collects the lines of all given files, decoded as UTF-8, into a single list that follows the
 * iteration order of {@code files}.
 *
 * <p>Only appropriate for testing with small inputs, because the files are read one after another
 * on the calling thread and all lines are kept in memory.
 *
 * @throws IOException if opening or reading any file fails
 */
@VisibleForTesting
List<String> readLines(Collection<Metadata> files) throws IOException {
  final List<String> everyLine = Lists.newArrayList();
  int fileNumber = 0;
  for (Metadata shard : files) {
    fileNumber++;
    try (Reader shardReader =
        Channels.newReader(FileSystems.open(shard.resourceId()), StandardCharsets.UTF_8.name())) {
      final List<String> shardLines = CharStreams.readLines(shardReader);
      everyLine.addAll(shardLines);
      LOG.debug(
          "[{} of {}] Read {} lines from file: {}",
          fileNumber,
          files.size(),
          shardLines.size(),
          shard);
    }
  }
  return everyLine;
}

Source File: NumberedShardedFile.java From beam with Apache License 2.0

6 votes

/**
 * Verifies the file count against the total encoded in a shard name. The first file whose name
 * matches {@code shardTemplate} decides the outcome: its {@code numshards} group is parsed and
 * compared with the number of given files. When no template is specified, "SSSS-of-NNNN" is the
 * default and "NNNN" is the expected total.
 *
 * @return {@code true} if some shard name matches the template and the number parsed from it
 *     equals {@code files.size()}; {@code false} if the counts differ or no name matches.
 */
@VisibleForTesting
boolean checkTotalNumOfFiles(Collection<Metadata> files) {
  for (Metadata candidate : files) {
    String shardName = candidate.resourceId().getFilename();
    // A null name means the path has zero elements; such entries are ignored.
    if (shardName != null) {
      Matcher shardMatcher = shardTemplate.matcher(shardName);
      // Names that do not fit the template are skipped in favour of the next shard.
      if (shardMatcher.matches()) {
        int actualCount = files.size();
        int declaredCount = Integer.parseInt(shardMatcher.group("numshards"));
        return actualCount == declaredCount;
      }
    }
  }
  return false;
}

Source File: AvroSourceTest.java From beam with Apache License 2.0

6 votes

/**
 * Checks that an {@code AvroSource} created from file metadata, and the sources derived from it
 * via {@code withSchema} and {@code withMinBundleSize}, all report single-file mode.
 */
@Test
public void testCreateFromMetadata() throws Exception {
  List<Bird> records = createRandomRecords(DEFAULT_RECORD_COUNT);
  String nullCodec = DataFileConstants.NULL_CODEC;
  String path =
      generateTestFile(
          nullCodec, records, SyncBehavior.SYNC_DEFAULT, 0, AvroCoder.of(Bird.class), nullCodec);
  Metadata fileMeta = FileSystems.matchSingleFileSpec(path);

  AvroSource<GenericRecord> genericSource = AvroSource.from(fileMeta);
  AvroSource<Bird> typedSource = genericSource.withSchema(Bird.class);
  AvroSource<Bird> typedSourceWithMinBundle = typedSource.withMinBundleSize(1234);

  FileBasedSource.Mode singleFile = FileBasedSource.Mode.SINGLE_FILE_OR_SUBRANGE;
  assertEquals(singleFile, genericSource.getMode());
  assertEquals(singleFile, typedSource.getMode());
  assertEquals(singleFile, typedSourceWithMinBundle.getMode());
}

Source File: AvroSourceTest.java From beam with Apache License 2.0

6 votes

/**
 * For each listed codec, generates a test file with that codec and checks that the metadata read
 * back from the file reports the same codec.
 */
@Test
public void testReadMetadataWithCodecs() throws Exception {
  // Every codec that the generated files are written with.
  String[] allCodecs = {
    DataFileConstants.NULL_CODEC,
    DataFileConstants.BZIP2_CODEC,
    DataFileConstants.DEFLATE_CODEC,
    DataFileConstants.SNAPPY_CODEC,
    DataFileConstants.XZ_CODEC
  };
  List<Bird> records = createRandomRecords(DEFAULT_RECORD_COUNT);

  for (int idx = 0; idx < allCodecs.length; idx++) {
    String expectedCodec = allCodecs[idx];
    String path =
        generateTestFile(
            expectedCodec,
            records,
            SyncBehavior.SYNC_DEFAULT,
            0,
            AvroCoder.of(Bird.class),
            expectedCodec);

    Metadata matched = FileSystems.matchSingleFileSpec(path);
    AvroMetadata avroMetadata = AvroSource.readMetadataFromFile(matched.resourceId());
    assertEquals(expectedCodec, avroMetadata.getCodec());
  }
}

Source File: FileSystems.java From beam with Apache License 2.0

6 votes

/**
 * Resolves {@code spec} to the {@link Metadata} of the one file resource it identifies.
 *
 * @param spec a resource specification that matches exactly one result.
 * @return the {@link Metadata} of the single matched resource.
 * @throws FileNotFoundException if the match status is {@code NOT_FOUND}.
 * @throws IOException if the match status is anything else other than {@code OK}, if the inner
 *     call to {@link #match} fails, or if the spec does not match exactly 1 result.
 */
public static Metadata matchSingleFileSpec(String spec) throws IOException {
  MatchResult result =
      Iterables.getOnlyElement(FileSystems.match(Collections.singletonList(spec)));
  Status status = result.status();

  if (status == Status.NOT_FOUND) {
    throw new FileNotFoundException(String.format("File spec %s not found", spec));
  }
  if (status != Status.OK) {
    throw new IOException(
        String.format("Error matching file spec %s: status %s", spec, status));
  }

  List<Metadata> matched = result.metadata();
  if (matched.size() == 1) {
    return matched.get(0);
  }
  throw new IOException(
      String.format(
          "Expecting spec %s to match exactly one file, but matched %s: %s",
          spec, matched.size(), matched));
}

Source File: TFRecordIOTest.java From beam with Apache License 2.0

6 votes

/**
 * Checks the output name of {@code TFRecordIO.readFiles()} when applied without an explicit name
 * and when applied under the name "MyRead".
 */
@Test
public void testReadFilesNamed() {
  readPipeline.enableAbandonedNodeEnforcement(false);

  Metadata fileMetadata =
      Metadata.builder()
          .setResourceId(FileSystems.matchNewResource("file", false /* isDirectory */))
          .setIsReadSeekEfficient(true)
          .setSizeBytes(1024)
          .build();
  Create.Values<ReadableFile> input =
      Create.of(new ReadableFile(fileMetadata, Compression.AUTO));

  String defaultName = readPipeline.apply(input).apply(TFRecordIO.readFiles()).getName();
  assertEquals(
      "TFRecordIO.ReadFiles/Read all via FileBasedSource/Read ranges/ParMultiDo(ReadFileRanges).output",
      defaultName);

  String customName =
      readPipeline.apply(input).apply("MyRead", TFRecordIO.readFiles()).getName();
  assertEquals(
      "MyRead/Read all via FileBasedSource/Read ranges/ParMultiDo(ReadFileRanges).output",
      customName);
}

Source File: AvroTableFileAsMutationsTest.java From DataflowTemplates with Apache License 2.0

6 votes

/**
 * Wires up a pipeline that feeds {@code fileMetadata} through {@code FileIO.readMatches()} and
 * {@link SplitIntoRangesFn}, with a side input that maps the file's name to "testtable".
 *
 * @param fileMetadata metadata of the single file to shard
 * @param splitSize value passed to {@link SplitIntoRangesFn} as its first constructor argument
 * @return the {@link FileShard}s output by the split step
 */
private PCollection<FileShard> runFileShardingPipeline(Metadata fileMetadata, int splitSize) {
  ImmutableMap<String, String> tableByFilename =
      ImmutableMap.<String, String>of(fileMetadata.resourceId().toString(), "testtable");

  PCollectionView<Map<String, String>> tableNamesView =
      p.apply("Create File/Table names Map", Create.of(tableByFilename)).apply(View.asMap());

  return p.apply("Create Metadata", Create.of(fileMetadata))
      // PCollection<Metadata> -> PCollection<FileIO.ReadableFile>
      .apply(FileIO.readMatches())
      .apply(
          "Split into ranges",
          ParDo.of(new SplitIntoRangesFn(splitSize, tableNamesView))
              .withSideInputs(tableNamesView))
      .setCoder(FileShard.Coder.of());
}

Source File: HadoopFileSystemTest.java From beam with Apache License 2.0

5 votes

/**
 * Renames two files and checks that a wildcard match then lists exactly the two new names with
 * the expected metadata, and that the original contents can be read under the new names.
 */
@Test
public void testRename() throws Exception {
  final byte[] payloadA = "testDataA".getBytes(StandardCharsets.UTF_8);
  final byte[] payloadB = "testDataB".getBytes(StandardCharsets.UTF_8);
  create("testFileA", payloadA);
  create("testFileB", payloadB);

  // The source files must be readable before the rename.
  assertArrayEquals(payloadA, read("testFileA", 0));
  assertArrayEquals(payloadB, read("testFileB", 0));

  fileSystem.rename(
      ImmutableList.of(testPath("testFileA"), testPath("testFileB")),
      ImmutableList.of(testPath("renameFileA"), testPath("renameFileB")));

  List<MatchResult> matches = fileSystem.match(ImmutableList.of(testPath("*").toString()));
  assertEquals(Status.OK, Iterables.getOnlyElement(matches).status());

  final Metadata expectedA =
      Metadata.builder()
          .setResourceId(testPath("renameFileA"))
          .setIsReadSeekEfficient(true)
          .setSizeBytes(payloadA.length)
          .setLastModifiedMillis(lastModified("renameFileA"))
          .build();
  final Metadata expectedB =
      Metadata.builder()
          .setResourceId(testPath("renameFileB"))
          .setIsReadSeekEfficient(true)
          .setSizeBytes(payloadB.length)
          .setLastModifiedMillis(lastModified("renameFileB"))
          .build();
  assertThat(
      Iterables.getOnlyElement(matches).metadata(), containsInAnyOrder(expectedA, expectedB));

  // The contents must be readable under the new names.
  assertArrayEquals(payloadA, read("renameFileA", 0));
  assertArrayEquals(payloadB, read("renameFileB", 0));
}

Source File: MetadataCoderTest.java From beam with Apache License 2.0

5 votes

/**
 * Round-trips a {@link Metadata} that carries an explicit last-modified value through
 * {@code MetadataCoder} and expects the equality check to fail with an {@link AssertionError}.
 */
@Test(expected = AssertionError.class)
public void testEncodeDecodeWithCustomLastModifiedMills() throws Exception {
  String location = tmpFolder.newFile("somefile").toPath().toString();
  Metadata withLastModified =
      Metadata.builder()
          .setResourceId(FileSystems.matchNewResource(location, false /* isDirectory */))
          .setIsReadSeekEfficient(true)
          .setSizeBytes(1024)
          .setLastModifiedMillis(1541097000L)
          .build();

  // Expected to throw: the decoded Metadata comes back with the default lastModifiedMills.
  CoderProperties.coderDecodeEncodeEqual(MetadataCoder.of(), withLastModified);
}

Source File: MetadataCoderV2Test.java From beam with Apache License 2.0

5 votes

/**
 * Round-trips a {@link Metadata} built without an explicit last-modified value through
 * {@code MetadataCoderV2} and checks that the decoded value equals the original.
 */
@Test
public void testEncodeDecodeWithDefaultLastModifiedMills() throws Exception {
  String location = tmpFolder.newFile("somefile").toPath().toString();
  Metadata withoutLastModified =
      Metadata.builder()
          .setResourceId(FileSystems.matchNewResource(location, false /* isDirectory */))
          .setIsReadSeekEfficient(true)
          .setSizeBytes(1024)
          .build();

  CoderProperties.coderDecodeEncodeEqual(MetadataCoderV2.of(), withoutLastModified);
}

Source File: MetadataCoderTest.java From beam with Apache License 2.0

5 votes

/**
 * Round-trips a {@link Metadata} built without an explicit last-modified value through
 * {@code MetadataCoder} and checks that the decoded value equals the original.
 */
@Test
public void testEncodeDecodeWithDefaultLastModifiedMills() throws Exception {
  String location = tmpFolder.newFile("somefile").toPath().toString();
  Metadata withoutLastModified =
      Metadata.builder()
          .setResourceId(FileSystems.matchNewResource(location, false /* isDirectory */))
          .setIsReadSeekEfficient(true)
          .setSizeBytes(1024)
          .build();

  CoderProperties.coderDecodeEncodeEqual(MetadataCoder.of(), withoutLastModified);
}

Source File: Transforms.java From nomulus with Apache License 2.0

5 votes

/**
 * Returns a {@link PTransform} that turns file name patterns into file {@link Metadata Metadata
 * records} by applying {@code FileIO.matchAll()} with empty matches disallowed.
 */
public static PTransform<PCollection<String>, PCollection<Metadata>> getFilesByPatterns() {
  return new PTransform<PCollection<String>, PCollection<Metadata>>() {
    @Override
    public PCollection<Metadata> expand(PCollection<String> patterns) {
      // EmptyMatchTreatment.DISALLOW is the configured handling for patterns that match nothing.
      PCollection<Metadata> matchedFiles =
          patterns.apply(
              FileIO.matchAll().withEmptyMatchTreatment(EmptyMatchTreatment.DISALLOW));
      return matchedFiles;
    }
  };
}

Source File: FileBasedSource.java From beam with Apache License 2.0

5 votes

/**
 * Returns the size in bytes of the single file this source reads, which serves as its maximum end
 * offset. Rejected by the argument check when the source is in {@code FILEPATTERN} mode.
 */
@Override
public final long getMaxEndOffset(PipelineOptions options) throws IOException {
  final boolean isSingleFile = mode != Mode.FILEPATTERN;
  checkArgument(isSingleFile, "Cannot determine the exact end offset of a file pattern");
  return getSingleFileMetadata().sizeBytes();
}

Source File: Transforms.java From nomulus with Apache License 2.0

5 votes

/**
 * Returns a {@link PTransform} from file {@link Metadata} to {@link VersionedEntity}. Each file is
 * opened and read with {@code LevelDbLogReader}, and every byte array it yields is converted with
 * {@code VersionedEntity.from} using {@code EXPORT_ENTITY_TIME_STAMP}.
 */
public static PTransform<PCollection<Metadata>, PCollection<VersionedEntity>>
    loadExportDataFromFiles() {
  BackupFileReader exportReader =
      new BackupFileReader(
          exportFile ->
              Iterators.transform(
                  LevelDbLogReader.from(exportFile.open()),
                  (byte[] rawEntity) ->
                      VersionedEntity.from(EXPORT_ENTITY_TIME_STAMP, rawEntity)));
  return processFiles(exportReader);
}

Source File: FileBasedSourceTest.java From beam with Apache License 2.0

5 votes

/**
 * Creates a test source over {@code fileOrPattern}. The metadata, minimum bundle size and offset
 * range are passed unchanged to the superclass constructor; {@code splitHeader} is stored on this
 * instance.
 *
 * @param splitHeader value kept in the {@code splitHeader} field; may be {@code null}
 */
public TestFileBasedSource(
    Metadata fileOrPattern,
    long minBundleSize,
    long startOffset,
    long endOffset,
    @Nullable String splitHeader) {
  super(fileOrPattern, minBundleSize, startOffset, endOffset);
  this.splitHeader = splitHeader;
}

Source File: FileBasedSource.java From beam with Apache License 2.0

5 votes

/**
 * Returns the information about the single file that this source is reading from.
 *
 * <p>Two preconditions are checked first: the source must be in {@link
 * Mode#SINGLE_FILE_OR_SUBRANGE} mode, and its stored single-file metadata must be non-null.
 *
 * @return the stored metadata of the single file, never {@code null}
 * @throws IllegalArgumentException if this source is in {@link Mode#FILEPATTERN} mode.
 */
protected final MatchResult.Metadata getSingleFileMetadata() {
  // Calling this on a source that is not in single-file mode is a caller error.
  checkArgument(
      mode == Mode.SINGLE_FILE_OR_SUBRANGE,
      "This function should only be called for a single file, not %s",
      this);
  // Null metadata in single-file mode means the source was constructed inconsistently.
  checkState(
      singleFileMetadata != null,
      "It should not be possible to construct a %s in mode %s with null metadata: %s",
      FileBasedSource.class,
      mode,
      this);
  return singleFileMetadata;
}

Source File: CompressedSource.java From beam with Apache License 2.0

5 votes

/**
 * Creates a {@code CompressedSource} for the range {@code [start, end]} arguments of one file.
 * Called by the superclass to create a source for a single file. The wrapped delegate is asked for
 * its own sub-range source first, and the new instance reuses this source's channel factory and
 * the delegate's minimum bundle size.
 */
@Override
protected FileBasedSource<T> createForSubrangeOfFile(Metadata metadata, long start, long end) {
  FileBasedSource<T> delegateForRange =
      sourceDelegate.createForSubrangeOfFile(metadata, start, end);
  long delegateMinBundleSize = sourceDelegate.getMinBundleSize();
  return new CompressedSource<>(
      delegateForRange, channelFactory, metadata, delegateMinBundleSize, start, end);
}

Source File: FileBasedSourceTest.java From beam with Apache License 2.0

5 votes

/** Runs the exhaustive split-at-fraction check against a source covering a small test file. */
@Test
public void testSplitAtFractionExhaustive() throws Exception {
  PipelineOptions options = PipelineOptionsFactory.create();

  // Smaller file for exhaustive testing.
  File smallFile = createFileWithData("file", createStringDataset(3, 20));
  Metadata smallFileMetadata = FileSystems.matchSingleFileSpec(smallFile.getPath());

  TestFileBasedSource wholeFile =
      new TestFileBasedSource(smallFileMetadata, 1, 0, smallFile.length(), null);
  assertSplitAtFractionExhaustive(wholeFile, options);
}

Source File: FileBasedSourceTest.java From beam with Apache License 2.0

5 votes

/**
 * Writes a file in which a header line precedes each of ten data groups, reads it through two
 * sources that meet at offset 60, and checks that the combined output equals the data with every
 * header removed, in any order.
 */
@Test
public void testReadRangeFromFileWithSplitsFromStart() throws IOException {
  PipelineOptions options = PipelineOptionsFactory.create();
  String header = "<h>";

  List<String> data = new ArrayList<>();
  for (int group = 0; group < 10; group++) {
    data.add(header);
    data.addAll(createStringDataset(3, 9));
  }
  File file = createFileWithData("file", data);
  Metadata fileMetadata = FileSystems.matchSingleFileSpec(file.getPath());

  TestFileBasedSource leading = new TestFileBasedSource(fileMetadata, 64, 0, 60, header);
  TestFileBasedSource trailing =
      new TestFileBasedSource(fileMetadata, 64, 60, Long.MAX_VALUE, header);

  // Expected output is the written data minus every occurrence of the header.
  List<String> expectedResults = new ArrayList<>(data);
  expectedResults.removeAll(Arrays.asList(header));

  List<String> combined = new ArrayList<>(readFromSource(leading, options));
  combined.addAll(readFromSource(trailing, options));

  assertThat(expectedResults, containsInAnyOrder(combined.toArray()));
}

Source File: FileBasedSourceTest.java From beam with Apache License 2.0

5 votes

/**
 * Writes a file in which a header line precedes each of ten data groups, reads it through three
 * sources that meet at offsets 42 and 112, and checks that the combined output equals the data
 * with every header removed, in any order.
 */
@Test
public void testReadRangeFromFileWithSplitsFromMiddle() throws IOException {
  PipelineOptions options = PipelineOptionsFactory.create();
  String header = "<h>";

  List<String> data = new ArrayList<>();
  for (int group = 0; group < 10; group++) {
    data.add(header);
    data.addAll(createStringDataset(3, 9));
  }
  File file = createFileWithData("file", data);
  Metadata fileMetadata = FileSystems.matchSingleFileSpec(file.getPath());

  TestFileBasedSource first = new TestFileBasedSource(fileMetadata, 64, 0, 42, header);
  TestFileBasedSource middle = new TestFileBasedSource(fileMetadata, 64, 42, 112, header);
  TestFileBasedSource last =
      new TestFileBasedSource(fileMetadata, 64, 112, Long.MAX_VALUE, header);

  // Expected output is the written data minus every occurrence of the header.
  List<String> expectedResults = new ArrayList<>(data);
  expectedResults.removeAll(Collections.singletonList(header));

  List<String> combined = new ArrayList<>(readFromSource(first, options));
  combined.addAll(readFromSource(middle, options));
  combined.addAll(readFromSource(last, options));

  assertThat(expectedResults, containsInAnyOrder(combined.toArray()));
}

org.apache.beam.sdk.io.fs.MatchResult.Metadata Java Examples