org.apache.beam.sdk.io.FileSystems Java Examples

The following examples show how to use org.apache.beam.sdk.io.FileSystems. Each snippet comes from an open-source project; the source file, project, and license are noted above each example.
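Before the individual examples, here is a minimal end-to-end sketch that registers the default pipeline options, writes a small file, matches it by glob, and deletes it. The class name and the local path /tmp/filesystems-example.txt are hypothetical and chosen only for illustration; the FileSystems calls themselves are the same ones that appear in the examples below.

import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.channels.WritableByteChannel;
import java.nio.charset.StandardCharsets;
import java.util.List;
import java.util.stream.Collectors;
import org.apache.beam.sdk.io.FileSystems;
import org.apache.beam.sdk.io.fs.MoveOptions;
import org.apache.beam.sdk.io.fs.ResourceId;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.util.MimeTypes;

public class FileSystemsQuickstart {

  public static void main(String[] args) throws IOException {
    // Register the available file systems (local, GCS, HDFS, ...) with default options.
    FileSystems.setDefaultPipelineOptions(PipelineOptionsFactory.create());

    // Create a new resource (isDirectory = false) and write a small text file to it.
    ResourceId file =
        FileSystems.matchNewResource("/tmp/filesystems-example.txt", false /* isDirectory */);
    try (WritableByteChannel channel = FileSystems.create(file, MimeTypes.TEXT)) {
      channel.write(ByteBuffer.wrap("hello".getBytes(StandardCharsets.UTF_8)));
    }

    // Match existing files by glob and collect their ResourceIds.
    List<ResourceId> matched =
        FileSystems.match("/tmp/filesystems-example*").metadata().stream()
            .map(metadata -> metadata.resourceId())
            .collect(Collectors.toList());

    // Delete the matched files, ignoring any that have already disappeared.
    FileSystems.delete(matched, MoveOptions.StandardMoveOptions.IGNORE_MISSING_FILES);
  }
}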
Example #1
Source File: NexmarkUtils.java    From beam with Apache License 2.0
/** Frees any resources used to make the side input available. */
public static void cleanUpSideInput(NexmarkConfiguration config) throws IOException {
  switch (config.sideInputType) {
    case DIRECT:
      break;
    case CSV:
      FileSystems.delete(
          FileSystems.match(config.sideInputUrl + "*").metadata().stream()
              .map(metadata -> metadata.resourceId())
              .collect(Collectors.toList()));
      break;
    default:
      throw new IllegalArgumentException(
          String.format(
              "Unknown type of %s clean up requested", SideInputType.class.getSimpleName()));
  }
}
 
Example #2
Source File: AvroByteReader.java    From beam with Apache License 2.0
@Override
public AvroByteFileIterator iterator() throws IOException {
  BoundedSource.BoundedReader<ByteBuffer> reader;
  if (startPosition == 0 && endPosition == Long.MAX_VALUE) {
    // Read entire file (or collection of files).
    reader = avroSource.createReader(options);
  } else {
    // Read a subrange of file.
    reader =
        avroSource
            .createForSubrangeOfFile(
                FileSystems.matchSingleFileSpec(filename), startPosition, endPosition)
            .createReader(options);
  }
  return new AvroByteFileIterator((AvroReader<ByteBuffer>) reader);
}
 
Example #3
Source File: PackageUtil.java    From beam with Apache License 2.0
public static PackageAttributes forBytesToStage(
    byte[] bytes, String targetName, String stagingPath) {
  HashCode hashCode = Hashing.sha256().newHasher().putBytes(bytes).hash();
  long size = bytes.length;

  String target = Environments.createStagingFileName(new File(targetName), hashCode);

  String resourcePath =
      FileSystems.matchNewResource(stagingPath, true)
          .resolve(target, StandardResolveOptions.RESOLVE_FILE)
          .toString();
  DataflowPackage targetPackage = new DataflowPackage();
  targetPackage.setName(target);
  targetPackage.setLocation(resourcePath);

  return new AutoValue_PackageUtil_PackageAttributes(
      null, bytes, targetPackage, size, hashCode.toString());
}
 
Example #4
Source File: FileUtils.java    From deployment-examples with MIT License
public static String copyFile(ResourceId sourceFile, ResourceId destinationFile)
    throws IOException {

  try (WritableByteChannel writeChannel = FileSystems.create(destinationFile, "text/plain")) {
    try (ReadableByteChannel readChannel = FileSystems.open(sourceFile)) {

      final ByteBuffer buffer = ByteBuffer.allocateDirect(16 * 1024);
      while (readChannel.read(buffer) != -1) {
        buffer.flip();
        writeChannel.write(buffer);
        buffer.compact();
      }
      buffer.flip();
      while (buffer.hasRemaining()) {
        writeChannel.write(buffer);
      }
    }
  }

  return destinationFile.toString();
}
 
Example #5
Source File: IsmReaderFactoryTest.java    From beam with Apache License 2.0
@Test
public void testFactory() throws Exception {
  WindowedValueCoder<?> coder =
      WindowedValue.getFullCoder(
          IsmRecordCoder.of(
              1, 0, ImmutableList.<Coder<?>>of(StringUtf8Coder.of()), VarLongCoder.of()),
          GlobalWindow.Coder.INSTANCE);

  String tmpFile = tmpFolder.newFile().getPath();
  ResourceId tmpResourceId = FileSystems.matchSingleFileSpec(tmpFile).resourceId();
  @SuppressWarnings("rawtypes")
  IsmReader<?> ismReader =
      (IsmReader)
          new IsmReaderFactory()
              .create(
                  createSpecForFilename(tmpFile),
                  coder,
                  options,
                  executionContext,
                  operationContext);
  assertEquals(coder.getValueCoder(), ismReader.getCoder());
  assertEquals(tmpResourceId, ismReader.getResourceId());
}
 
Example #6
Source File: CsvToAvro.java    From java-docs-samples with Apache License 2.0
public static String getSchema(String schemaPath) throws IOException {
  ReadableByteChannel chan = FileSystems.open(FileSystems.matchNewResource(
      schemaPath, false));

  try (InputStream stream = Channels.newInputStream(chan)) {
    BufferedReader streamReader = new BufferedReader(new InputStreamReader(stream, "UTF-8"));
    StringBuilder dataBuilder = new StringBuilder();

    String line;
    while ((line = streamReader.readLine()) != null) {
      dataBuilder.append(line);
    }

    return dataBuilder.toString();
  }
}
 
Example #7
Source File: BeamJpaModuleTest.java    From nomulus with Apache License 2.0
/**
 * Integration test with a GCP project, only run when the 'test.gcp_integration.env' property is
 * defined. Otherwise this test is ignored. This is meant to be run from a developer's desktop,
 * with auth already set up by gcloud.
 *
 * <p>Example: {@code gradlew test -P test.gcp_integration.env=alpha}.
 *
 * <p>See <a href="../../../../../../../../java_common.gradle">java_common.gradle</a> for more
 * information.
 */
@Test
public void getJpaTransactionManager_cloudSql_authRequired() {
  String environmentName = System.getProperty("test.gcp_integration.env");
  assumeThat(environmentName, notNullValue());

  FileSystems.setDefaultPipelineOptions(PipelineOptionsFactory.create());
  JpaTransactionManager jpa =
      DaggerBeamJpaModule_JpaTransactionManagerComponent.builder()
          .beamJpaModule(
              new BeamJpaModule(
                  BackupPaths.getCloudSQLCredentialFilePatterns(environmentName).get(0)))
          .build()
          .cloudSqlJpaTransactionManager();
  assertThat(
          jpa.transact(
              () -> jpa.getEntityManager().createNativeQuery("select 1").getSingleResult()))
      .isEqualTo(1);
}
 
Example #8
Source File: ArtifactRetrievalService.java    From beam with Apache License 2.0
public static InputStream getArtifact(RunnerApi.ArtifactInformation artifact) throws IOException {
  switch (artifact.getTypeUrn()) {
    case FILE_ARTIFACT_URN:
      RunnerApi.ArtifactFilePayload payload =
          RunnerApi.ArtifactFilePayload.parseFrom(artifact.getTypePayload());
      return Channels.newInputStream(
          FileSystems.open(
              FileSystems.matchNewResource(payload.getPath(), false /* is directory */)));
    case EMBEDDED_ARTIFACT_URN:
      return RunnerApi.EmbeddedFilePayload.parseFrom(artifact.getTypePayload())
          .getData()
          .newInput();
    default:
      throw new UnsupportedOperationException(
          "Unexpected artifact type: " + artifact.getTypeUrn());
  }
}
 
Example #9
Source File: GeoCityLookup.java    From gcp-ingestion with Mozilla Public License 2.0
/**
 * Returns a singleton object for reading from the GeoCity database.
 *
 * <p>We copy the configured database file to a static temp location so that the MaxMind API can
 * save on heap usage by using memory mapping. The reader is thread-safe, and this singleton pattern
 * allows multiple worker threads on the same machine to share a single reader instance.
 *
 * <p>Note that we do not clean up the temp mmdb file, but it's a static path, so running locally
 * will overwrite the existing path every time rather than creating an unbounded number of copies.
 * This also assumes that only one JVM per machine is running this code. In the production case
 * where this is running on Cloud Dataflow, we should always have a clean environment and the temp
 * state will be cleaned up along with the workers once the job finishes. However, behavior is
 * undefined if you run multiple local jobs concurrently.
 *
 * @throws IOException if the configured file path is not a valid .mmdb file
 */
private static synchronized DatabaseReader getOrCreateSingletonGeoCityReader(
    ValueProvider<String> geoCityDatabase) throws IOException {
  if (singletonGeoCityReader == null) {
    File mmdb;
    try {
      InputStream inputStream;
      Metadata metadata = FileSystems.matchSingleFileSpec(geoCityDatabase.get());
      ReadableByteChannel channel = FileSystems.open(metadata.resourceId());
      inputStream = Channels.newInputStream(channel);
      Path mmdbPath = Paths.get(System.getProperty("java.io.tmpdir"), "GeoCityLookup.mmdb");
      Files.copy(inputStream, mmdbPath, StandardCopyOption.REPLACE_EXISTING);
      mmdb = mmdbPath.toFile();
    } catch (IOException e) {
      throw new IOException("Exception thrown while fetching configured geoCityDatabase", e);
    }
    singletonGeoCityReader = new DatabaseReader.Builder(mmdb).withCache(new CHMCache()).build();
  }
  return singletonGeoCityReader;
}
 
Example #10
Source File: BigQueryIO.java    From beam with Apache License 2.0
@Override
void cleanup(PassThroughThenCleanup.ContextContainer c) throws Exception {
  PipelineOptions options = c.getPipelineOptions();
  BigQueryOptions bqOptions = options.as(BigQueryOptions.class);
  String jobUuid = c.getJobId();
  final String extractDestinationDir =
      resolveTempLocation(bqOptions.getTempLocation(), "BigQueryExtractTemp", jobUuid);
  final String executingProject = bqOptions.getProject();
  JobReference jobRef =
      new JobReference()
          .setProjectId(executingProject)
          .setJobId(getExtractJobId(createJobIdToken(bqOptions.getJobName(), jobUuid)));

  Job extractJob = getBigQueryServices().getJobService(bqOptions).getJob(jobRef);

  if (extractJob != null) {
    List<ResourceId> extractFiles =
        getExtractFilePaths(extractDestinationDir, extractJob);
    if (extractFiles != null && !extractFiles.isEmpty()) {
      FileSystems.delete(
          extractFiles, MoveOptions.StandardMoveOptions.IGNORE_MISSING_FILES);
    }
  }
}
 
Example #11
Source File: DataflowPipelineTranslatorTest.java    From beam with Apache License 2.0
private static DataflowPipelineOptions buildPipelineOptions() throws IOException {
  GcsUtil mockGcsUtil = mock(GcsUtil.class);
  when(mockGcsUtil.expand(any(GcsPath.class)))
      .then(invocation -> ImmutableList.of((GcsPath) invocation.getArguments()[0]));
  when(mockGcsUtil.bucketAccessible(any(GcsPath.class))).thenReturn(true);

  DataflowPipelineOptions options = PipelineOptionsFactory.as(DataflowPipelineOptions.class);
  options.setRunner(DataflowRunner.class);
  options.setGcpCredential(new TestCredential());
  options.setJobName("some-job-name");
  options.setProject("some-project");
  options.setRegion("some-region");
  options.setTempLocation(GcsPath.fromComponents("somebucket", "some/path").toString());
  options.setFilesToStage(new ArrayList<>());
  options.setDataflowClient(buildMockDataflow(new IsValidCreateRequest()));
  options.setGcsUtil(mockGcsUtil);

  // Enable the FileSystems API to know about gs:// URIs in this test.
  FileSystems.setDefaultPipelineOptions(options);

  return options;
}
 
Example #12
Source File: FilePatternMatchingShardedFile.java    From beam with Apache License 2.0
/**
 * Reads all the lines of all the files.
 *
 * <p>Not suitable for use except in testing of small data, since the data size may be far more
 * than can be reasonably processed serially, in-memory, by a single thread.
 */
@VisibleForTesting
List<String> readLines(Collection<Metadata> files) throws IOException {
  List<String> allLines = Lists.newArrayList();
  int i = 1;
  for (Metadata file : files) {
    try (Reader reader =
        Channels.newReader(FileSystems.open(file.resourceId()), StandardCharsets.UTF_8.name())) {
      List<String> lines = CharStreams.readLines(reader);
      allLines.addAll(lines);
      LOG.debug("[{} of {}] Read {} lines from file: {}", i, files.size(), lines.size(), file);
    }
    i++;
  }
  return allLines;
}
 
Example #13
Source File: WindowedFilenamePolicy.java    From DataflowTemplates with Apache License 2.0
/**
 * Resolves any date variables which exist in the output directory path. This allows the output
 * location to change dynamically based on the window end time.
 *
 * @return The new output directory with all variables resolved.
 */
private ResourceId resolveWithDateTemplates(
        ValueProvider<String> outputDirectoryStr, BoundedWindow window) {
    ResourceId outputDirectory = FileSystems.matchNewResource(outputDirectoryStr.get(), true);
    if (window instanceof IntervalWindow) {
        IntervalWindow intervalWindow = (IntervalWindow) window;
        DateTime time = intervalWindow.end().toDateTime();
        String outputPath = outputDirectory.toString();
        outputPath = outputPath.replace("YYYY", YEAR.print(time));
        outputPath = outputPath.replace("MM", MONTH.print(time));
        outputPath = outputPath.replace("DD", DAY.print(time));
        outputPath = outputPath.replace("HH", HOUR.print(time));
        outputPath = outputPath.replace("mm", MINUTE.print(time));
        outputDirectory = FileSystems.matchNewResource(outputPath, true);
    }
    return outputDirectory;
}
 
Example #14
Source File: GCSUploadMain.java    From beam with Apache License 2.0
public static void main(String[] args) {
  DataflowPipelineOptions options =
      PipelineOptionsFactory.fromArgs(args).as(DataflowPipelineOptions.class);
  FileSystems.setDefaultPipelineOptions(options);
  GcsStager stager = GcsStager.fromOptions(options);
  stager.stageFiles(
      options.getFilesToStage().stream()
          .map(
              (String source) -> {
                try {
                  File file = new File(source);
                  HashCode hashCode = Files.asByteSource(file).hash(Hashing.sha256());
                  return PackageUtil.StagedFile.of(
                      source,
                      hashCode.toString(),
                      Environments.createStagingFileName(file, hashCode));
                } catch (IOException e) {
                  throw new UncheckedIOException(e);
                }
              })
          .collect(Collectors.toList()));
}
 
Example #15
Source File: TextToBigQueryStreaming.java    From DataflowTemplates with Apache License 2.0
/**
 * Method to read a BigQuery schema file from GCS and return the file contents as a string.
 *
 * @param gcsPath Path string for the schema file in GCS.
 * @return File contents as a string.
 */
private static ValueProvider<String> getSchemaFromGCS(ValueProvider<String> gcsPath) {
  return NestedValueProvider.of(
      gcsPath,
      new SimpleFunction<String, String>() {
        @Override
        public String apply(String input) {
          ResourceId sourceResourceId = FileSystems.matchNewResource(input, false);

          String schema;
          try (ReadableByteChannel rbc = FileSystems.open(sourceResourceId)) {
            try (ByteArrayOutputStream baos = new ByteArrayOutputStream()) {
              try (WritableByteChannel wbc = Channels.newChannel(baos)) {
                ByteStreams.copy(rbc, wbc);
                schema = baos.toString(Charsets.UTF_8.name());
                LOG.info("Extracted schema: " + schema);
              }
            }
          } catch (IOException e) {
            LOG.error("Error extracting schema: " + e.getMessage());
            throw new RuntimeException(e);
          }
          return schema;
        }
      });
}
 
Example #16
Source File: TestUtils.java    From DataflowTemplates with Apache License 2.0
/**
 * Helper to generate files for testing.
 *
 * @param filePath The path to the file to write.
 * @param lines The lines to write.
 * @param compression The compression type of the file.
 * @return The file written.
 * @throws IOException If an error occurs while creating or writing the file.
 */
public static ResourceId writeToFile(
    String filePath, List<String> lines, Compression compression) throws IOException {

  String fileContents = String.join(System.lineSeparator(), lines);

  ResourceId resourceId = FileSystems.matchNewResource(filePath, false);

  String mimeType =
      compression == Compression.UNCOMPRESSED ? MimeTypes.TEXT : MimeTypes.BINARY;

  // Write the file contents to the channel and close.
  try (ReadableByteChannel readChannel =
      Channels.newChannel(new ByteArrayInputStream(fileContents.getBytes()))) {
    try (WritableByteChannel writeChannel =
        compression.writeCompressed(FileSystems.create(resourceId, mimeType))) {
      ByteStreams.copy(readChannel, writeChannel);
    }
  }

  return resourceId;
}
 
Example #17
Source File: IsmReaderTest.java    From beam with Apache License 2.0
@Test
public void testReadMissingKeys() throws Exception {
  File tmpFile = tmpFolder.newFile();
  List<IsmRecord<byte[]>> data = new ArrayList<>();
  data.add(IsmRecord.<byte[]>of(ImmutableList.of(EMPTY, new byte[] {0x04}), EMPTY));
  data.add(IsmRecord.<byte[]>of(ImmutableList.of(EMPTY, new byte[] {0x08}), EMPTY));
  writeElementsToFile(data, tmpFile);

  IsmReader<byte[]> reader =
      new IsmReaderImpl<byte[]>(
          FileSystems.matchSingleFileSpec(tmpFile.getAbsolutePath()).resourceId(), CODER, cache);

  // Check that we got false with a key before all keys contained in the file.
  assertFalse(reader.overKeyComponents(ImmutableList.of(EMPTY, new byte[] {0x02})).start());
  // Check that we got false with a key between two other keys contained in the file.
  assertFalse(reader.overKeyComponents(ImmutableList.of(EMPTY, new byte[] {0x06})).start());
  // Check that we got false with a key that is after all keys contained in the file.
  assertFalse(reader.overKeyComponents(ImmutableList.of(EMPTY, new byte[] {0x10})).start());
}
 
Example #18
Source File: IsmSinkTest.java    From beam with Apache License 2.0
@Test
public void testWriteNonContiguousShardsIsError() throws Throwable {
  IsmSink<byte[]> sink =
      new IsmSink<>(
          FileSystems.matchNewResource(tmpFolder.newFile().getPath(), false),
          CODER,
          BLOOM_FILTER_SIZE_LIMIT);
  SinkWriter<WindowedValue<IsmRecord<byte[]>>> sinkWriter = sink.writer();
  sinkWriter.add(
      new ValueInEmptyWindows<>(IsmRecord.of(ImmutableList.of(new byte[] {0x00}, EMPTY), EMPTY)));
  sinkWriter.add(
      new ValueInEmptyWindows<>(IsmRecord.of(ImmutableList.of(new byte[] {0x01}, EMPTY), EMPTY)));

  expectedException.expect(IllegalStateException.class);
  expectedException.expectMessage("for shard which already exists");
  sinkWriter.add(
      new ValueInEmptyWindows<>(IsmRecord.of(ImmutableList.of(new byte[] {0x00}, EMPTY), EMPTY)));
}
 
Example #19
Source File: BulkCompressorTest.java    From DataflowTemplates with Apache License 2.0
/** Tests that the {@link BulkCompressor.Compressor} performs compression properly. */
@Test
public void testCompressFile() throws Exception {
  // Setup test
  final Compression compression = Compression.GZIP;

  final ValueProvider<String> outputDirectoryProvider =
      pipeline.newProvider(tempFolderCompressedPath.toString());

  final ValueProvider<Compression> compressionProvider = StaticValueProvider.of(compression);

  final Metadata metadata = FileSystems.matchSingleFileSpec(textFile.toString());

  // Execute the compressor
  PCollection<String> lines = pipeline
      .apply("Create File Input", Create.of(metadata))
      .apply("Compress", ParDo.of(new Compressor(outputDirectoryProvider, compressionProvider)))
      .apply("Read the Files", TextIO.readAll().withCompression(Compression.AUTO));

  // Test the result
  PAssert.that(lines).containsInAnyOrder(FILE_CONTENT);
  pipeline.run();
}
 
Example #20
Source File: CsvToAvro.java    From java-docs-samples with Apache License 2.0
public static void runCsvToAvro(SampleOptions options)
    throws IOException, IllegalArgumentException {
  FileSystems.setDefaultPipelineOptions(options);

  // Get Avro Schema
  String schemaJson = getSchema(options.getAvroSchema());
  Schema schema = new Schema.Parser().parse(schemaJson);

  // Check schema field types before starting the Dataflow job
  checkFieldTypes(schema);

  // Create the Pipeline object with the options we defined above.
  Pipeline pipeline = Pipeline.create(options);

  // Convert CSV to Avro
  pipeline.apply("Read CSV files", TextIO.read().from(options.getInputFile()))
      .apply("Convert CSV to Avro formatted data",
          ParDo.of(new ConvertCsvToAvro(schemaJson, options.getCsvDelimiter())))
      .setCoder(AvroCoder.of(GenericRecord.class, schema))
      .apply("Write Avro formatted data", AvroIO.writeGenericRecords(schemaJson)
          .to(options.getOutput()).withCodec(CodecFactory.snappyCodec()).withSuffix(".avro"));

  // Run the pipeline.
  pipeline.run().waitUntilFinish();
}
 
Example #21
Source File: WindowedWordCountIT.java    From beam with Apache License 2.0
private WindowedWordCountITOptions defaultOptions() throws Exception {
  WindowedWordCountITOptions options =
      TestPipeline.testingPipelineOptions().as(WindowedWordCountITOptions.class);
  options.setInputFile(DEFAULT_INPUT);
  options.setTestTimeoutSeconds(1200L);

  options.setMinTimestampMillis(0L);
  options.setMaxTimestampMillis(Duration.standardHours(1).getMillis());
  options.setWindowSize(10);

  options.setOutput(
      FileSystems.matchNewResource(options.getTempRoot(), true)
          .resolve(
              String.format(
                  "WindowedWordCountIT.%s-%tFT%<tH:%<tM:%<tS.%<tL+%s",
                  testName.getMethodName(), new Date(), ThreadLocalRandom.current().nextInt()),
              StandardResolveOptions.RESOLVE_DIRECTORY)
          .resolve("output", StandardResolveOptions.RESOLVE_DIRECTORY)
          .resolve("results", StandardResolveOptions.RESOLVE_FILE)
          .toString());
  return options;
}
 
Example #22
Source File: TfIdfIT.java    From beam with Apache License 2.0
@Test
public void testE2ETfIdf() throws Exception {
  TfIdfITOptions options = TestPipeline.testingPipelineOptions().as(TfIdfITOptions.class);
  options.setInput(DEFAULT_INPUT);
  options.setOutput(
      FileSystems.matchNewResource(options.getTempRoot(), true)
          .resolve(
              String.format("TfIdfIT-%tF-%<tH-%<tM-%<tS-%<tL", new Date()),
              StandardResolveOptions.RESOLVE_DIRECTORY)
          .resolve("output", StandardResolveOptions.RESOLVE_DIRECTORY)
          .resolve("results", StandardResolveOptions.RESOLVE_FILE)
          .toString());
  TfIdf.runTfIdf(options);

  assertThat(
      new NumberedShardedFile(options.getOutput() + "*-of-*.csv", DEFAULT_SHARD_TEMPLATE),
      fileContentsHaveChecksum(EXPECTED_OUTPUT_CHECKSUM));
}
 
Example #23
Source File: IsmSinkTest.java    From beam with Apache License 2.0
@Test
public void testWriteOutOfOrderKeysWithSameShardKeyIsError() throws Throwable {
  IsmSink<byte[]> sink =
      new IsmSink<>(
          FileSystems.matchNewResource(tmpFolder.newFile().getPath(), false),
          CODER,
          BLOOM_FILTER_SIZE_LIMIT);
  SinkWriter<WindowedValue<IsmRecord<byte[]>>> sinkWriter = sink.writer();
  sinkWriter.add(
      new ValueInEmptyWindows<>(IsmRecord.of(ImmutableList.of(EMPTY, new byte[] {0x01}), EMPTY)));

  expectedException.expect(IllegalArgumentException.class);
  expectedException.expectMessage("expects keys to be written in strictly increasing order");
  sinkWriter.add(
      new ValueInEmptyWindows<>(IsmRecord.of(ImmutableList.of(EMPTY, new byte[] {0x00}), EMPTY)));
}
 
Example #24
Source File: IsmSinkTest.java    From beam with Apache License 2.0
@Test
public void testWriteKeyWhichIsProperPrefixOfPreviousSecondaryKeyIsError() throws Throwable {
  IsmSink<byte[]> sink =
      new IsmSink<>(
          FileSystems.matchNewResource(tmpFolder.newFile().getPath(), false),
          CODER,
          BLOOM_FILTER_SIZE_LIMIT);
  SinkWriter<WindowedValue<IsmRecord<byte[]>>> sinkWriter = sink.writer();
  sinkWriter.add(
      new ValueInEmptyWindows<>(
          IsmRecord.of(ImmutableList.of(EMPTY, new byte[] {0x00, 0x00}), EMPTY)));

  expectedException.expect(IllegalArgumentException.class);
  expectedException.expectMessage("expects keys to be written in strictly increasing order");
  sinkWriter.add(
      new ValueInEmptyWindows<>(IsmRecord.of(ImmutableList.of(EMPTY, new byte[] {0x00}), EMPTY)));
}
 
Example #25
Source File: IsmSinkTest.java    From beam with Apache License 2.0
@Test
public void testWriteEmptyKeyWithValueLargerThanBlockSize() throws Throwable {
  IsmSink<byte[]> sink =
      new IsmSink<>(
          FileSystems.matchNewResource(tmpFolder.newFile().getPath(), false),
          IsmRecordCoder.of(
              1, // We hash using only the window
              0, // There are no metadata records
              // We specifically use a coder that encodes to 0 bytes.
              ImmutableList.<Coder<?>>of(VoidCoder.of()),
              ByteArrayCoder.of()),
          BLOOM_FILTER_SIZE_LIMIT);
  SinkWriter<WindowedValue<IsmRecord<byte[]>>> sinkWriter = sink.writer();
  sinkWriter.add(
      new ValueInEmptyWindows<>(
          IsmRecord.of(
              Arrays.asList(new Object[] {null}), new byte[IsmSink.BLOCK_SIZE_BYTES * 2])));
  sinkWriter.close();
}
 
Example #26
Source File: TikaIO.java    From beam with Apache License 2.0
@Setup
public void setup() throws Exception {
  if (spec.getTikaConfigPath() != null) {
    ResourceId configResource =
        FileSystems.matchSingleFileSpec(spec.getTikaConfigPath().get()).resourceId();
    tikaConfig = new TikaConfig(Channels.newInputStream(FileSystems.open(configResource)));
  }
}
 
Example #27
Source File: AvroByteReaderTest.java    From beam with Apache License 2.0
/** Write input elements to a file and return information about the Avro-encoded file. */
private <T> AvroFileInfo<T> initInputFile(List<List<T>> elemsList, Coder<T> coder)
    throws Exception {
  File tmpFile = tmpFolder.newFile("file.avro");
  AvroFileInfo<T> fileInfo = new AvroFileInfo<>();
  fileInfo.filename = tmpFile.getPath();

  // Write the data.
  OutputStream outStream =
      Channels.newOutputStream(
          FileSystems.create(
              FileSystems.matchNewResource(fileInfo.filename, false), MimeTypes.BINARY));
  Schema schema = Schema.create(Schema.Type.BYTES);
  DatumWriter<ByteBuffer> datumWriter = new GenericDatumWriter<>(schema);
  try (DataFileWriter<ByteBuffer> fileWriter = new DataFileWriter<>(datumWriter)) {
    fileWriter.create(schema, outStream);
    boolean first = true;
    for (List<T> elems : elemsList) {
      if (first) {
        first = false;
      } else {
        // Ensure a block boundary here.
        long syncPoint = fileWriter.sync();
        fileInfo.syncPoints.add(syncPoint);
      }
      for (T elem : elems) {
        byte[] encodedElement = CoderUtils.encodeToByteArray(coder, elem);
        fileWriter.append(ByteBuffer.wrap(encodedElement));
        fileInfo.elementSizes.add(encodedElement.length);
        fileInfo.totalElementEncodedSize += encodedElement.length;
      }
    }
  }

  return fileInfo;
}
 
Example #28
Source File: SnowflakeIO.java    From beam with Apache License 2.0
@ProcessElement
public void processElement(ProcessContext c) throws IOException {
  String combinedPath = stagingBucketDir + "/**";
  List<ResourceId> paths =
      FileSystems.match(combinedPath).metadata().stream()
          .map(metadata -> metadata.resourceId())
          .collect(Collectors.toList());

  FileSystems.delete(paths, MoveOptions.StandardMoveOptions.IGNORE_MISSING_FILES);
}
 
Example #29
Source File: FilePatternMatchingShardedFile.java    From beam with Apache License 2.0
/** Discovers all shards of this file using the provided {@link Sleeper} and {@link BackOff}. */
@Override
public List<String> readFilesWithRetries(Sleeper sleeper, BackOff backOff)
    throws IOException, InterruptedException {
  IOException lastException = null;

  do {
    try {
      Collection<Metadata> files = FileSystems.match(filePattern).metadata();
      LOG.debug(
          "Found file(s) {} by matching the path: {}",
          files.stream()
              .map(Metadata::resourceId)
              .map(ResourceId::getFilename)
              .collect(Collectors.joining(",")),
          filePattern);
      if (files.isEmpty()) {
        continue;
      }
      // Read data from file paths
      return readLines(files);
    } catch (IOException e) {
      // Ignore and retry
      lastException = e;
      LOG.warn("Error in file reading. Ignore and retry.");
    }
  } while (BackOffUtils.next(sleeper, backOff));
  // Failed after max retries
  throw new IOException(
      String.format("Unable to read file(s) after retrying %d times", MAX_READ_RETRIES),
      lastException);
}
 
Example #30
Source File: MetadataCoderV2Test.java    From beam with Apache License 2.0
@Test
public void testEncodeDecodeWithCustomLastModifiedMills() throws Exception {
  Path filePath = tmpFolder.newFile("somefile").toPath();
  Metadata metadata =
      Metadata.builder()
          .setResourceId(
              FileSystems.matchNewResource(filePath.toString(), false /* isDirectory */))
          .setIsReadSeekEfficient(true)
          .setSizeBytes(1024)
          .setLastModifiedMillis(1541097000L)
          .build();
  CoderProperties.coderDecodeEncodeEqual(MetadataCoderV2.of(), metadata);
}