Java Code Examples for org.apache.beam.sdk.io.FileSystems

The following examples show how to use org.apache.beam.sdk.io.FileSystems. They are extracted from open source projects; the originating project, source file, and license are noted above each example.
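
Most of the examples below combine a handful of FileSystems entry points: matchSingleFileSpec, match, and matchNewResource resolve ResourceIds; open and create give channel-based reads and writes; copy and delete operate on batches of resources; and setDefaultPipelineOptions registers the available file systems (such as gs://). The following is a minimal sketch of a write/read round trip, not taken from any project; the gs:// paths are placeholders, and the snippet assumes the usual Beam imports (FileSystems, ResourceId, Metadata, MimeTypes) plus Guava's ByteStreams.

/** Minimal sketch of the core FileSystems calls used throughout these examples. */
static void fileSystemsRoundTrip() throws IOException {
  // Register file-system implementations (e.g. gs://) from the pipeline options.
  FileSystems.setDefaultPipelineOptions(PipelineOptionsFactory.create());

  // Resolve a resource that may not exist yet and write to it.
  ResourceId output =
      FileSystems.matchNewResource("gs://example-bucket/out.txt", false /* isDirectory */);
  try (WritableByteChannel writeChannel = FileSystems.create(output, MimeTypes.TEXT)) {
    writeChannel.write(ByteBuffer.wrap("hello".getBytes(StandardCharsets.UTF_8)));
  }

  // Resolve an existing resource and read it back.
  Metadata existing = FileSystems.matchSingleFileSpec("gs://example-bucket/out.txt");
  try (InputStream stream = Channels.newInputStream(FileSystems.open(existing.resourceId()))) {
    String contents = new String(ByteStreams.toByteArray(stream), StandardCharsets.UTF_8);
  }
}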
Example 1
Source Project: beam   Source File: NexmarkUtils.java    License: Apache License 2.0
/** Frees any resources used to make the side input available. */
public static void cleanUpSideInput(NexmarkConfiguration config) throws IOException {
  switch (config.sideInputType) {
    case DIRECT:
      break;
    case CSV:
      FileSystems.delete(
          FileSystems.match(config.sideInputUrl + "*").metadata().stream()
              .map(metadata -> metadata.resourceId())
              .collect(Collectors.toList()));
      break;
    default:
      throw new IllegalArgumentException(
          String.format(
              "Unknown type of %s clean up requested", SideInputType.class.getSimpleName()));
  }
}
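
When some of the matched files may already be gone by the time the delete runs, FileSystems.delete also accepts MoveOptions; Example 25 further down passes StandardMoveOptions.IGNORE_MISSING_FILES so that missing resources do not cause a failure. A one-line sketch (resourceIds is a placeholder List<ResourceId>):

FileSystems.delete(resourceIds, MoveOptions.StandardMoveOptions.IGNORE_MISSING_FILES);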
 
Example 2
Source Project: gcp-ingestion   Source File: GeoCityLookup.java    License: Mozilla Public License 2.0
/**
 * Returns a singleton object for reading from the GeoCity database.
 *
 * <p>We copy the configured database file to a static temp location so that the MaxMind API can
 * save on heap usage by using memory mapping. The reader is threadsafe and this singleton pattern
 * allows multiple worker threads on the same machine to share a single reader instance.
 *
 * <p>Note that we do not clean up the temp mmdb file, but it's a static path, so running locally
 * will overwrite the existing path every time rather than creating an unbounded number of copies.
 * This also assumes that only one JVM per machine is running this code. In the production case
 * where this is running on Cloud Dataflow, we should always have a clean environment and the temp
 * state will be cleaned up along with the workers once the job finishes. However, behavior is
 * undefined if you run multiple local jobs concurrently.
 *
 * @throws IOException if the configured file path is not a valid .mmdb file
 */
private static synchronized DatabaseReader getOrCreateSingletonGeoCityReader(
    ValueProvider<String> geoCityDatabase) throws IOException {
  if (singletonGeoCityReader == null) {
    File mmdb;
    try {
      InputStream inputStream;
      Metadata metadata = FileSystems.matchSingleFileSpec(geoCityDatabase.get());
      ReadableByteChannel channel = FileSystems.open(metadata.resourceId());
      inputStream = Channels.newInputStream(channel);
      Path mmdbPath = Paths.get(System.getProperty("java.io.tmpdir"), "GeoCityLookup.mmdb");
      Files.copy(inputStream, mmdbPath, StandardCopyOption.REPLACE_EXISTING);
      mmdb = mmdbPath.toFile();
    } catch (IOException e) {
      throw new IOException("Exception thrown while fetching configured geoCityDatabase", e);
    }
    singletonGeoCityReader = new DatabaseReader.Builder(mmdb).withCache(new CHMCache()).build();
  }
  return singletonGeoCityReader;
}
 
Example 3
Source Project: nomulus   Source File: BeamJpaModuleTest.java    License: Apache License 2.0
/**
 * Integration test with a GCP project, only run when the 'test.gcp_integration.env' property is
 * defined. Otherwise this test is ignored. This is meant to be run from a developer's desktop,
 * with auth already set up by gcloud.
 *
 * <p>Example: {@code gradlew test -P test.gcp_integration.env=alpha}.
 *
 * <p>See <a href="../../../../../../../../java_common.gradle">java_common.gradle</a> for more
 * information.
 */
@Test
public void getJpaTransactionManager_cloudSql_authRequired() {
  String environmentName = System.getProperty("test.gcp_integration.env");
  assumeThat(environmentName, notNullValue());

  FileSystems.setDefaultPipelineOptions(PipelineOptionsFactory.create());
  JpaTransactionManager jpa =
      DaggerBeamJpaModule_JpaTransactionManagerComponent.builder()
          .beamJpaModule(
              new BeamJpaModule(
                  BackupPaths.getCloudSQLCredentialFilePatterns(environmentName).get(0)))
          .build()
          .cloudSqlJpaTransactionManager();
  assertThat(
          jpa.transact(
              () -> jpa.getEntityManager().createNativeQuery("select 1").getSingleResult()))
      .isEqualTo(1);
}
 
Example 4
Source Project: java-docs-samples   Source File: CsvToAvro.java    License: Apache License 2.0
public static String getSchema(String schemaPath) throws IOException {
  ReadableByteChannel chan = FileSystems.open(FileSystems.matchNewResource(
      schemaPath, false));

  try (InputStream stream = Channels.newInputStream(chan)) {
    BufferedReader streamReader = new BufferedReader(new InputStreamReader(stream, "UTF-8"));
    StringBuilder dataBuilder = new StringBuilder();

    String line;
    while ((line = streamReader.readLine()) != null) {
      dataBuilder.append(line);
    }

    return dataBuilder.toString();
  }
}
 
Example 5
Source Project: deployment-examples   Source File: FileUtils.java    License: MIT License
public static String copyFile(ResourceId sourceFile, ResourceId destinationFile)
    throws IOException {

  try (WritableByteChannel writeChannel = FileSystems.create(destinationFile, "text/plain")) {
    try (ReadableByteChannel readChannel = FileSystems.open(sourceFile)) {

      final ByteBuffer buffer = ByteBuffer.allocateDirect(16 * 1024);
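      // Standard NIO copy loop: flip switches the buffer to drain mode so it can be written
      // to the destination, and compact keeps any bytes the write did not consume.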
      while (readChannel.read(buffer) != -1) {
        buffer.flip();
        writeChannel.write(buffer);
        buffer.compact();
      }
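      // The source is exhausted; flip once more and drain whatever is still buffered.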
      buffer.flip();
      while (buffer.hasRemaining()) {
        writeChannel.write(buffer);
      }
    }
  }

  return destinationFile.toString();
}
 
Example 6
Source Project: DataflowTemplates   Source File: WindowedFilenamePolicy.java    License: Apache License 2.0
/**
 * Resolves any date variables which exist in the output directory path. This allows the output
 * location to change dynamically based on the window end time.
 *
 * @return The new output directory with all variables resolved.
 */
private ResourceId resolveWithDateTemplates(
        ValueProvider<String> outputDirectoryStr, BoundedWindow window) {
    ResourceId outputDirectory = FileSystems.matchNewResource(outputDirectoryStr.get(), true);
    if (window instanceof IntervalWindow) {
        IntervalWindow intervalWindow = (IntervalWindow) window;
        DateTime time = intervalWindow.end().toDateTime();
        String outputPath = outputDirectory.toString();
        outputPath = outputPath.replace("YYYY", YEAR.print(time));
        outputPath = outputPath.replace("MM", MONTH.print(time));
        outputPath = outputPath.replace("DD", DAY.print(time));
        outputPath = outputPath.replace("HH", HOUR.print(time));
        outputPath = outputPath.replace("mm", MINUTE.print(time));
        outputDirectory = FileSystems.matchNewResource(outputPath, true);
    }
    return outputDirectory;
}
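
The YEAR, MONTH, DAY, HOUR, and MINUTE formatters referenced above are defined elsewhere in the template and are not shown in this snippet. A minimal sketch of how they could be declared with Joda-Time's DateTimeFormat; the pattern strings are assumptions for illustration, not the template's actual constants:

// Hypothetical formatter declarations (patterns are assumptions).
private static final DateTimeFormatter YEAR = DateTimeFormat.forPattern("YYYY");
private static final DateTimeFormatter MONTH = DateTimeFormat.forPattern("MM");
private static final DateTimeFormatter DAY = DateTimeFormat.forPattern("dd");
private static final DateTimeFormatter HOUR = DateTimeFormat.forPattern("HH");
private static final DateTimeFormatter MINUTE = DateTimeFormat.forPattern("mm");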
 
Example 7
Source Project: DataflowTemplates   Source File: TextToBigQueryStreaming.java    License: Apache License 2.0
/**
 * Method to read a BigQuery schema file from GCS and return the file contents as a string.
 *
 * @param gcsPath Path string for the schema file in GCS.
 * @return File contents as a string.
 */
private static ValueProvider<String> getSchemaFromGCS(ValueProvider<String> gcsPath) {
  return NestedValueProvider.of(
      gcsPath,
      new SimpleFunction<String, String>() {
        @Override
        public String apply(String input) {
          ResourceId sourceResourceId = FileSystems.matchNewResource(input, false);

          String schema;
          try (ReadableByteChannel rbc = FileSystems.open(sourceResourceId)) {
            try (ByteArrayOutputStream baos = new ByteArrayOutputStream()) {
              try (WritableByteChannel wbc = Channels.newChannel(baos)) {
                ByteStreams.copy(rbc, wbc);
                schema = baos.toString(Charsets.UTF_8.name());
                LOG.info("Extracted schema: " + schema);
              }
            }
          } catch (IOException e) {
            LOG.error("Error extracting schema: " + e.getMessage());
            throw new RuntimeException(e);
          }
          return schema;
        }
      });
}
 
Example 8
Source Project: beam   Source File: GCSUploadMain.java    License: Apache License 2.0
public static void main(String[] args) {
  DataflowPipelineOptions options =
      PipelineOptionsFactory.fromArgs(args).as(DataflowPipelineOptions.class);
  FileSystems.setDefaultPipelineOptions(options);
  GcsStager stager = GcsStager.fromOptions(options);
  stager.stageFiles(
      options.getFilesToStage().stream()
          .map(
              (String source) -> {
                try {
                  File file = new File(source);
                  HashCode hashCode = Files.asByteSource(file).hash(Hashing.sha256());
                  return PackageUtil.StagedFile.of(
                      source,
                      hashCode.toString(),
                      Environments.createStagingFileName(file, hashCode));
                } catch (IOException e) {
                  throw new UncheckedIOException(e);
                }
              })
          .collect(Collectors.toList()));
}
 
Example 9
Source Project: beam   Source File: IsmSinkTest.java    License: Apache License 2.0
@Test
public void testWriteKeyWhichIsProperPrefixOfPreviousSecondaryKeyIsError() throws Throwable {
  IsmSink<byte[]> sink =
      new IsmSink<>(
          FileSystems.matchNewResource(tmpFolder.newFile().getPath(), false),
          CODER,
          BLOOM_FILTER_SIZE_LIMIT);
  SinkWriter<WindowedValue<IsmRecord<byte[]>>> sinkWriter = sink.writer();
  sinkWriter.add(
      new ValueInEmptyWindows<>(
          IsmRecord.of(ImmutableList.of(EMPTY, new byte[] {0x00, 0x00}), EMPTY)));

  expectedException.expect(IllegalArgumentException.class);
  expectedException.expectMessage("expects keys to be written in strictly increasing order");
  sinkWriter.add(
      new ValueInEmptyWindows<>(IsmRecord.of(ImmutableList.of(EMPTY, new byte[] {0x00}), EMPTY)));
}
 
Example 10
Source Project: DataflowTemplates   Source File: BulkCompressorTest.java    License: Apache License 2.0
/** Tests the {@link BulkCompressor.Compressor} performs compression properly. */
@Test
public void testCompressFile() throws Exception {
  // Setup test
  final Compression compression = Compression.GZIP;

  final ValueProvider<String> outputDirectoryProvider =
      pipeline.newProvider(tempFolderCompressedPath.toString());

  final ValueProvider<Compression> compressionProvider = StaticValueProvider.of(compression);

  final Metadata metadata = FileSystems.matchSingleFileSpec(textFile.toString());

  // Execute the compressor
  PCollection<String> lines = pipeline
      .apply("Create File Input", Create.of(metadata))
      .apply("Compress", ParDo.of(new Compressor(outputDirectoryProvider, compressionProvider)))
      .apply("Read the Files", TextIO.readAll().withCompression(Compression.AUTO));

  // Test the result
  PAssert.that(lines).containsInAnyOrder(FILE_CONTENT);
  pipeline.run();
}
 
Example 11
Source Project: DataflowTemplates   Source File: TestUtils.java    License: Apache License 2.0
/**
 * Helper to generate files for testing.
 *
 * @param filePath The path to the file to write.
 * @param lines The lines to write.
 * @param compression The compression type of the file.
 * @return The file written.
 * @throws IOException If an error occurs while creating or writing the file.
 */
public static ResourceId writeToFile(
    String filePath, List<String> lines, Compression compression) throws IOException {

  String fileContents = String.join(System.lineSeparator(), lines);

  ResourceId resourceId = FileSystems.matchNewResource(filePath, false);

  String mimeType =
      compression == Compression.UNCOMPRESSED ? MimeTypes.TEXT : MimeTypes.BINARY;

  // Write the file contents to the channel and close.
  try (ReadableByteChannel readChannel =
      Channels.newChannel(new ByteArrayInputStream(fileContents.getBytes()))) {
    try (WritableByteChannel writeChannel =
        compression.writeCompressed(FileSystems.create(resourceId, mimeType))) {
      ByteStreams.copy(readChannel, writeChannel);
    }
  }

  return resourceId;
}
 
Example 12
Source Project: beam   Source File: AvroByteReader.java    License: Apache License 2.0
@Override
public AvroByteFileIterator iterator() throws IOException {
  BoundedSource.BoundedReader<ByteBuffer> reader;
  if (startPosition == 0 && endPosition == Long.MAX_VALUE) {
    // Read entire file (or collection of files).
    reader = avroSource.createReader(options);
  } else {
    // Read a subrange of file.
    reader =
        avroSource
            .createForSubrangeOfFile(
                FileSystems.matchSingleFileSpec(filename), startPosition, endPosition)
            .createReader(options);
  }
  return new AvroByteFileIterator((AvroReader<ByteBuffer>) reader);
}
 
Example 13
Source Project: beam   Source File: IsmReaderTest.java    License: Apache License 2.0
@Test
public void testReadMissingKeys() throws Exception {
  File tmpFile = tmpFolder.newFile();
  List<IsmRecord<byte[]>> data = new ArrayList<>();
  data.add(IsmRecord.<byte[]>of(ImmutableList.of(EMPTY, new byte[] {0x04}), EMPTY));
  data.add(IsmRecord.<byte[]>of(ImmutableList.of(EMPTY, new byte[] {0x08}), EMPTY));
  writeElementsToFile(data, tmpFile);

  IsmReader<byte[]> reader =
      new IsmReaderImpl<byte[]>(
          FileSystems.matchSingleFileSpec(tmpFile.getAbsolutePath()).resourceId(), CODER, cache);

  // Check that we got false with a key before all keys contained in the file.
  assertFalse(reader.overKeyComponents(ImmutableList.of(EMPTY, new byte[] {0x02})).start());
  // Check that we got false with a key between two other keys contained in the file.
  assertFalse(reader.overKeyComponents(ImmutableList.of(EMPTY, new byte[] {0x06})).start());
  // Check that we got false with a key that is after all keys contained in the file.
  assertFalse(reader.overKeyComponents(ImmutableList.of(EMPTY, new byte[] {0x10})).start());
}
 
Example 14
Source Project: beam   Source File: IsmSinkTest.java    License: Apache License 2.0
@Test
public void testWriteNonContiguousShardsIsError() throws Throwable {
  IsmSink<byte[]> sink =
      new IsmSink<>(
          FileSystems.matchNewResource(tmpFolder.newFile().getPath(), false),
          CODER,
          BLOOM_FILTER_SIZE_LIMIT);
  SinkWriter<WindowedValue<IsmRecord<byte[]>>> sinkWriter = sink.writer();
  sinkWriter.add(
      new ValueInEmptyWindows<>(IsmRecord.of(ImmutableList.of(new byte[] {0x00}, EMPTY), EMPTY)));
  sinkWriter.add(
      new ValueInEmptyWindows<>(IsmRecord.of(ImmutableList.of(new byte[] {0x01}, EMPTY), EMPTY)));

  expectedException.expect(IllegalStateException.class);
  expectedException.expectMessage("for shard which already exists");
  sinkWriter.add(
      new ValueInEmptyWindows<>(IsmRecord.of(ImmutableList.of(new byte[] {0x00}, EMPTY), EMPTY)));
}
 
Example 15
Source Project: java-docs-samples   Source File: CsvToAvro.java    License: Apache License 2.0
public static void runCsvToAvro(SampleOptions options)
    throws IOException, IllegalArgumentException {
  FileSystems.setDefaultPipelineOptions(options);

  // Get Avro Schema
  String schemaJson = getSchema(options.getAvroSchema());
  Schema schema = new Schema.Parser().parse(schemaJson);

  // Check schema field types before starting the Dataflow job
  checkFieldTypes(schema);

  // Create the Pipeline object with the options we defined above.
  Pipeline pipeline = Pipeline.create(options);

  // Convert CSV to Avro
  pipeline.apply("Read CSV files", TextIO.read().from(options.getInputFile()))
      .apply("Convert CSV to Avro formatted data",
          ParDo.of(new ConvertCsvToAvro(schemaJson, options.getCsvDelimiter())))
      .setCoder(AvroCoder.of(GenericRecord.class, schema))
      .apply("Write Avro formatted data", AvroIO.writeGenericRecords(schemaJson)
          .to(options.getOutput()).withCodec(CodecFactory.snappyCodec()).withSuffix(".avro"));

  // Run the pipeline.
  pipeline.run().waitUntilFinish();
}
 
Example 16
Source Project: beam   Source File: WindowedWordCountIT.java    License: Apache License 2.0
private WindowedWordCountITOptions defaultOptions() throws Exception {
  WindowedWordCountITOptions options =
      TestPipeline.testingPipelineOptions().as(WindowedWordCountITOptions.class);
  options.setInputFile(DEFAULT_INPUT);
  options.setTestTimeoutSeconds(1200L);

  options.setMinTimestampMillis(0L);
  options.setMaxTimestampMillis(Duration.standardHours(1).getMillis());
  options.setWindowSize(10);

  options.setOutput(
      FileSystems.matchNewResource(options.getTempRoot(), true)
          .resolve(
              String.format(
                  "WindowedWordCountIT.%s-%tFT%<tH:%<tM:%<tS.%<tL+%s",
                  testName.getMethodName(), new Date(), ThreadLocalRandom.current().nextInt()),
              StandardResolveOptions.RESOLVE_DIRECTORY)
          .resolve("output", StandardResolveOptions.RESOLVE_DIRECTORY)
          .resolve("results", StandardResolveOptions.RESOLVE_FILE)
          .toString());
  return options;
}
 
Example 17
Source Project: beam   Source File: TfIdfIT.java    License: Apache License 2.0
@Test
public void testE2ETfIdf() throws Exception {
  TfIdfITOptions options = TestPipeline.testingPipelineOptions().as(TfIdfITOptions.class);
  options.setInput(DEFAULT_INPUT);
  options.setOutput(
      FileSystems.matchNewResource(options.getTempRoot(), true)
          .resolve(
              String.format("TfIdfIT-%tF-%<tH-%<tM-%<tS-%<tL", new Date()),
              StandardResolveOptions.RESOLVE_DIRECTORY)
          .resolve("output", StandardResolveOptions.RESOLVE_DIRECTORY)
          .resolve("results", StandardResolveOptions.RESOLVE_FILE)
          .toString());
  TfIdf.runTfIdf(options);

  assertThat(
      new NumberedShardedFile(options.getOutput() + "*-of-*.csv", DEFAULT_SHARD_TEMPLATE),
      fileContentsHaveChecksum(EXPECTED_OUTPUT_CHECKSUM));
}
 
Example 18
Source Project: beam   Source File: IsmSinkTest.java    License: Apache License 2.0
@Test
public void testWriteOutOfOrderKeysWithSameShardKeyIsError() throws Throwable {
  IsmSink<byte[]> sink =
      new IsmSink<>(
          FileSystems.matchNewResource(tmpFolder.newFile().getPath(), false),
          CODER,
          BLOOM_FILTER_SIZE_LIMIT);
  SinkWriter<WindowedValue<IsmRecord<byte[]>>> sinkWriter = sink.writer();
  sinkWriter.add(
      new ValueInEmptyWindows<>(IsmRecord.of(ImmutableList.of(EMPTY, new byte[] {0x01}), EMPTY)));

  expectedException.expect(IllegalArgumentException.class);
  expectedException.expectMessage("expects keys to be written in strictly increasing order");
  sinkWriter.add(
      new ValueInEmptyWindows<>(IsmRecord.of(ImmutableList.of(EMPTY, new byte[] {0x00}), EMPTY)));
}
 
Example 19
Source Project: beam   Source File: IsmSinkTest.java    License: Apache License 2.0
@Test
public void testWriteEmptyKeyWithValueLargerThanBlockSize() throws Throwable {
  IsmSink<byte[]> sink =
      new IsmSink<>(
          FileSystems.matchNewResource(tmpFolder.newFile().getPath(), false),
          IsmRecordCoder.of(
              1, // We hash using only the window
              0, // There are no metadata records
              // We specifically use a coder that encodes to 0 bytes.
              ImmutableList.<Coder<?>>of(VoidCoder.of()),
              ByteArrayCoder.of()),
          BLOOM_FILTER_SIZE_LIMIT);
  SinkWriter<WindowedValue<IsmRecord<byte[]>>> sinkWriter = sink.writer();
  sinkWriter.add(
      new ValueInEmptyWindows<>(
          IsmRecord.of(
              Arrays.asList(new Object[] {null}), new byte[IsmSink.BLOCK_SIZE_BYTES * 2])));
  sinkWriter.close();
}
 
Example 20
Source Project: beam   Source File: FilePatternMatchingShardedFile.java    License: Apache License 2.0
/**
 * Reads all the lines of all the files.
 *
 * <p>Not suitable for use except in testing of small data, since the data size may be far more
 * than can be reasonably processed serially, in-memory, by a single thread.
 */
@VisibleForTesting
List<String> readLines(Collection<Metadata> files) throws IOException {
  List<String> allLines = Lists.newArrayList();
  int i = 1;
  for (Metadata file : files) {
    try (Reader reader =
        Channels.newReader(FileSystems.open(file.resourceId()), StandardCharsets.UTF_8.name())) {
      List<String> lines = CharStreams.readLines(reader);
      allLines.addAll(lines);
      LOG.debug("[{} of {}] Read {} lines from file: {}", i, files.size(), lines.size(), file);
    }
    i++;
  }
  return allLines;
}
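
The Collection<Metadata> argument is typically produced by expanding a file pattern with FileSystems.match. A short usage sketch from inside the same class (filePattern is a placeholder string, such as a glob over the output shards):

List<Metadata> files = FileSystems.match(filePattern).metadata();
List<String> allLines = readLines(files);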
 
Example 21
Source Project: beam   Source File: PackageUtil.java    License: Apache License 2.0
public static PackageAttributes forBytesToStage(
    byte[] bytes, String targetName, String stagingPath) {
  HashCode hashCode = Hashing.sha256().newHasher().putBytes(bytes).hash();
  long size = bytes.length;

  String target = Environments.createStagingFileName(new File(targetName), hashCode);

  String resourcePath =
      FileSystems.matchNewResource(stagingPath, true)
          .resolve(target, StandardResolveOptions.RESOLVE_FILE)
          .toString();
  DataflowPackage targetPackage = new DataflowPackage();
  targetPackage.setName(target);
  targetPackage.setLocation(resourcePath);

  return new AutoValue_PackageUtil_PackageAttributes(
      null, bytes, targetPackage, size, hashCode.toString());
}
 
Example 22
Source Project: beam   Source File: DataflowPipelineTranslatorTest.java    License: Apache License 2.0
private static DataflowPipelineOptions buildPipelineOptions() throws IOException {
  GcsUtil mockGcsUtil = mock(GcsUtil.class);
  when(mockGcsUtil.expand(any(GcsPath.class)))
      .then(invocation -> ImmutableList.of((GcsPath) invocation.getArguments()[0]));
  when(mockGcsUtil.bucketAccessible(any(GcsPath.class))).thenReturn(true);

  DataflowPipelineOptions options = PipelineOptionsFactory.as(DataflowPipelineOptions.class);
  options.setRunner(DataflowRunner.class);
  options.setGcpCredential(new TestCredential());
  options.setJobName("some-job-name");
  options.setProject("some-project");
  options.setRegion("some-region");
  options.setTempLocation(GcsPath.fromComponents("somebucket", "some/path").toString());
  options.setFilesToStage(new ArrayList<>());
  options.setDataflowClient(buildMockDataflow(new IsValidCreateRequest()));
  options.setGcsUtil(mockGcsUtil);

  // Enable the FileSystems API to know about gs:// URIs in this test.
  FileSystems.setDefaultPipelineOptions(options);

  return options;
}
 
Example 23
Source Project: beam   Source File: IsmReaderFactoryTest.java    License: Apache License 2.0
@Test
public void testFactory() throws Exception {
  WindowedValueCoder<?> coder =
      WindowedValue.getFullCoder(
          IsmRecordCoder.of(
              1, 0, ImmutableList.<Coder<?>>of(StringUtf8Coder.of()), VarLongCoder.of()),
          GlobalWindow.Coder.INSTANCE);

  String tmpFile = tmpFolder.newFile().getPath();
  ResourceId tmpResourceId = FileSystems.matchSingleFileSpec(tmpFile).resourceId();
  @SuppressWarnings("rawtypes")
  IsmReader<?> ismReader =
      (IsmReader)
          new IsmReaderFactory()
              .create(
                  createSpecForFilename(tmpFile),
                  coder,
                  options,
                  executionContext,
                  operationContext);
  assertEquals(coder.getValueCoder(), ismReader.getCoder());
  assertEquals(tmpResourceId, ismReader.getResourceId());
}
 
Example 24
Source Project: beam   Source File: ArtifactRetrievalService.java    License: Apache License 2.0
public static InputStream getArtifact(RunnerApi.ArtifactInformation artifact) throws IOException {
  switch (artifact.getTypeUrn()) {
    case FILE_ARTIFACT_URN:
      RunnerApi.ArtifactFilePayload payload =
          RunnerApi.ArtifactFilePayload.parseFrom(artifact.getTypePayload());
      return Channels.newInputStream(
          FileSystems.open(
              FileSystems.matchNewResource(payload.getPath(), false /* is directory */)));
    case EMBEDDED_ARTIFACT_URN:
      return RunnerApi.EmbeddedFilePayload.parseFrom(artifact.getTypePayload())
          .getData()
          .newInput();
    default:
      throw new UnsupportedOperationException(
          "Unexpected artifact type: " + artifact.getTypeUrn());
  }
}
 
Example 25
Source Project: beam   Source File: BigQueryIO.java    License: Apache License 2.0 6 votes vote down vote up
@Override
void cleanup(PassThroughThenCleanup.ContextContainer c) throws Exception {
  PipelineOptions options = c.getPipelineOptions();
  BigQueryOptions bqOptions = options.as(BigQueryOptions.class);
  String jobUuid = c.getJobId();
  final String extractDestinationDir =
      resolveTempLocation(bqOptions.getTempLocation(), "BigQueryExtractTemp", jobUuid);
  final String executingProject = bqOptions.getProject();
  JobReference jobRef =
      new JobReference()
          .setProjectId(executingProject)
          .setJobId(getExtractJobId(createJobIdToken(bqOptions.getJobName(), jobUuid)));

  Job extractJob = getBigQueryServices().getJobService(bqOptions).getJob(jobRef);

  if (extractJob != null) {
    List<ResourceId> extractFiles =
        getExtractFilePaths(extractDestinationDir, extractJob);
    if (extractFiles != null && !extractFiles.isEmpty()) {
      FileSystems.delete(
          extractFiles, MoveOptions.StandardMoveOptions.IGNORE_MISSING_FILES);
    }
  }
}
 
Example 26
Source Project: gcp-ingestion   Source File: HashClientInfo.java    License: Mozilla Public License 2.0
@VisibleForTesting
byte[] readBytes(String uri) throws IOException {
  Metadata metadata = FileSystems.matchSingleFileSpec(uri);
  ReadableByteChannel inputChannel = FileSystems.open(metadata.resourceId());
  try (InputStream inputStream = Channels.newInputStream(inputChannel)) {
    byte[] key = new byte[32];
    int bytesRead = inputStream.read(key);
    if (bytesRead != 32) {
      throw new KeyLengthMismatchException(bytesRead);
    }
    return key;
  }
}
 
Example 27
Source Project: deployment-examples   Source File: FileUtils.java    License: MIT License
public static String copyFileFromGCSToWorker(ExecutableFile execuableFile) throws Exception {

    ResourceId sourceFile =
        FileSystems.matchNewResource(execuableFile.getSourceGCSLocation(), false);
    ResourceId destinationFile =
        FileSystems.matchNewResource(execuableFile.getDestinationLocation(), false);
    try {
      LOG.info(
          String.format(
              "Moving File %s to %s ",
              execuableFile.getSourceGCSLocation(), execuableFile.getDestinationLocation()));
      Path path = Paths.get(execuableFile.getDestinationLocation());

      if (path.toFile().exists()) {
        LOG.warn(
            String.format(
                "Overwriting file %s, should only see this once per worker.",
                execuableFile.getDestinationLocation()));
      }
      copyFile(sourceFile, destinationFile);
      path.toFile().setExecutable(true);
      return path.toString();

    } catch (Exception ex) {
      LOG.error(String.format("Error moving file : %s ", execuableFile.fileName), ex);
      throw ex;
    }
  }
 
Example 28
Source Project: beam   Source File: AvroByteSinkTest.java    License: Apache License 2.0
<T> void runTestWriteFile(List<T> elems, Coder<T> coder) throws Exception {
  File tmpFile = tmpFolder.newFile("file.avro");
  String filename = tmpFile.getPath();

  // Write the file.

  AvroByteSink<T> avroSink =
      new AvroByteSink<>(FileSystems.matchNewResource(filename, false), coder);
  List<Long> actualSizes = new ArrayList<>();
  try (Sink.SinkWriter<T> writer = avroSink.writer()) {
    for (T elem : elems) {
      actualSizes.add(writer.add(elem));
    }
  }

  // Read back the file.
  AvroByteReader<T> reader =
      new AvroByteReader<>(filename, 0L, Long.MAX_VALUE, coder, PipelineOptionsFactory.create());

  List<T> actual = readAllFromReader(reader);
  List<Long> expectedSizes = new ArrayList<>();

  for (T value : actual) {
    expectedSizes.add((long) CoderUtils.encodeToByteArray(coder, value).length);
  }

  // Compare the expected and the actual elements.
  Assert.assertEquals(elems, actual);
  Assert.assertEquals(expectedSizes, actualSizes);
}
 
Example 29
/**
 * Helper to generate files for testing.
 *
 * @param filePath The path to the file to write.
 * @param fileContents The content to write.
 * @return The file written.
 * @throws IOException If an error occurs while creating or writing the file.
 */
private static ResourceId writeToFile(String filePath, String fileContents) throws IOException {

  ResourceId resourceId = FileSystems.matchNewResource(filePath, false);

  // Write the file contents to the channel and close.
  try (ReadableByteChannel readChannel =
      Channels.newChannel(new ByteArrayInputStream(fileContents.getBytes()))) {
    try (WritableByteChannel writeChannel = FileSystems.create(resourceId, MimeTypes.TEXT)) {
      ByteStreams.copy(readChannel, writeChannel);
    }
  }

  return resourceId;
}
 
Example 30
@ProcessElement
public void process(
    @Element Metadata dlqFile,
    OutputReceiver<String> outputs) throws IOException {

  // First we move the file to a temporary location so it will not be picked up
  // by the DLQ picker again.
  ResourceId newFileLocation = dlqFile.resourceId()
      .getCurrentDirectory()
      .resolve(TEMPORARY_HOLD_SUBDIRECTORY, StandardResolveOptions.RESOLVE_DIRECTORY)
      .resolve(dlqFile.resourceId().getFilename(), StandardResolveOptions.RESOLVE_FILE);

  LOG.info("Moving DLQ file {} to {}", dlqFile.resourceId().getFilename(), newFileLocation);
  // If this move fails, the file has already been moved, so we ignore the failure.
  FileSystems.copy(
      Collections.singletonList(dlqFile.resourceId()),
      Collections.singletonList(newFileLocation),
      StandardMoveOptions.IGNORE_MISSING_FILES);

  InputStream jsonStream = Channels.newInputStream(FileSystems.open(newFileLocation));
  BufferedReader jsonReader = new BufferedReader(new InputStreamReader(jsonStream));

  // Assuming that files are JSONLines formatted.
  jsonReader.lines().forEach(outputs::output);
  this.filesToRemove.add(dlqFile.resourceId());
  this.filesToRemove.add(newFileLocation);
}