org.apache.beam.sdk.io.Compression Java Examples

The following examples show how to use org.apache.beam.sdk.io.Compression. They are drawn from several open-source projects; the source file, project, and license are noted above each example.
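Before the project-specific examples, here is a minimal sketch of the two most common uses of Compression: telling a file-based read which codec to expect and telling a write which codec to produce. The bucket paths are placeholders, and Compression.AUTO could be used on the read side to infer the codec from the file extension instead.

```java
import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.io.Compression;
import org.apache.beam.sdk.io.TextIO;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.values.PCollection;

public class CompressionExample {
  public static void main(String[] args) {
    Pipeline p = Pipeline.create(PipelineOptionsFactory.fromArgs(args).create());

    // Read gzip-compressed text files; Compression.AUTO would instead detect
    // the codec from the file extension (.gz, .bz2, .zst, ...).
    PCollection<String> lines =
        p.apply("ReadCompressed",
            TextIO.read()
                .from("gs://my-bucket/input/*.txt.gz") // placeholder path
                .withCompression(Compression.GZIP));

    // Write the lines back out, gzip-compressing each output shard.
    lines.apply("WriteCompressed",
        TextIO.write()
            .to("gs://my-bucket/output/lines")         // placeholder path
            .withSuffix(".txt")
            .withCompression(Compression.GZIP));

    p.run().waitUntilFinish();
  }
}
```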
Example #1
Source File: TikaIOTest.java    From beam with Apache License 2.0
@Test
public void testParseAndParseFiles() throws IOException {
  Path root =
      Paths.get(getClass().getResource("/valid/apache-beam-tika.odt").getPath()).getParent();

  List<ParseResult> expected =
      Arrays.asList(
          ParseResult.success(
              root.resolve("apache-beam-tika.odt").toString(), ODT_FILE, getOdtMetadata()),
          ParseResult.success(root.resolve("apache-beam-tika-pdf.zip").toString(), PDF_ZIP_FILE));

  PCollection<ParseResult> parse =
      p.apply("Parse", TikaIO.parse().filepattern(root.resolve("*").toString()))
          .apply("FilterParse", ParDo.of(new FilterMetadataFn()));
  PAssert.that(parse).containsInAnyOrder(expected);

  PCollection<ParseResult> parseFiles =
      p.apply("ParseFiles", FileIO.match().filepattern(root.resolve("*").toString()))
          .apply(FileIO.readMatches().withCompression(Compression.UNCOMPRESSED))
          .apply(TikaIO.parseFiles())
          .apply("FilterParseFiles", ParDo.of(new FilterMetadataFn()));
  PAssert.that(parseFiles).containsInAnyOrder(expected);
  p.run();
}
 
Example #2
Source File: SinkOptions.java    From gcp-ingestion with Mozilla Public License 2.0
/**
 * Set all the derived fields of a {@link SinkOptions.Parsed} instance.
 */
static void enrichSinkOptions(Parsed options) {
  validateSinkOptions(options);
  options.setParsedWindowDuration(Time.parseDuration(options.getWindowDuration()));
  options.setParsedBqTriggeringFrequency(Time.parseDuration(options.getBqTriggeringFrequency()));
  options.setParsedErrorBqTriggeringFrequency(
      Time.parseDuration(options.getErrorBqTriggeringFrequency()));
  options.setDecompressInputPayloads(
      providerWithDefault(options.getDecompressInputPayloads(), true));
  options.setOutputTableRowFormat(
      providerWithDefault(options.getOutputTableRowFormat(), TableRowFormat.payload));
  options.setOutputPubsubCompression(
      providerWithDefault(options.getOutputPubsubCompression(), Compression.GZIP));
  options.setErrorOutputPubsubCompression(
      providerWithDefault(options.getErrorOutputPubsubCompression(), Compression.GZIP));
  options.setOutputNumShards(providerWithDefault(options.getOutputNumShards(), 100));
  options.setErrorOutputNumShards(providerWithDefault(options.getErrorOutputNumShards(), 100));
}
 
Example #3
Source File: TFRecordIOIT.java    From beam with Apache License 2.0
@BeforeClass
public static void setup() {
  FileBasedIOTestPipelineOptions options = readFileBasedIOITPipelineOptions();
  datasetSize = options.getDatasetSize();
  expectedHash = options.getExpectedHash();
  numberOfTextLines = options.getNumberOfRecords();
  compressionType = Compression.valueOf(options.getCompressionType());
  filenamePrefix = appendTimestampSuffix(options.getFilenamePrefix());
  bigQueryDataset = options.getBigQueryDataset();
  bigQueryTable = options.getBigQueryTable();
  settings =
      InfluxDBSettings.builder()
          .withHost(options.getInfluxHost())
          .withDatabase(options.getInfluxDatabase())
          .withMeasurement(options.getInfluxMeasurement())
          .get();
}
 
Example #4
Source File: TextIOIT.java    From beam with Apache License 2.0
@BeforeClass
public static void setup() {
  FileBasedIOTestPipelineOptions options = readFileBasedIOITPipelineOptions();
  datasetSize = options.getDatasetSize();
  expectedHash = options.getExpectedHash();
  numberOfTextLines = options.getNumberOfRecords();
  compressionType = Compression.valueOf(options.getCompressionType());
  filenamePrefix = appendTimestampSuffix(options.getFilenamePrefix());
  numShards = options.getNumberOfShards();
  bigQueryDataset = options.getBigQueryDataset();
  bigQueryTable = options.getBigQueryTable();
  gatherGcsPerformanceMetrics = options.getReportGcsPerformanceMetrics();
  settings =
      InfluxDBSettings.builder()
          .withHost(options.getInfluxHost())
          .withDatabase(options.getInfluxDatabase())
          .withMeasurement(options.getInfluxMeasurement())
          .get();
}
 
Example #5
Source File: TestUtils.java    From DataflowTemplates with Apache License 2.0
/**
 * Helper to generate files for testing.
 *
 * @param filePath The path to the file to write.
 * @param lines The lines to write.
 * @param compression The compression type of the file.
 * @return The file written.
 * @throws IOException If an error occurs while creating or writing the file.
 */
public static ResourceId writeToFile(
    String filePath, List<String> lines, Compression compression) throws IOException {

  String fileContents = String.join(System.lineSeparator(), lines);

  ResourceId resourceId = FileSystems.matchNewResource(filePath, false);

  String mimeType =
      compression == Compression.UNCOMPRESSED ? MimeTypes.TEXT : MimeTypes.BINARY;

  // Write the file contents to the channel and close.
  try (ReadableByteChannel readChannel =
      Channels.newChannel(new ByteArrayInputStream(fileContents.getBytes()))) {
    try (WritableByteChannel writeChannel =
        compression.writeCompressed(FileSystems.create(resourceId, mimeType))) {
      ByteStreams.copy(readChannel, writeChannel);
    }
  }

  return resourceId;
}
 
Example #6
Source File: PubsubIntegrationTest.java    From gcp-ingestion with Mozilla Public License 2.0
@Test(timeout = 30000)
public void canSendPubsubOutput() throws Exception {
  final List<String> inputLines = Lines.resources("testdata/pubsub-integration/input.ndjson");

  pipeline.getOptions().as(DirectOptions.class).setBlockOnRun(false);

  SinkOptions.Parsed sinkOptions = pipeline.getOptions().as(SinkOptions.Parsed.class);
  sinkOptions.setOutput(pipeline.newProvider(topicName.toString()));
  // We would normally use pipeline.newProvider instead of StaticValueProvider in tests,
  // but something about this configuration causes the pipeline to stall when CompressPayload
  // accesses a method on the underlying enum value when defined via pipeline.newProvider.
  sinkOptions.setOutputPubsubCompression(StaticValueProvider.of(Compression.UNCOMPRESSED));

  pipeline.apply(Create.of(inputLines)).apply(InputFileFormat.json.decode())
      .apply(OutputType.pubsub.write(sinkOptions));

  final PipelineResult result = pipeline.run();

  System.err.println("Waiting for subscriber to receive messages published in the pipeline...");
  List<String> expectedLines = Lines.resources("testdata/pubsub-integration/truncated.ndjson");
  List<String> received = receiveLines(expectedLines.size());
  assertThat(received, matchesInAnyOrder(expectedLines));
  result.cancel();
}
 
Example #7
Source File: BulkCompressorTest.java    From DataflowTemplates with Apache License 2.0
/** Tests the {@link BulkCompressor.Compressor} performs compression properly. */
@Test
public void testCompressFile() throws Exception {
  // Setup test
  final Compression compression = Compression.GZIP;

  final ValueProvider<String> outputDirectoryProvider =
      pipeline.newProvider(tempFolderCompressedPath.toString());

  final ValueProvider<Compression> compressionProvider = StaticValueProvider.of(compression);

  final Metadata metadata = FileSystems.matchSingleFileSpec(textFile.toString());

  // Execute the compressor
  PCollection<String> lines = pipeline
      .apply("Create File Input", Create.of(metadata))
      .apply("Compress", ParDo.of(new Compressor(outputDirectoryProvider, compressionProvider)))
      .apply("Read the Files", TextIO.readAll().withCompression(Compression.AUTO));

  // Test the result
  PAssert.that(lines).containsInAnyOrder(FILE_CONTENT);
  pipeline.run();
}
 
Example #8
Source File: BulkDecompressor.java    From DataflowTemplates with Apache License 2.0
@ProcessElement
public void processElement(ProcessContext context) {
  ResourceId inputFile = context.element().resourceId();

  // Output a record to the failure file if the file doesn't match a known compression.
  if (!Compression.AUTO.isCompressed(inputFile.toString())) {
    String errorMsg =
        String.format(UNCOMPRESSED_ERROR_MSG, inputFile.toString(), SUPPORTED_COMPRESSIONS);

    context.output(DEADLETTER_TAG, KV.of(inputFile.toString(), errorMsg));
  } else {
    try {
      ResourceId outputFile = decompress(inputFile);
      context.output(outputFile.toString());
    } catch (IOException e) {
      LOG.error(e.getMessage());
      context.output(DEADLETTER_TAG, KV.of(inputFile.toString(), e.getMessage()));
    }
  }
}
 
Example #9
Source File: Write.java    From gcp-ingestion with Mozilla Public License 2.0
/** Constructor. */
public PubsubOutput(ValueProvider<String> topic, ValueProvider<Compression> compression,
    int maxCompressedBytes) {
  this.topic = topic;
  this.compression = compression;
  this.maxCompressedBytes = maxCompressedBytes;
}
 
Example #10
Source File: TextStreamingPipeline.java    From dlp-dataflow-deidentification with Apache License 2.0
public static void main(String[] args) throws IOException, GeneralSecurityException {

    TokenizePipelineOptions options =
        PipelineOptionsFactory.fromArgs(args).withValidation().as(TokenizePipelineOptions.class);

    Pipeline p = Pipeline.create(options);
    p.apply(
            FileIO.match()
                .filepattern(options.getInputFile())
                .continuously(
                    Duration.standardSeconds(options.getPollingInterval()), Watch.Growth.never()))
        .apply(FileIO.readMatches().withCompression(Compression.UNCOMPRESSED))
        .apply(
            "Text File Reader",
            ParDo.of(
                new TextFileReader(
                    options.as(GcpOptions.class).getProject(),
                    options.getFileDecryptKeyName(),
                    options.getFileDecryptKey(),
                    options.getBatchSize(),
                    options.getCsek(),
                    options.getCsekhash())))
        .apply(
            "Tokenize Data",
            ParDo.of(
                new TokenizeData(
                    options.as(GcpOptions.class).getProject(),
                    options.getDeidentifyTemplateName(),
                    options.getInspectTemplateName())))
        .apply(
            Window.<String>into(FixedWindows.of(Duration.standardMinutes(options.getInterval()))))
        .apply(new WriteOneFilePerWindow(options.getOutputFile(), 1));

    p.run();
  }
 
Example #11
Source File: CompressPayloadTest.java    From gcp-ingestion with Mozilla Public License 2.0
@Test
public void testGzipCompress() {
  String text = StringUtils.repeat("Lorem ipsum dolor sit amet ", 100);
  byte[] compressedBytes = CompressPayload.compress(text.getBytes(StandardCharsets.UTF_8),
      Compression.GZIP);
  assertThat(ArrayUtils.toObject(compressedBytes), Matchers.arrayWithSize(68));
}
 
Example #12
Source File: CompressPayloadTest.java    From gcp-ingestion with Mozilla Public License 2.0
@Test
public void testMaxCompressedBytes() {
  String text = StringUtils.repeat("Lorem ipsum dolor sit amet ", 100);
  int expectedCompressedSize = 68;
  CompressPayload transform = CompressPayload.of(StaticValueProvider.of(Compression.GZIP))
      .withMaxCompressedBytes(expectedCompressedSize - 1);
  PubsubMessage truncated = transform
      .compress(new PubsubMessage(text.getBytes(StandardCharsets.UTF_8), new HashMap<>()));
  assertThat(ArrayUtils.toObject(truncated.getPayload()), Matchers.arrayWithSize(50));
}
 
Example #13
Source File: PubsubIntegrationTest.java    From gcp-ingestion with Mozilla Public License 2.0
@Test(timeout = 30000)
public void canSendPubsubErrorOutput() throws Exception {
  final List<String> inputLines = Lines
      .resources("testdata/pubsub-integration/error-input.ndjson");

  pipeline.getOptions().as(DirectOptions.class).setBlockOnRun(false);

  SinkOptions.Parsed sinkOptions = pipeline.getOptions().as(SinkOptions.Parsed.class);
  sinkOptions.setInput(pipeline.newProvider("test input"));
  sinkOptions.setJobName("test job name");
  sinkOptions.setErrorOutput(pipeline.newProvider(topicName.toString()));
  // We would normally use pipeline.newProvider instead of StaticValueProvider in tests,
  // but something about this configuration causes the pipeline to stall when CompressPayload
  // accesses a method on the underlying enum value when defined via pipeline.newProvider.
  sinkOptions.setErrorOutputPubsubCompression(StaticValueProvider.of(Compression.UNCOMPRESSED));

  pipeline.apply(Create.of(inputLines)).apply(InputFileFormat.json.decode())
      .apply(ErrorOutputType.pubsub.write(sinkOptions));

  final PipelineResult result = pipeline.run();

  System.err.println("Waiting for subscriber to receive messages published in the pipeline...");
  List<String> expectedLines = Lines.resources("testdata/pubsub-integration/error-output.ndjson");
  List<String> received = receiveLines(expectedLines.size());
  assertThat(received, matchesInAnyOrder(expectedLines));
  result.cancel();
}
 
Example #14
Source File: BulkDecompressor.java    From DataflowTemplates with Apache License 2.0
/**
 * Decompresses the inputFile using the specified compression and outputs to the main output of
 * the {@link Decompress} doFn. Files output to the destination will be first written as temp
 * files with a "temp-" prefix within the output directory. If a file fails decompression, the
 * filename and the associated error will be output to the dead-letter.
 *
 * @param inputFile The inputFile to decompress.
 * @return A {@link ResourceId} which points to the resulting file from the decompression.
 */
private ResourceId decompress(ResourceId inputFile) throws IOException {
  // Remove the compressed extension from the file. Example: demo.txt.gz -> demo.txt
  String outputFilename = Files.getNameWithoutExtension(inputFile.toString());

  // Resolve the necessary resources to perform the transfer.
  ResourceId outputDir = FileSystems.matchNewResource(destinationLocation.get(), true);
  ResourceId outputFile =
      outputDir.resolve(outputFilename, StandardResolveOptions.RESOLVE_FILE);
  ResourceId tempFile =
      outputDir.resolve(Files.getFileExtension(inputFile.toString())
          + "-temp-" + outputFilename, StandardResolveOptions.RESOLVE_FILE);

  // Resolve the compression
  Compression compression = Compression.detect(inputFile.toString());

  // Perform the copy of the decompressed channel into the destination.
  try (ReadableByteChannel readerChannel =
      compression.readDecompressed(FileSystems.open(inputFile))) {
    try (WritableByteChannel writerChannel = FileSystems.create(tempFile, MimeTypes.TEXT)) {
      ByteStreams.copy(readerChannel, writerChannel);
    }

    // Rename the temp file to the output file.
    FileSystems.rename(
        ImmutableList.of(tempFile),
        ImmutableList.of(outputFile),
        MoveOptions.StandardMoveOptions.IGNORE_MISSING_FILES);
  } catch (IOException e) {
    String msg = e.getMessage();

    LOG.error("Error occurred during decompression of {}", inputFile.toString(), e);
    throw new IOException(sanitizeDecompressionErrorMsg(msg, inputFile, compression));
  }

  return outputFile;
}
 
Example #15
Source File: BulkCompressor.java    From DataflowTemplates with Apache License 2.0
@ProcessElement
public void processElement(ProcessContext context) {
  ResourceId inputFile = context.element().resourceId();
  Compression compression = compressionValue.get();

  // Add the compression extension to the output filename. Example: demo.txt -> demo.txt.gz
  String outputFilename = inputFile.getFilename() + compression.getSuggestedSuffix();

  // Resolve the necessary resources to perform the transfer
  ResourceId outputDir = FileSystems.matchNewResource(destinationLocation.get(), true);
  ResourceId outputFile =
      outputDir.resolve(outputFilename, StandardResolveOptions.RESOLVE_FILE);
  ResourceId tempFile =
      outputDir.resolve("temp-" + outputFilename, StandardResolveOptions.RESOLVE_FILE);

  // Perform the copy of the compressed channel to the destination.
  try (ReadableByteChannel readerChannel = FileSystems.open(inputFile)) {
    try (WritableByteChannel writerChannel =
        compression.writeCompressed(FileSystems.create(tempFile, MimeTypes.BINARY))) {

      // Execute the copy to the temporary file
      ByteStreams.copy(readerChannel, writerChannel);
    }

    // Rename the temporary file to the output file
    FileSystems.rename(ImmutableList.of(tempFile), ImmutableList.of(outputFile));

    // Output the path to the uncompressed file
    context.output(outputFile.toString());
  } catch (IOException e) {
    LOG.error("Error occurred during compression of {}", inputFile.toString(), e);
    context.output(DEADLETTER_TAG, KV.of(inputFile.toString(), e.getMessage()));
  }
}
 
Example #16
Source File: BulkDecompressorTest.java    From DataflowTemplates with Apache License 2.0
@BeforeClass
public static void setupClass() throws IOException {
  Path tempFolderRootPath = tempFolder.getRoot().toPath();
  tempFolderOutputPath = tempFolder.newFolder("output").toPath();

  // Test files
  compressedFile =
      TestUtils.writeToFile(
          tempFolderRootPath
              .resolve(FILE_BASE_NAME + Compression.GZIP.getSuggestedSuffix())
              .toString(),
          FILE_CONTENT,
          Compression.GZIP);

  wrongCompressionExtFile =
      TestUtils.writeToFile(
          tempFolderRootPath
              .resolve(FILE_BASE_NAME + Compression.DEFLATE.getSuggestedSuffix())
              .toString(),
          FILE_CONTENT,
          Compression.BZIP2);

  uncompressedFile =
      TestUtils.writeToFile(
          tempFolderRootPath
              .resolve(FILE_BASE_NAME + Compression.BZIP2.getSuggestedSuffix())
              .toString(),
          FILE_CONTENT,
          Compression.UNCOMPRESSED);

  unknownCompressionFile =
      TestUtils.writeToFile(
          tempFolderRootPath.resolve(FILE_BASE_NAME).toString(),
          FILE_CONTENT,
          Compression.UNCOMPRESSED);
}
 
Example #17
Source File: JdbcAvroIO.java    From dbeam with Apache License 2.0
JdbcAvroSink(
    ValueProvider<ResourceId> filenamePrefix,
    DynamicAvroDestinations<UserT, Void, String> dynamicDestinations,
    JdbcAvroArgs jdbcAvroArgs) {
  super(filenamePrefix, dynamicDestinations, Compression.UNCOMPRESSED);
  this.dynamicDestinations = dynamicDestinations;
  this.jdbcAvroArgs = jdbcAvroArgs;
}
 
Example #18
Source File: TikaIO.java    From beam with Apache License 2.0
@Override
public PCollection<ParseResult> expand(PBegin input) {
  return input
      .apply(FileIO.match().filepattern(getFilepattern()))
      .apply(FileIO.readMatches().withCompression(Compression.UNCOMPRESSED))
      .apply(parseFiles());
}
 
Example #19
Source File: SnowflakeIO.java    From beam with Apache License 2.0
private PCollection<String> writeFiles(PCollection<T> input, String stagingBucketDir) {

      PCollection<String> mappedUserData =
          input
              .apply(
                  MapElements.via(
                      new SimpleFunction<T, Object[]>() {
                        @Override
                        public Object[] apply(T element) {
                          return getUserDataMapper().mapRow(element);
                        }
                      }))
              .apply("Map Objects array to CSV lines", ParDo.of(new MapObjectsArrayToCsvFn()))
              .setCoder(StringUtf8Coder.of());

      WriteFilesResult filesResult =
          mappedUserData.apply(
              "Write files to specified location",
              FileIO.<String>write()
                  .via(TextIO.sink())
                  .to(stagingBucketDir)
                  .withPrefix(getFileNameTemplate())
                  .withSuffix(".csv")
                  .withCompression(Compression.GZIP));

      return (PCollection)
          filesResult
              .getPerDestinationOutputFilenames()
              .apply("Parse KV filenames to Strings", Values.<String>create());
    }
 
Example #20
Source File: Transforms.java    From nomulus with Apache License 2.0
/**
 * Returns a {@link PTransform} from file {@link Metadata} to {@link VersionedEntity} using
 * caller-provided {@code transformer}.
 */
static PTransform<PCollection<Metadata>, PCollection<VersionedEntity>> processFiles(
    DoFn<ReadableFile, VersionedEntity> transformer) {
  return new PTransform<PCollection<Metadata>, PCollection<VersionedEntity>>() {
    @Override
    public PCollection<VersionedEntity> expand(PCollection<Metadata> input) {
      return input
          .apply(FileIO.readMatches().withCompression(Compression.UNCOMPRESSED))
          .apply(transformer.getClass().getSimpleName(), ParDo.of(transformer));
      // TODO(weiminyu): reshuffle to enable dynamic work rebalance per beam dev guide
    }
  };
}
 
Example #21
Source File: BeamInputTransform.java    From hop with Apache License 2.0
@Override public PCollection<HopRow> expand( PBegin input ) {

    try {
      // Only initialize once on this node/vm
      //
      BeamHop.init(transformPluginClasses, xpPluginClasses);

      // System.out.println("-------------- TextIO.Read from "+inputLocation+" (UNCOMPRESSED)");

      TextIO.Read ioRead = TextIO.read()
        .from( inputLocation )
        .withCompression( Compression.UNCOMPRESSED )
        ;

      StringToHopFn stringToHopFn = new StringToHopFn( transformName, rowMetaJson, separator, transformPluginClasses, xpPluginClasses );

      PCollection<HopRow> output = input

        // We read a bunch of Strings, one per line basically
        //
        .apply( transformName + " READ FILE",  ioRead )

        // We need to transform these lines into Hop fields
        //
        .apply( transformName, ParDo.of( stringToHopFn ) );

      return output;

    } catch ( Exception e ) {
      numErrors.inc();
      LOG.error( "Error in beam input transform", e );
      throw new RuntimeException( "Error in beam input transform", e );
    }

  }
 
Example #22
Source File: Write.java    From gcp-ingestion with Mozilla Public License 2.0
/** Public constructor. */
public FileOutput(ValueProvider<String> outputPrefix, OutputFileFormat format,
    Duration windowDuration, ValueProvider<Integer> numShards, Compression compression,
    InputType inputType) {
  this.outputPrefix = outputPrefix;
  this.format = format;
  this.windowDuration = windowDuration;
  this.numShards = numShards;
  this.compression = compression;
  this.inputType = inputType;
}
 
Example #23
Source File: UtilTest.java    From dlp-dataflow-deidentification with Apache License 2.0
@Test
public void testGetReader() throws IOException {
  Path firstPath = tmpFolder.newFile("first").toPath();
  int firstSize = 37;
  Files.write(firstPath, new byte[firstSize]);

  ValueProvider<String> testValueProvider = null;
  PCollection<String> br =
      p.apply(FileIO.match().filepattern(tmpFolder.getRoot().getAbsolutePath() + "/*"))
          .apply(FileIO.readMatches().withCompression(Compression.UNCOMPRESSED))
          .apply(
              ParDo.of(
                  new DoFn<FileIO.ReadableFile, String>() {
                    @ProcessElement
                    public void processElement(
                        @Element FileIO.ReadableFile f, OutputReceiver<String> out)
                        throws IOException {
                      out.output(
                          Util.getReader(
                                  false,
                                  "object_name",
                                  "bucket_name",
                                  f,
                                  "key_name",
                                  testValueProvider)
                              .readLine());
                    }
                  }));
  p.run();
  assertNotNull(br);
}
 
Example #24
Source File: BeamInputTransform.java    From kettle-beam with Apache License 2.0
@Override public PCollection<KettleRow> expand( PBegin input ) {

    try {
      // Only initialize once on this node/vm
      //
      BeamKettle.init(stepPluginClasses, xpPluginClasses);

      // System.out.println("-------------- TextIO.Read from "+inputLocation+" (UNCOMPRESSED)");

      TextIO.Read ioRead = TextIO.read()
        .from( inputLocation )
        .withCompression( Compression.UNCOMPRESSED )
        ;

      StringToKettleFn stringToKettleFn = new StringToKettleFn( stepname, rowMetaJson, separator, stepPluginClasses, xpPluginClasses );

      PCollection<KettleRow> output = input

        // We read a bunch of Strings, one per line basically
        //
        .apply( stepname + " READ FILE",  ioRead )

        // We need to transform these lines into Kettle fields
        //
        .apply( stepname, ParDo.of( stringToKettleFn ) );

      return output;

    } catch ( Exception e ) {
      numErrors.inc();
      LOG.error( "Error in beam input transform", e );
      throw new RuntimeException( "Error in beam input transform", e );
    }

  }
 
Example #25
Source File: CompressPayload.java    From gcp-ingestion with Mozilla Public License 2.0
@VisibleForTesting
static byte[] compress(byte[] payload, Compression compression) {
  ByteArrayOutputStream out = new ByteArrayOutputStream();
  // We use a try-with-resources statement to ensure everything gets closed appropriately.
  try (ReadableByteChannel inChannel = Channels.newChannel(new ByteArrayInputStream(payload));
      WritableByteChannel outChannel = compression.writeCompressed(Channels.newChannel(out))) {
    ByteStreams.copy(inChannel, outChannel);
  } catch (IOException e) {
    return payload;
  }
  return out.toByteArray();
}
 
Example #26
Source File: Write.java    From gcp-ingestion with Mozilla Public License 2.0
/** Public constructor. */
public AvroOutput(ValueProvider<String> outputPrefix, Duration windowDuration,
    ValueProvider<Integer> numShards, Compression compression, InputType inputType,
    ValueProvider<String> schemasLocation) {
  this.outputPrefix = outputPrefix;
  this.windowDuration = windowDuration;
  this.numShards = numShards;
  this.compression = compression;
  this.inputType = inputType;
  this.schemasLocation = schemasLocation;
  this.pathTemplate = NestedValueProvider.of(outputPrefix, DynamicPathTemplate::new);
}
 
Example #27
Source File: SinkOptions.java    From gcp-ingestion with Mozilla Public License 2.0
@Description("Compression format for --outputType=file")
@Default.Enum("GZIP")
Compression getOutputFileCompression();
 
Example #28
Source File: S3Import.java    From dlp-dataflow-deidentification with Apache License 2.0
public static void main(String[] args) {
  S3ImportOptions options =
      PipelineOptionsFactory.fromArgs(args).withValidation().as(S3ImportOptions.class);

  AWSOptionParser.formatOptions(options);

  Pipeline p = Pipeline.create(options);
  // s3
  PCollection<KV<String, ReadableFile>> s3Files =
      p.apply(
              "Poll S3 Files",
              FileIO.match()
                  .filepattern(options.getS3BucketUrl())
                  .continuously(DEFAULT_POLL_INTERVAL, Watch.Growth.never()))
          .apply("S3 File Match", FileIO.readMatches().withCompression(Compression.AUTO))
          .apply(
              "Add S3 File Name as Key",
              WithKeys.of(file -> file.getMetadata().resourceId().getFilename().toString()))
          .setCoder(KvCoder.of(StringUtf8Coder.of(), ReadableFileCoder.of()));

  // gcs files
  PCollection<KV<String, ReadableFile>> gcsFiles =
      p.apply(
              "Poll GCS Files",
              FileIO.match()
                  .filepattern(options.getGcsBucketUrl())
                  .continuously(DEFAULT_POLL_INTERVAL, Watch.Growth.never()))
          .apply("GCS File Match", FileIO.readMatches().withCompression(Compression.AUTO))
          .apply(
              "Add GCS File Name as Key",
              WithKeys.of(file -> file.getMetadata().resourceId().getFilename().toString()))
          .setCoder(KvCoder.of(StringUtf8Coder.of(), ReadableFileCoder.of()));

  PCollection<KV<String, ReadableFile>> files =
      PCollectionList.of(ImmutableList.of(gcsFiles, s3Files))
          .apply("File List", Flatten.pCollections())
          .apply(
              "Fixed Window",
              Window.<KV<String, ReadableFile>>into(FixedWindows.of(WINDOW_INTERVAL))
                  .triggering(AfterWatermark.pastEndOfWindow())
                  .discardingFiredPanes()
                  .withAllowedLateness(Duration.ZERO));

  PCollectionTuple contents =
      files.apply(
          "Read File Contents",
          ParDo.of(new TextFileReader())
              .withOutputTags(
                  textReaderSuccessElements, TupleTagList.of(textReaderFailedElements)));

  PCollectionTuple inspectedContents =
      contents
          .get(textReaderSuccessElements)
          .apply(
              "DLP Inspection",
              ParDo.of(new TokenizeData(options.getProject(), options.getInspectTemplateName()))
                  .withOutputTags(
                      apiResponseSuccessElements, TupleTagList.of(apiResponseFailedElements)));

  inspectedContents
      .get(apiResponseSuccessElements)
      .apply(
          "BQ Write",
          BigQueryIO.<KV<String, TableRow>>write()
              .to(new BQDestination(options.getDataSetId(), options.getProject()))
              .withFormatFunction(
                  element -> {
                    return element.getValue();
                  })
              .withWriteDisposition(BigQueryIO.Write.WriteDisposition.WRITE_APPEND)
              .withoutValidation()
              .withFailedInsertRetryPolicy(InsertRetryPolicy.retryTransientErrors())
              .withCreateDisposition(BigQueryIO.Write.CreateDisposition.CREATE_IF_NEEDED));

  PCollectionList.of(
          ImmutableList.of(
              contents.get(textReaderFailedElements),
              inspectedContents.get(apiResponseFailedElements)))
      .apply("Combine Error Logs", Flatten.pCollections())
      .apply(
          "Write Error Logs",
          ParDo.of(
              new DoFn<String, String>() {
                @ProcessElement
                public void processElement(ProcessContext c) {
                  LOG.error("***ERROR*** {}", c.element().toString());
                  c.output(c.element());
                }
              }));

  p.run();
}
 
Example #29
Source File: CSVStreamingPipeline.java    From dlp-dataflow-deidentification with Apache License 2.0
@SuppressWarnings("serial")
public static void doTokenization(TokenizePipelineOptions options) {
  Pipeline p = Pipeline.create(options);

  PCollection<KV<String, List<String>>> filesAndContents =
      p.apply(
              FileIO.match()
                  .filepattern(options.getInputFile())
                  .continuously(
                      Duration.standardSeconds(options.getPollingInterval()),
                      Watch.Growth.never()))
          .apply(FileIO.readMatches().withCompression(Compression.UNCOMPRESSED))
          .apply(
              "FileHandler",
              ParDo.of(
                  new CSVReader(
                      options.getCsek(),
                      options.getCsekhash(),
                      options.getFileDecryptKeyName(),
                      options.getFileDecryptKey(),
                      options.as(GcpOptions.class).getProject(),
                      options.getBatchSize())));

  PCollection<KV<String, Table>> dlpTables =
      filesAndContents.apply(
          "ContentHandler", ParDo.of(new CSVContentProcessorDoFn(options.getBatchSize())));

  PCollection<Row> dlpRows =
      dlpTables
          .apply(
              "DoDLPTokenization",
              ParDo.of(
                  new DLPTokenizationDoFn(
                      options.as(GcpOptions.class).getProject(),
                      options.getDeidentifyTemplateName(),
                      options.getInspectTemplateName())))
          .apply(
              Window.<Row>into(FixedWindows.of(Duration.standardSeconds(options.getInterval())))
                  .triggering(
                      AfterProcessingTime.pastFirstElementInPane()
                          .plusDelayOf(Duration.standardMinutes(1)))
                  .discardingFiredPanes()
                  .withAllowedLateness(Duration.standardMinutes(1)));

  dlpRows.apply(
      "WriteToBQ",
      BigQueryIO.<Row>write()
          .to(new BQDestination(options.getDataset(), options.as(GcpOptions.class).getProject()))
          .withFormatFunction(new BQTableRowSF())
          .withWriteDisposition(BigQueryIO.Write.WriteDisposition.WRITE_APPEND)
          .withCreateDisposition(BigQueryIO.Write.CreateDisposition.CREATE_IF_NEEDED));

  dlpRows
      .apply(
          MapElements.via(
              new SimpleFunction<Row, KV<String, Row>>() {
                @Override
                public KV<String, Row> apply(Row row) {
                  return KV.of(row.getTableId(), row);
                }
              }))
      .apply(GroupByKey.<String, Row>create())
      .apply(
          "WriteToGCS",
          FileIO.<String, KV<String, Iterable<Row>>>writeDynamic()
              .by(
                  (SerializableFunction<KV<String, Iterable<Row>>, String>)
                      row -> {
                        return row.getKey();
                      })
              .via(new CSVSink())
              .to(options.getOutputFile())
              .withDestinationCoder(StringUtf8Coder.of())
              .withNumShards(1)
              .withNaming(key -> FileIO.Write.defaultNaming(key, ".csv")));

  p.run();
}
 
Example #30
Source File: XmlIO.java    From beam with Apache License 2.0
/** Decompresses all input files using the specified compression type. */
public Read<T> withCompression(Compression compression) {
  return toBuilder().setCompression(compression).build();
}