org.apache.beam.sdk.transforms.Watch Java Examples

The following examples show how to use org.apache.beam.sdk.transforms.Watch. Each example is taken from an open-source project; the source file and license are noted above each snippet.
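As a quick orientation before the examples, here is a minimal sketch of the most common pattern: watching a filepattern for new files with a poll interval and a Watch.Growth termination condition. The bucket path, poll interval, and termination duration below are placeholder values, not taken from any of the projects above.

import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.io.TextIO;
import org.apache.beam.sdk.transforms.Watch;
import org.apache.beam.sdk.values.PCollection;
import org.joda.time.Duration;

public class WatchSketch {
  public static void main(String[] args) {
    Pipeline pipeline = Pipeline.create();

    PCollection<String> lines =
        pipeline.apply(
            TextIO.read()
                .from("gs://my-bucket/input/*.csv") // placeholder filepattern
                .watchForNewFiles(
                    // Poll the filepattern every 30 seconds for new files.
                    Duration.standardSeconds(30),
                    // Stop watching once no new file has appeared for an hour;
                    // Watch.Growth.never() would instead keep watching forever.
                    Watch.Growth.afterTimeSinceNewOutput(Duration.standardHours(1))));

    pipeline.run();
  }
}

The examples below show the same Watch.Growth conditions (never(), afterTimeSinceNewOutput(), afterTotalOf()) applied to TextIO, FileIO, and HCatalogIO.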
Example #1
Source File: TextToPubsubStream.java    From DataflowTemplates with Apache License 2.0
/**
 * Executes the pipeline with the provided execution
 * parameters.
 *
 * @param options The execution parameters.
 */
public static PipelineResult run(Options options) {
  // Create the pipeline.
  Pipeline pipeline = Pipeline.create(options);

  /*
   * Steps:
   *  1) Read from the text source.
   *  2) Write each text record to Pub/Sub
   */
  pipeline
    .apply(
      "Read Text Data",
      TextIO.read()
        .from(options.getInputFilePattern())
        .watchForNewFiles(DEFAULT_POLL_INTERVAL, Watch.Growth.never()))
    .apply("Write to PubSub", PubsubIO.writeStrings().to(options.getOutputTopic()));

  return pipeline.run();
}
 
Example #2
Source File: FileIO.java    From beam with Apache License 2.0
@Override
public PCollection<MatchResult.Metadata> expand(PCollection<String> input) {
  PCollection<MatchResult.Metadata> res;
  if (getConfiguration().getWatchInterval() == null) {
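    // Bounded case: each filepattern is matched once.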
    res =
        input.apply(
            "Match filepatterns",
            ParDo.of(new MatchFn(getConfiguration().getEmptyMatchTreatment())));
  } else {
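    // Streaming case: Watch repeatedly polls each filepattern for new matches
    // until the configured termination condition fires.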
    res =
        input
            .apply(
                "Continuously match filepatterns",
                Watch.growthOf(
                        Contextful.of(new MatchPollFn(), Requirements.empty()),
                        new ExtractFilenameFn())
                    .withPollInterval(getConfiguration().getWatchInterval())
                    .withTerminationPerInput(getConfiguration().getWatchTerminationCondition()))
            .apply(Values.create());
  }
  return res.apply(Reshuffle.viaRandomKey());
}
 
Example #3
Source File: HCatalogIO.java    From beam with Apache License 2.0
@Override
@SuppressWarnings("deprecation")
public PCollection<HCatRecord> expand(PBegin input) {
  checkArgument(getTable() != null, "withTable() is required");
  checkArgument(getConfigProperties() != null, "withConfigProperties() is required");
  Watch.Growth<Read, Integer, Integer> growthFn;
  if (getPollingInterval() != null) {
    growthFn = Watch.growthOf(new PartitionPollerFn()).withPollInterval(getPollingInterval());
    if (getTerminationCondition() != null) {
      growthFn = growthFn.withTerminationPerInput(getTerminationCondition());
    }
    return input
        .apply("ConvertToReadRequest", Create.of(this))
        .apply("WatchForNewPartitions", growthFn)
        .apply("PartitionReader", ParDo.of(new PartitionReaderFn(getConfigProperties())));
  } else {
    // Treat as Bounded
    checkArgument(
        getTerminationCondition() == null,
        "withTerminationCondition() is not required when using in bounded reads mode");
    return input.apply(org.apache.beam.sdk.io.Read.from(new BoundedHCatalogSource(this)));
  }
}
 
Example #4
Source File: TextStreamingPipeline.java    From dlp-dataflow-deidentification with Apache License 2.0
public static void main(String[] args) throws IOException, GeneralSecurityException {

    TokenizePipelineOptions options =
        PipelineOptionsFactory.fromArgs(args).withValidation().as(TokenizePipelineOptions.class);

    Pipeline p = Pipeline.create(options);
    p.apply(
            FileIO.match()
                .filepattern(options.getInputFile())
                .continuously(
                    Duration.standardSeconds(options.getPollingInterval()), Watch.Growth.never()))
        .apply(FileIO.readMatches().withCompression(Compression.UNCOMPRESSED))
        .apply(
            "Text File Reader",
            ParDo.of(
                new TextFileReader(
                    options.as(GcpOptions.class).getProject(),
                    options.getFileDecryptKeyName(),
                    options.getFileDecryptKey(),
                    options.getBatchSize(),
                    options.getCsek(),
                    options.getCsekhash())))
        .apply(
            "Tokenize Data",
            ParDo.of(
                new TokenizeData(
                    options.as(GcpOptions.class).getProject(),
                    options.getDeidentifyTemplateName(),
                    options.getInspectTemplateName())))
        .apply(
            Window.<String>into(FixedWindows.of(Duration.standardMinutes(options.getInterval()))))
        .apply(new WriteOneFilePerWindow(options.getOutputFile(), 1));

    p.run();
}
 
Example #5
Source File: FileIO.java    From beam with Apache License 2.0
@Override
public Watch.Growth.PollResult<MatchResult.Metadata> apply(String element, Context c)
    throws Exception {
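  // Report the current matches as an incomplete PollResult so that Watch keeps
  // polling; withWatermark(now) lets the output watermark advance between polls.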
  Instant now = Instant.now();
  return Watch.Growth.PollResult.incomplete(
          now, FileSystems.match(element, EmptyMatchTreatment.ALLOW).metadata())
      .withWatermark(now);
}
 
Example #6
Source File: TextIOReadTest.java    From beam with Apache License 2.0
@Test
@Category({NeedsRunner.class, UsesUnboundedSplittableParDo.class})
public void testReadWatchForNewFiles() throws IOException, InterruptedException {
  final Path basePath = tempFolder.getRoot().toPath().resolve("readWatch");
  basePath.toFile().mkdir();

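  // Generate 0..9 and write the elements as windowed text files under basePath.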
  p.apply(GenerateSequence.from(0).to(10).withRate(1, Duration.millis(100)))
      .apply(
          Window.<Long>into(FixedWindows.of(Duration.millis(150)))
              .withAllowedLateness(Duration.ZERO)
              .triggering(Repeatedly.forever(AfterPane.elementCountAtLeast(1)))
              .discardingFiredPanes())
      .apply(ToString.elements())
      .apply(
          TextIO.write()
              .to(basePath.resolve("data").toString())
              .withNumShards(1)
              .withWindowedWrites());

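  // Read the same directory continuously; the watch terminates once no new file
  // has appeared for 3 seconds.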
  PCollection<String> lines =
      p.apply(
          TextIO.read()
              .from(basePath.resolve("*").toString())
              .watchForNewFiles(
                  Duration.millis(100),
                  Watch.Growth.afterTimeSinceNewOutput(Duration.standardSeconds(3))));

  PAssert.that(lines).containsInAnyOrder("0", "1", "2", "3", "4", "5", "6", "7", "8", "9");
  p.run();
}
 
Example #7
Source File: S3Import.java    From dlp-dataflow-deidentification with Apache License 2.0
public static void main(String[] args) {
  S3ImportOptions options =
      PipelineOptionsFactory.fromArgs(args).withValidation().as(S3ImportOptions.class);

  AWSOptionParser.formatOptions(options);

  Pipeline p = Pipeline.create(options);
  // s3
  PCollection<KV<String, ReadableFile>> s3Files =
      p.apply(
              "Poll S3 Files",
              FileIO.match()
                  .filepattern(options.getS3BucketUrl())
                  .continuously(DEFAULT_POLL_INTERVAL, Watch.Growth.never()))
          .apply("S3 File Match", FileIO.readMatches().withCompression(Compression.AUTO))
          .apply(
              "Add S3 File Name as Key",
              WithKeys.of(file -> file.getMetadata().resourceId().getFilename().toString()))
          .setCoder(KvCoder.of(StringUtf8Coder.of(), ReadableFileCoder.of()));

  // gcs files
  PCollection<KV<String, ReadableFile>> gcsFiles =
      p.apply(
              "Poll GCS Files",
              FileIO.match()
                  .filepattern(options.getGcsBucketUrl())
                  .continuously(DEFAULT_POLL_INTERVAL, Watch.Growth.never()))
          .apply("GCS File Match", FileIO.readMatches().withCompression(Compression.AUTO))
          .apply(
              "Add GCS File Name as Key",
              WithKeys.of(file -> file.getMetadata().resourceId().getFilename().toString()))
          .setCoder(KvCoder.of(StringUtf8Coder.of(), ReadableFileCoder.of()));

  PCollection<KV<String, ReadableFile>> files =
      PCollectionList.of(ImmutableList.of(gcsFiles, s3Files))
          .apply("File List", Flatten.pCollections())
          .apply(
              "Fixed Window",
              Window.<KV<String, ReadableFile>>into(FixedWindows.of(WINDOW_INTERVAL))
                  .triggering(AfterWatermark.pastEndOfWindow())
                  .discardingFiredPanes()
                  .withAllowedLateness(Duration.ZERO));

  PCollectionTuple contents =
      files.apply(
          "Read File Contents",
          ParDo.of(new TextFileReader())
              .withOutputTags(
                  textReaderSuccessElements, TupleTagList.of(textReaderFailedElements)));

  PCollectionTuple inspectedContents =
      contents
          .get(textReaderSuccessElements)
          .apply(
              "DLP Inspection",
              ParDo.of(new TokenizeData(options.getProject(), options.getInspectTemplateName()))
                  .withOutputTags(
                      apiResponseSuccessElements, TupleTagList.of(apiResponseFailedElements)));

  inspectedContents
      .get(apiResponseSuccessElements)
      .apply(
          "BQ Write",
          BigQueryIO.<KV<String, TableRow>>write()
              .to(new BQDestination(options.getDataSetId(), options.getProject()))
              .withFormatFunction(
                  element -> {
                    return element.getValue();
                  })
              .withWriteDisposition(BigQueryIO.Write.WriteDisposition.WRITE_APPEND)
              .withoutValidation()
              .withFailedInsertRetryPolicy(InsertRetryPolicy.retryTransientErrors())
              .withCreateDisposition(BigQueryIO.Write.CreateDisposition.CREATE_IF_NEEDED));

  PCollectionList.of(
          ImmutableList.of(
              contents.get(textReaderFailedElements),
              inspectedContents.get(apiResponseFailedElements)))
      .apply("Combine Error Logs", Flatten.pCollections())
      .apply(
          "Write Error Logs",
          ParDo.of(
              new DoFn<String, String>() {
                @ProcessElement
                public void processElement(ProcessContext c) {
                  LOG.error("***ERROR*** {}", c.element().toString());
                  c.output(c.element());
                }
              }));

  p.run();
}
 
Example #8
Source File: CSVStreamingPipeline.java    From dlp-dataflow-deidentification with Apache License 2.0
@SuppressWarnings("serial")
public static void doTokenization(TokenizePipelineOptions options) {
  Pipeline p = Pipeline.create(options);

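  // Continuously match the input filepattern at the configured polling interval
  // (never terminating the watch) and read each matched file uncompressed.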
  PCollection<KV<String, List<String>>> filesAndContents =
      p.apply(
              FileIO.match()
                  .filepattern(options.getInputFile())
                  .continuously(
                      Duration.standardSeconds(options.getPollingInterval()),
                      Watch.Growth.never()))
          .apply(FileIO.readMatches().withCompression(Compression.UNCOMPRESSED))
          .apply(
              "FileHandler",
              ParDo.of(
                  new CSVReader(
                      options.getCsek(),
                      options.getCsekhash(),
                      options.getFileDecryptKeyName(),
                      options.getFileDecryptKey(),
                      options.as(GcpOptions.class).getProject(),
                      options.getBatchSize())));

  PCollection<KV<String, Table>> dlpTables =
      filesAndContents.apply(
          "ContentHandler", ParDo.of(new CSVContentProcessorDoFn(options.getBatchSize())));

  PCollection<Row> dlpRows =
      dlpTables
          .apply(
              "DoDLPTokenization",
              ParDo.of(
                  new DLPTokenizationDoFn(
                      options.as(GcpOptions.class).getProject(),
                      options.getDeidentifyTemplateName(),
                      options.getInspectTemplateName())))
          .apply(
              Window.<Row>into(FixedWindows.of(Duration.standardSeconds(options.getInterval())))
                  .triggering(
                      AfterProcessingTime.pastFirstElementInPane()
                          .plusDelayOf(Duration.standardMinutes(1)))
                  .discardingFiredPanes()
                  .withAllowedLateness(Duration.standardMinutes(1)));

  dlpRows.apply(
      "WriteToBQ",
      BigQueryIO.<Row>write()
          .to(new BQDestination(options.getDataset(), options.as(GcpOptions.class).getProject()))
          .withFormatFunction(new BQTableRowSF())
          .withWriteDisposition(BigQueryIO.Write.WriteDisposition.WRITE_APPEND)
          .withCreateDisposition(BigQueryIO.Write.CreateDisposition.CREATE_IF_NEEDED));

  dlpRows
      .apply(
          MapElements.via(
              new SimpleFunction<Row, KV<String, Row>>() {
                @Override
                public KV<String, Row> apply(Row row) {
                  return KV.of(row.getTableId(), row);
                }
              }))
      .apply(GroupByKey.<String, Row>create())
      .apply(
          "WriteToGCS",
          FileIO.<String, KV<String, Iterable<Row>>>writeDynamic()
              .by(
                  (SerializableFunction<KV<String, Iterable<Row>>, String>)
                      row -> {
                        return row.getKey();
                      })
              .via(new CSVSink())
              .to(options.getOutputFile())
              .withDestinationCoder(StringUtf8Coder.of())
              .withNumShards(1)
              .withNaming(key -> FileIO.Write.defaultNaming(key, ".csv")));

  p.run();
}
 
Example #9
Source File: Snippets.java    From beam with Apache License 2.0
public static void fileProcessPattern() throws Exception {
  Pipeline p = Pipeline.create();

  // [START FileProcessPatternProcessNewFilesSnip1]
  // This produces PCollection<MatchResult.Metadata>
  p.apply(
      FileIO.match()
          .filepattern("...")
          .continuously(
              Duration.standardSeconds(30),
              Watch.Growth.afterTimeSinceNewOutput(Duration.standardHours(1))));
  // [END FileProcessPatternProcessNewFilesSnip1]

  // [START FileProcessPatternProcessNewFilesSnip2]
  // This produces PCollection<String>
  p.apply(
      TextIO.read()
          .from("<path-to-files>/*")
          .watchForNewFiles(
              // Check for new files every minute.
              Duration.standardMinutes(1),
              // Stop watching the file pattern if no new files appear for an hour.
              Watch.Growth.afterTimeSinceNewOutput(Duration.standardHours(1))));
  // [END FileProcessPatternProcessNewFilesSnip2]

  // [START FileProcessPatternAccessMetadataSnip1]
  p.apply(FileIO.match().filepattern("hdfs://path/to/*.gz"))
      // The withCompression method is optional. By default, the Beam SDK detects compression from
      // the filename.
      .apply(FileIO.readMatches().withCompression(Compression.GZIP))
      .apply(
          ParDo.of(
              new DoFn<FileIO.ReadableFile, String>() {
                @ProcessElement
                public void process(@Element FileIO.ReadableFile file) {
                  // We can now access the file and its metadata.
                  LOG.info("File Metadata resourceId is {} ", file.getMetadata().resourceId());
                }
              }));
  // [END FileProcessPatternAccessMetadataSnip1]

}
 
Example #10
Source File: FileIOTest.java    From beam with Apache License 2.0
@Test
@Category({NeedsRunner.class, UsesUnboundedSplittableParDo.class})
public void testMatchWatchForNewFiles() throws IOException, InterruptedException {
  // Write some files to a "source" directory.
  final Path sourcePath = tmpFolder.getRoot().toPath().resolve("source");
  sourcePath.toFile().mkdir();
  Files.write(sourcePath.resolve("first"), new byte[42]);
  Files.write(sourcePath.resolve("second"), new byte[37]);
  Files.write(sourcePath.resolve("third"), new byte[99]);

  // Create a "watch" directory that the pipeline will copy files into.
  final Path watchPath = tmpFolder.getRoot().toPath().resolve("watch");
  watchPath.toFile().mkdir();
  PCollection<MatchResult.Metadata> matchMetadata =
      p.apply(
          FileIO.match()
              .filepattern(watchPath.resolve("*").toString())
              .continuously(
                  Duration.millis(100),
                  Watch.Growth.afterTimeSinceNewOutput(Duration.standardSeconds(3))));
  PCollection<MatchResult.Metadata> matchAllMetadata =
      p.apply(Create.of(watchPath.resolve("*").toString()))
          .apply(
              FileIO.matchAll()
                  .continuously(
                      Duration.millis(100),
                      Watch.Growth.afterTimeSinceNewOutput(Duration.standardSeconds(3))));
  assertEquals(PCollection.IsBounded.UNBOUNDED, matchMetadata.isBounded());
  assertEquals(PCollection.IsBounded.UNBOUNDED, matchAllMetadata.isBounded());

  // Copy the files to the "watch" directory, preserving the lastModifiedTime;
  // the COPY_ATTRIBUTES option ensures that we will at a minimum copy lastModifiedTime.
  CopyOption[] copyOptions = {StandardCopyOption.COPY_ATTRIBUTES};
  Thread writer =
      new Thread(
          () -> {
            try {
              Thread.sleep(1000);
              Files.copy(sourcePath.resolve("first"), watchPath.resolve("first"), copyOptions);
              Thread.sleep(300);
              Files.copy(sourcePath.resolve("second"), watchPath.resolve("second"), copyOptions);
              Thread.sleep(300);
              Files.copy(sourcePath.resolve("third"), watchPath.resolve("third"), copyOptions);
            } catch (IOException | InterruptedException e) {
              throw new RuntimeException(e);
            }
          });
  writer.start();

  // We fetch lastModifiedTime from the files in the "source" directory to avoid a race condition
  // with the writer thread.
  List<MatchResult.Metadata> expected =
      Arrays.asList(
          metadata(
              watchPath.resolve("first"), 42, lastModifiedMillis(sourcePath.resolve("first"))),
          metadata(
              watchPath.resolve("second"), 37, lastModifiedMillis(sourcePath.resolve("second"))),
          metadata(
              watchPath.resolve("third"), 99, lastModifiedMillis(sourcePath.resolve("third"))));
  PAssert.that(matchMetadata).containsInAnyOrder(expected);
  PAssert.that(matchAllMetadata).containsInAnyOrder(expected);
  p.run();

  writer.join();
}
 
Example #11
Source File: HCatalogIOTest.java    From beam with Apache License 2.0
/** Perform end-to-end test of Write-then-Read operation. */
@Test
@NeedsEmptyTestTablesForUnboundedReads
public void testWriteThenUnboundedReadSuccess() throws Exception {

  defaultPipeline
      .apply(Create.of(buildHCatRecords(TEST_RECORDS_COUNT)))
      .apply(
          HCatalogIO.write()
              .withConfigProperties(getConfigPropertiesAsMap(service.getHiveConf()))
              .withDatabase(TEST_DATABASE)
              .withTable(TEST_TABLE)
              .withPartition(getPartitions())
              .withBatchSize(512L));
  defaultPipeline.run();
  final ImmutableList<String> partitions = ImmutableList.of("load_date", "product_type");
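  // Unbounded read: poll for new partitions every 15 seconds and stop watching
  // after 60 seconds in total.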
  final PCollection<HCatRecord> data =
      readAfterWritePipeline
          .apply(
              "ReadData",
              HCatalogIO.read()
                  .withConfigProperties(getConfigPropertiesAsMap(service.getHiveConf()))
                  .withDatabase(TEST_DATABASE)
                  .withPartitionCols(partitions)
                  .withTable(TEST_TABLE)
                  .withPollingInterval(Duration.millis(15000))
                  .withTerminationCondition(Watch.Growth.afterTotalOf(Duration.millis(60000))))
          .setCoder((Coder) WritableCoder.of(DefaultHCatRecord.class));

  final PCollection<String> output =
      data.apply(
          ParDo.of(
              new DoFn<HCatRecord, String>() {
                @ProcessElement
                public void processElement(ProcessContext c) {
                  c.output(c.element().get(0).toString());
                }
              }));

  PAssert.that(output).containsInAnyOrder(getExpectedRecords(TEST_RECORDS_COUNT));
  readAfterWritePipeline.run();
}