org.apache.beam.sdk.io.ReadableFileCoder Java Examples

The following examples show how to use org.apache.beam.sdk.io.ReadableFileCoder. Each example notes the original project and source file it comes from.
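ReadableFileCoder is the Coder for FileIO.ReadableFile, the element type produced by FileIO.readMatches(). A minimal sketch of where it typically appears, assuming a hypothetical pipeline and file pattern (readMatches() already sets this coder by default, so the explicit setCoder call mainly matters when coder inference fails, as in the keyed examples below):

// Match files, open them as ReadableFile handles, and pin the coder explicitly.
// The file pattern is illustrative.
Pipeline pipeline = Pipeline.create();
PCollection<FileIO.ReadableFile> files =
    pipeline
        .apply(FileIO.match().filepattern("gs://my-bucket/*.csv"))
        .apply(FileIO.readMatches())
        .setCoder(ReadableFileCoder.of());
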
Example #1
Source File: FileShard.java    From DataflowTemplates with Apache License 2.0
@Override
public void encode(FileShard value, OutputStream os) throws IOException {
  // Write each field with its component coder: table name, file handle,
  // then the shard's byte range.
  StringUtf8Coder.of().encode(value.getTableName(), os);
  ReadableFileCoder.of().encode(value.getFile(), os);
  VarLongCoder.of().encode(value.getRange().getFrom(), os);
  VarLongCoder.of().encode(value.getRange().getTo(), os);
}
 
Example #2
Source File: FileShard.java    From DataflowTemplates with Apache License 2.0
@Override
public FileShard decode(InputStream is) throws IOException {
  // Read the fields back in the exact order they were encoded.
  String tableName = StringUtf8Coder.of().decode(is);
  ReadableFile file = ReadableFileCoder.of().decode(is);
  long from = VarLongCoder.of().decode(is);
  long to = VarLongCoder.of().decode(is);
  return new AutoValue_FileShard(tableName, file, new OffsetRange(from, to));
}
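
Examples #1 and #2 are the two halves of a custom coder for the template's FileShard value class. A minimal sketch of an enclosing wrapper, assuming AtomicCoder as the base class and FileShardCoder as the class name (both are illustrative, not necessarily the template's actual declaration):

static class FileShardCoder extends AtomicCoder<FileShard> {
  private static final FileShardCoder INSTANCE = new FileShardCoder();

  public static FileShardCoder of() {
    return INSTANCE;
  }

  @Override
  public void encode(FileShard value, OutputStream os) throws IOException {
    // Body as shown in Example #1.
    StringUtf8Coder.of().encode(value.getTableName(), os);
    ReadableFileCoder.of().encode(value.getFile(), os);
    VarLongCoder.of().encode(value.getRange().getFrom(), os);
    VarLongCoder.of().encode(value.getRange().getTo(), os);
  }

  @Override
  public FileShard decode(InputStream is) throws IOException {
    // Body as shown in Example #2.
    String tableName = StringUtf8Coder.of().decode(is);
    ReadableFile file = ReadableFileCoder.of().decode(is);
    long from = VarLongCoder.of().decode(is);
    long to = VarLongCoder.of().decode(is);
    return new AutoValue_FileShard(tableName, file, new OffsetRange(from, to));
  }
}

Wrapped this way, the coder can be passed to setCoder on a PCollection<FileShard> or registered as the class's default coder.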
 
Example #3
Source File: S3Import.java    From dlp-dataflow-deidentification with Apache License 2.0
public static void main(String[] args) {
  S3ImportOptions options =
      PipelineOptionsFactory.fromArgs(args).withValidation().as(S3ImportOptions.class);

  AWSOptionParser.formatOptions(options);

  Pipeline p = Pipeline.create(options);
  // s3 files
  PCollection<KV<String, ReadableFile>> s3Files =
      p.apply(
              "Poll S3 Files",
              FileIO.match()
                  .filepattern(options.getS3BucketUrl())
                  .continuously(DEFAULT_POLL_INTERVAL, Watch.Growth.never()))
          .apply("S3 File Match", FileIO.readMatches().withCompression(Compression.AUTO))
          .apply(
              "Add S3 File Name as Key",
              WithKeys.of(file -> file.getMetadata().resourceId().getFilename().toString()))
          .setCoder(KvCoder.of(StringUtf8Coder.of(), ReadableFileCoder.of()));

  // gcs files
  PCollection<KV<String, ReadableFile>> gcsFiles =
      p.apply(
              "Poll GCS Files",
              FileIO.match()
                  .filepattern(options.getGcsBucketUrl())
                  .continuously(DEFAULT_POLL_INTERVAL, Watch.Growth.never()))
          .apply("GCS File Match", FileIO.readMatches().withCompression(Compression.AUTO))
          .apply(
              "Add GCS File Name as Key",
              WithKeys.of(file -> file.getMetadata().resourceId().getFilename().toString()))
          .setCoder(KvCoder.of(StringUtf8Coder.of(), ReadableFileCoder.of()));

  PCollection<KV<String, ReadableFile>> files =
      PCollectionList.of(ImmutableList.of(gcsFiles, s3Files))
          .apply("File List", Flatten.pCollections())
          .apply(
              "Fixed Window",
              Window.<KV<String, ReadableFile>>into(FixedWindows.of(WINDOW_INTERVAL))
                  .triggering(AfterWatermark.pastEndOfWindow())
                  .discardingFiredPanes()
                  .withAllowedLateness(Duration.ZERO));

  PCollectionTuple contents =
      files.apply(
          "Read File Contents",
          ParDo.of(new TextFileReader())
              .withOutputTags(
                  textReaderSuccessElements, TupleTagList.of(textReaderFailedElements)));

  PCollectionTuple inspectedContents =
      contents
          .get(textReaderSuccessElements)
          .apply(
              "DLP Inspection",
              ParDo.of(new TokenizeData(options.getProject(), options.getInspectTemplateName()))
                  .withOutputTags(
                      apiResponseSuccessElements, TupleTagList.of(apiResponseFailedElements)));

  inspectedContents
      .get(apiResponseSuccessElements)
      .apply(
          "BQ Write",
          BigQueryIO.<KV<String, TableRow>>write()
              .to(new BQDestination(options.getDataSetId(), options.getProject()))
              .withFormatFunction(element -> element.getValue())
              .withWriteDisposition(BigQueryIO.Write.WriteDisposition.WRITE_APPEND)
              .withoutValidation()
              .withFailedInsertRetryPolicy(InsertRetryPolicy.retryTransientErrors())
              .withCreateDisposition(BigQueryIO.Write.CreateDisposition.CREATE_IF_NEEDED));

  PCollectionList.of(
          ImmutableList.of(
              contents.get(textReaderFailedElements),
              inspectedContents.get(apiResponseFailedElements)))
      .apply("Combine Error Logs", Flatten.pCollections())
      .apply(
          "Write Error Logs",
          ParDo.of(
              new DoFn<String, String>() {
                @ProcessElement
                public void processElement(ProcessContext c) {
                  LOG.error("***ERROR*** {}", c.element());
                  c.output(c.element());
                }
              }));

  p.run();
}
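
A design note on the two setCoder calls above: WithKeys.of is given a lambda, so Beam cannot infer the key type, and pipeline construction would fail without an explicit KV coder. An alternative sketch, assuming a hypothetical matchedFiles PCollection<ReadableFile> like the match outputs above, uses a type hint so the coder registry can resolve StringUtf8Coder and ReadableFileCoder on its own:

// withKeyType lets Beam look up the key coder in the registry, so no
// explicit setCoder is needed; matchedFiles is illustrative.
PCollection<KV<String, ReadableFile>> keyedFiles =
    matchedFiles.apply(
        "Add File Name as Key",
        WithKeys.<String, ReadableFile>of(
                file -> file.getMetadata().resourceId().getFilename().toString())
            .withKeyType(TypeDescriptors.strings()));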
 
Example #4
Source File: DLPTextToBigQueryStreamingTest.java    From DataflowTemplates with Apache License 2.0
/**
 * Tests reading a sample CSV file in chunks, creating a DLP Table from the
 * contents, and processing the contents by converting them to TableRows.
 */
@Test
public void testFileIOToBigQueryStreamingE2E() throws IOException {
  ValueProvider<Integer> batchSize = p.newProvider(10);

  PCollectionView<List<KV<String, List<String>>>> headerMap =
      p.apply(Create.of(KV.of("tokenization_data", Arrays.asList(HEADER_ROW.split(",")))))
          .apply(View.asList());

  PCollection<KV<String, Table>> dlpTable =
      p.apply("Match", FileIO.match().filepattern(tokenizedFilePath))
          .apply("Read File", FileIO.readMatches().withCompression(Compression.AUTO))
          .apply("Add Keys", WithKeys.of(key -> "tokenization_data"))
          .setCoder(KvCoder.of(StringUtf8Coder.of(), ReadableFileCoder.of()))
          .apply(
              "Create DLP Table",
              ParDo.of(new CSVReader(batchSize, headerMap)).withSideInputs(headerMap));

  PAssert.that(dlpTable)
      .satisfies(
          collection -> {
            KV<String, Table> tableData = collection.iterator().next();
            assertThat(tableData.getKey(), is(equalTo("tokenization_data")));
            assertThat(tableData.getValue().getHeadersCount(), is(equalTo(11)));
            assertThat(tableData.getValue().getRowsCount(), is(equalTo(1)));
            return null;
          });

  PCollection<KV<String, TableRow>> tableRowMap =
      dlpTable.apply(ParDo.of(new TableRowProcessorDoFn()).withSideInputs(headerMap));

  PAssert.that(tableRowMap)
      .satisfies(
          collection -> {
            KV<String, TableRow> result = collection.iterator().next();

            assertThat(result.getValue().get("CardTypeCode"), is(equalTo("MC")));
            assertThat(result.getValue().get("CardTypeFullName"), is(equalTo("Master Card")));
            assertThat(result.getValue().get("IssuingBank"), is(equalTo("Wells Fargo")));
            assertThat(result.getValue().get("CardNumber"), is(equalTo("E5ssxfuqnGfF36Kk")));
            assertThat(result.getValue().get("CardHoldersName"), is(equalTo("Jeremy O Wilson")));
            assertThat(result.getValue().get("CVVCVV2"), is(equalTo("NK3")));
            assertThat(result.getValue().get("IssueDate"), is(equalTo("12/2007")));
            assertThat(result.getValue().get("ExpiryDate"), is(equalTo("12/2008")));
            assertThat(result.getValue().get("BillingDate"), is(equalTo("3")));
            assertThat(result.getValue().get("CardPIN"), is(equalTo("vmFF")));
            assertThat(result.getValue().get("CreditLimit"), is(equalTo("19800")));
            return null;
          });
  p.run();
}
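
The test relies on a TestPipeline rule named p, which also supplies newProvider for wrapping the batch size as a ValueProvider. A minimal sketch of that wiring (standard Beam test boilerplate, not shown in the excerpt):

// JUnit rule that constructs and runs the pipeline under test; declared
// transient because TestPipeline is not serializable.
@Rule public final transient TestPipeline p = TestPipeline.create();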
 
Example #5
Source File: CoderRegistry.java    From beam with Apache License 2.0
private CommonTypes() {
  ImmutableMap.Builder<Class<?>, CoderProvider> builder = ImmutableMap.builder();
  builder.put(
      Boolean.class, CoderProviders.fromStaticMethods(Boolean.class, BooleanCoder.class));
  builder.put(Byte.class, CoderProviders.fromStaticMethods(Byte.class, ByteCoder.class));
  builder.put(BitSet.class, CoderProviders.fromStaticMethods(BitSet.class, BitSetCoder.class));
  builder.put(Float.class, CoderProviders.fromStaticMethods(Float.class, FloatCoder.class));
  builder.put(Double.class, CoderProviders.fromStaticMethods(Double.class, DoubleCoder.class));
  builder.put(
      Instant.class, CoderProviders.fromStaticMethods(Instant.class, InstantCoder.class));
  builder.put(
      Integer.class, CoderProviders.fromStaticMethods(Integer.class, VarIntCoder.class));
  builder.put(
      Iterable.class, CoderProviders.fromStaticMethods(Iterable.class, IterableCoder.class));
  builder.put(KV.class, CoderProviders.fromStaticMethods(KV.class, KvCoder.class));
  builder.put(List.class, CoderProviders.fromStaticMethods(List.class, ListCoder.class));
  builder.put(Long.class, CoderProviders.fromStaticMethods(Long.class, VarLongCoder.class));
  builder.put(Map.class, CoderProviders.fromStaticMethods(Map.class, MapCoder.class));
  builder.put(
      Metadata.class, CoderProviders.fromStaticMethods(Metadata.class, MetadataCoder.class));
  builder.put(
      ResourceId.class,
      CoderProviders.fromStaticMethods(ResourceId.class, ResourceIdCoder.class));
  builder.put(
      FileIO.ReadableFile.class,
      CoderProviders.fromStaticMethods(FileIO.ReadableFile.class, ReadableFileCoder.class));
  builder.put(Set.class, CoderProviders.fromStaticMethods(Set.class, SetCoder.class));
  builder.put(
      String.class, CoderProviders.fromStaticMethods(String.class, StringUtf8Coder.class));
  builder.put(
      TimestampedValue.class,
      CoderProviders.fromStaticMethods(
          TimestampedValue.class, TimestampedValue.TimestampedValueCoder.class));
  builder.put(Void.class, CoderProviders.fromStaticMethods(Void.class, VoidCoder.class));
  builder.put(
      byte[].class, CoderProviders.fromStaticMethods(byte[].class, ByteArrayCoder.class));
  builder.put(
      IntervalWindow.class,
      CoderProviders.forCoder(
          TypeDescriptor.of(IntervalWindow.class), IntervalWindow.getCoder()));
  commonTypesToCoderProviders = builder.build();
}
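
Because the registry maps FileIO.ReadableFile to ReadableFileCoder here, pipelines usually get this coder without any explicit setCoder call; the explicit calls in Examples #3 and #4 are needed only because the lambda-based WithKeys defeats inference for the KV as a whole. A minimal sketch of a direct registry lookup, assuming a standard Pipeline object named pipeline:

// Resolves to ReadableFileCoder via the CommonTypes map above; getCoder
// throws CannotProvideCoderException if no coder is registered.
Coder<FileIO.ReadableFile> coder =
    pipeline.getCoderRegistry().getCoder(FileIO.ReadableFile.class);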