Java Code Examples for org.apache.beam.sdk.io.ReadableFileCoder

The following examples show how to use org.apache.beam.sdk.io.ReadableFileCoder. These examples are extracted from open source projects; the source project, file, and license are noted above each example.
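ReadableFileCoder provides a Coder for FileIO.ReadableFile, which is needed whenever a PCollection containing ReadableFile elements (or a KV wrapping one) cannot have its coder inferred. Before the examples, here is a minimal sketch of the typical pattern; it is not taken from any of the projects below, and the file pattern and pipeline variable are placeholders:

// Match files and set the coder explicitly on the resulting PCollection.
// ReadableFileCoder.of() returns a Coder<FileIO.ReadableFile>.
PCollection<FileIO.ReadableFile> files =
    pipeline
        .apply("Match Files", FileIO.match().filepattern("gs://my-bucket/*.csv"))
        .apply("Read Matches", FileIO.readMatches())
        .setCoder(ReadableFileCoder.of());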
Example 1
Source Project: DataflowTemplates   Source File: FileShard.java    License: Apache License 2.0
@Override
public void encode(FileShard value, OutputStream os) throws IOException {
  StringUtf8Coder.of().encode(value.getTableName(), os);
  ReadableFileCoder.of().encode(value.getFile(), os);
  VarLongCoder.of().encode(value.getRange().getFrom(), os);
  VarLongCoder.of().encode(value.getRange().getTo(), os);
}
 
Example 2
Source Project: DataflowTemplates   Source File: FileShard.java    License: Apache License 2.0
@Override
public FileShard decode(InputStream is) throws IOException {
  String tableName = StringUtf8Coder.of().decode(is);
  ReadableFile file = ReadableFileCoder.of().decode(is);
  long from = VarLongCoder.of().decode(is);
  long to = VarLongCoder.of().decode(is);
  return new AutoValue_FileShard(tableName, file, new OffsetRange(from, to));
}
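
Examples 1 and 2 are the two halves of a custom coder for a composite FileShard type: encode writes each field with an existing coder, and decode reads the fields back in the same fixed order, since the stream carries no field markers. Coders like this are commonly verified with a round-trip check; a minimal sketch, assuming a coder instance wrapping the two methods above and a sample FileShard value:

// CoderProperties (org.apache.beam.sdk.testing.CoderProperties) asserts
// that decode(encode(value)) equals value.
// "coder" and "shard" are assumptions: a coder built from the
// encode/decode methods above, and a sample FileShard, respectively.
CoderProperties.coderDecodeEncodeEqual(coder, shard);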
 
Example 3
Source Project: dlp-dataflow-deidentification   Source File: S3Import.java    License: Apache License 2.0
public static void main(String[] args) {
  S3ImportOptions options =
      PipelineOptionsFactory.fromArgs(args).withValidation().as(S3ImportOptions.class);

  AWSOptionParser.formatOptions(options);

  Pipeline p = Pipeline.create(options);
  // s3
  PCollection<KV<String, ReadableFile>> s3Files =
      p.apply(
              "Poll S3 Files",
              FileIO.match()
                  .filepattern(options.getS3BucketUrl())
                  .continuously(DEFAULT_POLL_INTERVAL, Watch.Growth.never()))
          .apply("S3 File Match", FileIO.readMatches().withCompression(Compression.AUTO))
          .apply(
              "Add S3 File Name as Key",
              WithKeys.of(file -> file.getMetadata().resourceId().getFilename().toString()))
          .setCoder(KvCoder.of(StringUtf8Coder.of(), ReadableFileCoder.of()));

  // gcs files
  PCollection<KV<String, ReadableFile>> gcsFiles =
      p.apply(
              "Poll GCS Files",
              FileIO.match()
                  .filepattern(options.getGcsBucketUrl())
                  .continuously(DEFAULT_POLL_INTERVAL, Watch.Growth.never()))
          .apply("GCS File Match", FileIO.readMatches().withCompression(Compression.AUTO))
          .apply(
              "Add GCS File Name as Key",
              WithKeys.of(file -> file.getMetadata().resourceId().getFilename().toString()))
          .setCoder(KvCoder.of(StringUtf8Coder.of(), ReadableFileCoder.of()));

  PCollection<KV<String, ReadableFile>> files =
      PCollectionList.of(ImmutableList.of(gcsFiles, s3Files))
          .apply("File List", Flatten.pCollections())
          .apply(
              "Fixed Window",
              Window.<KV<String, ReadableFile>>into(FixedWindows.of(WINDOW_INTERVAL))
                  .triggering(AfterWatermark.pastEndOfWindow())
                  .discardingFiredPanes()
                  .withAllowedLateness(Duration.ZERO));

  PCollectionTuple contents =
      files.apply(
          "Read File Contents",
          ParDo.of(new TextFileReader())
              .withOutputTags(
                  textReaderSuccessElements, TupleTagList.of(textReaderFailedElements)));

  PCollectionTuple inspectedContents =
      contents
          .get(textReaderSuccessElements)
          .apply(
              "DLP Inspection",
              ParDo.of(new TokenizeData(options.getProject(), options.getInspectTemplateName()))
                  .withOutputTags(
                      apiResponseSuccessElements, TupleTagList.of(apiResponseFailedElements)));

  inspectedContents
      .get(apiResponseSuccessElements)
      .apply(
          "BQ Write",
          BigQueryIO.<KV<String, TableRow>>write()
              .to(new BQDestination(options.getDataSetId(), options.getProject()))
              .withFormatFunction(element -> element.getValue())
              .withWriteDisposition(BigQueryIO.Write.WriteDisposition.WRITE_APPEND)
              .withoutValidation()
              .withFailedInsertRetryPolicy(InsertRetryPolicy.retryTransientErrors())
              .withCreateDisposition(BigQueryIO.Write.CreateDisposition.CREATE_IF_NEEDED));

  PCollectionList.of(
          ImmutableList.of(
              contents.get(textReaderFailedElements),
              inspectedContents.get(apiResponseFailedElements)))
      .apply("Combine Error Logs", Flatten.pCollections())
      .apply(
          "Write Error Logs",
          ParDo.of(
              new DoFn<String, String>() {
                @ProcessElement
                public void processElement(ProcessContext c) {
                  LOG.error("***ERROR*** {}", c.element().toString());
                  c.output(c.element());
                }
              }));

  p.run();
}
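
Note the .setCoder(KvCoder.of(StringUtf8Coder.of(), ReadableFileCoder.of())) calls after each WithKeys step: the key function is a lambda, so Beam cannot infer the key type, and the KV coder must be supplied explicitly. An alternative sketch (not from the project source) declares the key type instead and lets the registry infer the ReadableFile coder, which Example 5 below shows is registered by default:

// Equivalent keying step with an explicit key TypeDescriptor instead of
// setCoder; the value coder is inferred from the CoderRegistry.
.apply(
    "Add S3 File Name as Key",
    WithKeys.<String, FileIO.ReadableFile>of(
            file -> file.getMetadata().resourceId().getFilename())
        .withKeyType(TypeDescriptors.strings()))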
 
Example 4
/**
 * Tests reading a sample CSV file in chunks, creating a DLP Table from the contents, and
 * processing the contents by converting them to TableRows.
 */
@Test
public void testFileIOToBigQueryStreamingE2E() throws IOException {
  ValueProvider<Integer> batchSize = p.newProvider(10);

  PCollectionView<List<KV<String, List<String>>>> headerMap =
      p.apply(Create.of(KV.of("tokenization_data", Arrays.asList(HEADER_ROW.split(",")))))
          .apply(View.asList());

  PCollection<KV<String, Table>> dlpTable =
      p.apply("Match", FileIO.match().filepattern(tokenizedFilePath))
          .apply("Read File", FileIO.readMatches().withCompression(Compression.AUTO))
          .apply("Add Keys", WithKeys.of(key -> "tokenization_data"))
          .setCoder(KvCoder.of(StringUtf8Coder.of(), ReadableFileCoder.of()))
          .apply(
              "Create DLP Table",
              ParDo.of(new CSVReader(batchSize, headerMap)).withSideInputs(headerMap));

  PAssert.that(dlpTable)
      .satisfies(
          collection -> {
            KV<String, Table> tableData = collection.iterator().next();
            assertThat(tableData.getKey(), is(equalTo("tokenization_data")));
            assertThat(tableData.getValue().getHeadersCount(), is(equalTo(11)));
            assertThat(tableData.getValue().getRowsCount(), is(equalTo(1)));
            return null;
          });

  PCollection<KV<String, TableRow>> tableRowMap =
      dlpTable.apply(ParDo.of(new TableRowProcessorDoFn()).withSideInputs(headerMap));

  PAssert.that(tableRowMap)
      .satisfies(
          collection -> {
            KV<String, TableRow> result = collection.iterator().next();

            assertThat(result.getValue().get("CardTypeCode"), is(equalTo("MC")));
            assertThat(result.getValue().get("CardTypeFullName"), is(equalTo("Master Card")));
            assertThat(result.getValue().get("IssuingBank"), is(equalTo("Wells Fargo")));
            assertThat(result.getValue().get("CardNumber"), is(equalTo("E5ssxfuqnGfF36Kk")));
            assertThat(result.getValue().get("CardHoldersName"), is(equalTo("Jeremy O Wilson")));
            assertThat(result.getValue().get("CVVCVV2"), is(equalTo("NK3")));
            assertThat(result.getValue().get("IssueDate"), is(equalTo("12/2007")));
            assertThat(result.getValue().get("ExpiryDate"), is(equalTo("12/2008")));
            assertThat(result.getValue().get("BillingDate"), is(equalTo("3")));
            assertThat(result.getValue().get("CardPIN"), is(equalTo("vmFF")));
            assertThat(result.getValue().get("CreditLimit"), is(equalTo("19800")));
            return null;
          });
  p.run();
}
 
Example 5
Source Project: beam   Source File: CoderRegistry.java    License: Apache License 2.0
private CommonTypes() {
  ImmutableMap.Builder<Class<?>, CoderProvider> builder = ImmutableMap.builder();
  builder.put(
      Boolean.class, CoderProviders.fromStaticMethods(Boolean.class, BooleanCoder.class));
  builder.put(Byte.class, CoderProviders.fromStaticMethods(Byte.class, ByteCoder.class));
  builder.put(BitSet.class, CoderProviders.fromStaticMethods(BitSet.class, BitSetCoder.class));
  builder.put(Float.class, CoderProviders.fromStaticMethods(Float.class, FloatCoder.class));
  builder.put(Double.class, CoderProviders.fromStaticMethods(Double.class, DoubleCoder.class));
  builder.put(
      Instant.class, CoderProviders.fromStaticMethods(Instant.class, InstantCoder.class));
  builder.put(
      Integer.class, CoderProviders.fromStaticMethods(Integer.class, VarIntCoder.class));
  builder.put(
      Iterable.class, CoderProviders.fromStaticMethods(Iterable.class, IterableCoder.class));
  builder.put(KV.class, CoderProviders.fromStaticMethods(KV.class, KvCoder.class));
  builder.put(List.class, CoderProviders.fromStaticMethods(List.class, ListCoder.class));
  builder.put(Long.class, CoderProviders.fromStaticMethods(Long.class, VarLongCoder.class));
  builder.put(Map.class, CoderProviders.fromStaticMethods(Map.class, MapCoder.class));
  builder.put(
      Metadata.class, CoderProviders.fromStaticMethods(Metadata.class, MetadataCoder.class));
  builder.put(
      ResourceId.class,
      CoderProviders.fromStaticMethods(ResourceId.class, ResourceIdCoder.class));
  builder.put(
      FileIO.ReadableFile.class,
      CoderProviders.fromStaticMethods(FileIO.ReadableFile.class, ReadableFileCoder.class));
  builder.put(Set.class, CoderProviders.fromStaticMethods(Set.class, SetCoder.class));
  builder.put(
      String.class, CoderProviders.fromStaticMethods(String.class, StringUtf8Coder.class));
  builder.put(
      TimestampedValue.class,
      CoderProviders.fromStaticMethods(
          TimestampedValue.class, TimestampedValue.TimestampedValueCoder.class));
  builder.put(Void.class, CoderProviders.fromStaticMethods(Void.class, VoidCoder.class));
  builder.put(
      byte[].class, CoderProviders.fromStaticMethods(byte[].class, ByteArrayCoder.class));
  builder.put(
      IntervalWindow.class,
      CoderProviders.forCoder(
          TypeDescriptor.of(IntervalWindow.class), IntervalWindow.getCoder()));
  commonTypesToCoderProviders = builder.build();
}
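
Example 5 is where the Beam SDK itself registers ReadableFileCoder as the default coder for FileIO.ReadableFile, which is why explicit setCoder calls are only needed when inference fails (as with the lambdas in Examples 3 and 4). A minimal sketch of querying the registry directly; the pipeline variable is a placeholder:

// Ask the registry for the default ReadableFile coder; getCoder throws
// CannotProvideCoderException if no provider matches.
Coder<FileIO.ReadableFile> coder =
    pipeline.getCoderRegistry().getCoder(TypeDescriptor.of(FileIO.ReadableFile.class));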