org.apache.beam.sdk.io.FileIO.ReadableFile Java Examples

The following examples show how to use org.apache.beam.sdk.io.FileIO.ReadableFile. The examples are drawn from open-source projects; the source file, project, and license are noted above each example.
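A ReadableFile element is normally produced by FileIO.match() followed by FileIO.readMatches(). As a quick orientation before the examples, here is a minimal sketch of that pattern; the file pattern and the trivial processing DoFn are placeholders rather than code from any of the projects below.

import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.io.Compression;
import org.apache.beam.sdk.io.FileIO;
import org.apache.beam.sdk.io.FileIO.ReadableFile;
import org.apache.beam.sdk.transforms.DoFn;
import org.apache.beam.sdk.transforms.ParDo;

public class ReadableFileSketch {
  public static void main(String[] args) {
    Pipeline p = Pipeline.create();
    p.apply("Match files", FileIO.match().filepattern("/tmp/input/*.csv")) // placeholder pattern
        .apply("Read matches", FileIO.readMatches().withCompression(Compression.AUTO))
        .apply(
            "Read file contents",
            ParDo.of(
                new DoFn<ReadableFile, String>() {
                  @ProcessElement
                  public void processElement(ProcessContext c) throws Exception {
                    ReadableFile file = c.element();
                    // Read the whole file as UTF-8; fine for small files, use open() for large ones.
                    c.output(file.readFullyAsUTF8String());
                  }
                }));
    p.run().waitUntilFinish();
  }
}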
Example #1
Source File: S3Import.java    From dlp-dataflow-deidentification with Apache License 2.0
@GetInitialRestriction
public OffsetRange getInitialRestriction(KV<String, ReadableFile> file) throws IOException {
  long totalBytes = file.getValue().getMetadata().sizeBytes();
  long totalSplit = 0;
  if (totalBytes < BATCH_SIZE) {
    totalSplit = 2;
  } else {
    totalSplit = totalSplit + (totalBytes / BATCH_SIZE);
    long remaining = totalBytes % BATCH_SIZE;
    if (remaining > 0) {
      totalSplit = totalSplit + 2;
    }
  }

  LOG.debug(
      "Total Bytes {} for File {} -Initial Restriction range from 1 to: {}",
      totalBytes,
      file.getKey(),
      totalSplit);
  return new OffsetRange(1, totalSplit);
}
 
Example #2
Source File: TextSourceTest.java    From beam with Apache License 2.0
@ProcessElement
public void processElement(ProcessContext c) {
  ReadableFile file = c.element();
  ValueProvider<String> filenameProvider =
      ValueProvider.StaticValueProvider.of(file.getMetadata().resourceId().getFilename());
  // Create a TextSource, passing null as the delimiter to use the default
  // delimiters ('\n', '\r', or '\r\n').
  TextSource textSource = new TextSource(filenameProvider, null, null);
  try {
    BoundedSource.BoundedReader<String> reader =
        textSource
            .createForSubrangeOfFile(file.getMetadata(), 0, file.getMetadata().sizeBytes())
            .createReader(c.getPipelineOptions());
    for (boolean more = reader.start(); more; more = reader.advance()) {
      c.output(reader.getCurrent());
    }
  } catch (IOException e) {
    throw new RuntimeException(
        "Unable to readFile: " + file.getMetadata().resourceId().toString());
  }
}
 
Example #3
Source File: DLPTextToBigQueryStreaming.java    From dlp-dataflow-deidentification with Apache License 2.0
private static String getFileName(ReadableFile file) {
  String csvFileName = file.getMetadata().resourceId().getFilename().toString();
  /** Strip the .csv extension from the file name, e.g. fileName.csv -> fileName. */
  String[] fileKey = csvFileName.split("\\.", 2);

  if (!fileKey[1].equals(ALLOWED_FILE_EXTENSION) || !fileKey[0].matches(TABLE_REGEXP)) {
    throw new RuntimeException(
        "[Filename must contain a CSV extension "
            + " BQ table name must contain only letters, numbers, or underscores ["
            + fileKey[1]
            + "], ["
            + fileKey[0]
            + "]");
  }
  /** returning file name without extension */
  return fileKey[0];
}
 
Example #4
Source File: TFRecordIOTest.java    From beam with Apache License 2.0
@Test
public void testReadFilesNamed() {
  readPipeline.enableAbandonedNodeEnforcement(false);

  Metadata metadata =
      Metadata.builder()
          .setResourceId(FileSystems.matchNewResource("file", false /* isDirectory */))
          .setIsReadSeekEfficient(true)
          .setSizeBytes(1024)
          .build();
  Create.Values<ReadableFile> create = Create.of(new ReadableFile(metadata, Compression.AUTO));

  assertEquals(
      "TFRecordIO.ReadFiles/Read all via FileBasedSource/Read ranges/ParMultiDo(ReadFileRanges).output",
      readPipeline.apply(create).apply(TFRecordIO.readFiles()).getName());
  assertEquals(
      "MyRead/Read all via FileBasedSource/Read ranges/ParMultiDo(ReadFileRanges).output",
      readPipeline.apply(create).apply("MyRead", TFRecordIO.readFiles()).getName());
}
 
Example #5
Source File: CsvConverters.java    From DataflowTemplates with Apache License 2.0
@ProcessElement
public void processElement(ProcessContext context, MultiOutputReceiver outputReceiver) {
  ReadableFile f = context.element();
  String headers;
  List<String> records = null;
  String delimiter = String.valueOf(this.csvFormat.getDelimiter());
  try {
    String csvFileString = f.readFullyAsUTF8String();
    StringReader reader = new StringReader(csvFileString);
    CSVParser parser = CSVParser.parse(reader, this.csvFormat.withFirstRecordAsHeader());
    records =
        parser.getRecords().stream()
            .map(i -> String.join(delimiter, i))
            .collect(Collectors.toList());
    headers = String.join(delimiter, parser.getHeaderNames());
  } catch (IOException ioe) {
    LOG.error("Headers do not match, consistency cannot be guaranteed");
    throw new RuntimeException("Could not read Csv headers: " + ioe.getMessage());
  }
  outputReceiver.get(this.headerTag).output(headers);
  records.forEach(r -> outputReceiver.get(this.linesTag).output(r));
}
 
Example #6
Source File: ReadAllViaFileBasedSource.java    From beam with Apache License 2.0
@ProcessElement
public void process(ProcessContext c) throws IOException {
  ReadableFile file = c.element().getKey();
  OffsetRange range = c.element().getValue();
  FileBasedSource<T> source =
      CompressedSource.from(createSource.apply(file.getMetadata().resourceId().toString()))
          .withCompression(file.getCompression());
  try (BoundedSource.BoundedReader<T> reader =
      source
          .createForSubrangeOfFile(file.getMetadata(), range.getFrom(), range.getTo())
          .createReader(c.getPipelineOptions())) {
    for (boolean more = reader.start(); more; more = reader.advance()) {
      c.output(reader.getCurrent());
    }
  }
}
 
Example #7
Source File: DLPTextToBigQueryStreaming.java    From DataflowTemplates with Apache License 2.0
private static String getFileName(ReadableFile file) {
  String csvFileName = file.getMetadata().resourceId().getFilename().toString();
  /** Strip the .csv extension from the file name, e.g. fileName.csv -> fileName. */
  String[] fileKey = csvFileName.split("\\.", 2);

  if (!fileKey[1].equals(ALLOWED_FILE_EXTENSION) || !fileKey[0].matches(TABLE_REGEXP)) {
    throw new RuntimeException(
        "[Filename must contain a CSV extension "
            + " BQ table name must contain only letters, numbers, or underscores ["
            + fileKey[1]
            + "], ["
            + fileKey[0]
            + "]");
  }
  /** returning file name without extension */
  return fileKey[0];
}
 
Example #8
Source File: TextSourceTest.java    From DataflowTemplates with Apache License 2.0
@ProcessElement
public void processElement(ProcessContext c) {
  ReadableFile file = c.element();

  // Create a TextSource, passing null as the delimiter to use the default
  // delimiters ('\n', '\r', or '\r\n').
  TextSource textSource =
      new TextSource(file.getMetadata(), 0, file.getMetadata().sizeBytes(), null);
  try {
    BoundedSource.BoundedReader<String> reader =
        textSource
            .createForSubrangeOfFile(file.getMetadata(), 0, file.getMetadata().sizeBytes())
            .createReader(c.getPipelineOptions());
    for (boolean more = reader.start(); more; more = reader.advance()) {
      c.output(reader.getCurrent());
    }
  } catch (IOException e) {
    throw new RuntimeException(
        "Unable to readFile: " + file.getMetadata().resourceId().toString());
  }
}
 
Example #9
Source File: Transforms.java    From nomulus with Apache License 2.0
@ProcessElement
public void processElement(@Element ReadableFile file, OutputReceiver<VersionedEntity> out) {
  try {
    reader.apply(file).forEachRemaining(out::output);
  } catch (Exception e) {
    // Let the pipeline apply its default retry strategy to the whole file. On GCP Dataflow this
    // means retrying up to 4 times (which may include other files grouped with this one), and
    // failing the pipeline if all retries fail.
    throw new RuntimeException(e);
  }
}
 
Example #10
Source File: Transforms.java    From nomulus with Apache License 2.0
/**
 * Returns a {@link PTransform} from file {@link Metadata} to {@link VersionedEntity} using
 * caller-provided {@code transformer}.
 */
static PTransform<PCollection<Metadata>, PCollection<VersionedEntity>> processFiles(
    DoFn<ReadableFile, VersionedEntity> transformer) {
  return new PTransform<PCollection<Metadata>, PCollection<VersionedEntity>>() {
    @Override
    public PCollection<VersionedEntity> expand(PCollection<Metadata> input) {
      return input
          .apply(FileIO.readMatches().withCompression(Compression.UNCOMPRESSED))
          .apply(transformer.getClass().getSimpleName(), ParDo.of(transformer));
      // TODO(weiminyu): reshuffle to enable dynamic work rebalance per beam dev guide
    }
  };
}
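For context, here is a hedged sketch of how processFiles might be wired into a pipeline; the pipeline variable, the file pattern, and the backupReaderFn DoFn are hypothetical placeholders, not part of the original source.

// Assumes a caller-provided DoFn<ReadableFile, VersionedEntity> named backupReaderFn
// and a file pattern pointing at the backup files; both are placeholders.
PCollection<VersionedEntity> entities =
    pipeline
        .apply("Match backup files", FileIO.match().filepattern("gs://backups/export-*"))
        .apply(Transforms.processFiles(backupReaderFn));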
 
Example #11
Source File: TikaIO.java    From beam with Apache License 2.0
@ProcessElement
public void processElement(ProcessContext c) throws Exception {
  ReadableFile file = c.element();
  InputStream stream = Channels.newInputStream(file.open());
  try (InputStream tikaStream = TikaInputStream.get(stream)) {
    Parser parser =
        tikaConfig == null ? new AutoDetectParser() : new AutoDetectParser(tikaConfig);

    ParseContext context = new ParseContext();
    context.set(Parser.class, parser);
    Metadata tikaMetadata =
        spec.getInputMetadata() != null ? spec.getInputMetadata() : new Metadata();
    if (spec.getContentTypeHint() != null) {
      tikaMetadata.set(Metadata.CONTENT_TYPE, spec.getContentTypeHint());
    }

    String location = file.getMetadata().resourceId().toString();
    ParseResult res;
    ContentHandler tikaHandler = new ToTextContentHandler();
    try {
      parser.parse(tikaStream, tikaHandler, tikaMetadata, context);
      res = ParseResult.success(location, tikaHandler.toString(), tikaMetadata);
    } catch (Exception e) {
      res = ParseResult.failure(location, tikaHandler.toString(), tikaMetadata, e);
    }

    c.output(res);
  }
}
 
Example #12
Source File: XmlIO.java    From beam with Apache License 2.0
@Override
public PCollection<T> expand(PCollection<ReadableFile> input) {
  return input.apply(
      new ReadAllViaFileBasedSource<>(
          64 * 1024L * 1024L,
          new CreateSourceFn<>(getConfiguration()),
          JAXBCoder.of(getConfiguration().getRecordClass())));
}
 
Example #13
Source File: ReadAllViaFileBasedSource.java    From beam with Apache License 2.0
@Override
public PCollection<T> expand(PCollection<ReadableFile> input) {
  return input
      .apply("Split into ranges", ParDo.of(new SplitIntoRangesFn(desiredBundleSizeBytes)))
      .apply("Reshuffle", Reshuffle.viaRandomKey())
      .apply("Read ranges", ParDo.of(new ReadFileRangesFn<>(createSource)))
      .setCoder(coder);
}
 
Example #14
Source File: FileShard.java    From DataflowTemplates with Apache License 2.0
@Override
public FileShard decode(InputStream is) throws IOException {
  String tableName = StringUtf8Coder.of().decode(is);
  ReadableFile file = ReadableFileCoder.of().decode(is);
  long from = VarLongCoder.of().decode(is);
  long to = VarLongCoder.of().decode(is);
  return new AutoValue_FileShard(tableName, file, new OffsetRange(from, to));
}
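The decoder reads the fields in a fixed order, so a matching encode would write them in the same order. A minimal sketch of that counterpart, assuming FileShard exposes getTableName() and getRange() alongside the getFile() shown in Example #26 (this mirrors the decode logic above and is not the original source):

@Override
public void encode(FileShard shard, OutputStream os) throws IOException {
  // Write the fields in exactly the order the decoder reads them.
  StringUtf8Coder.of().encode(shard.getTableName(), os);
  ReadableFileCoder.of().encode(shard.getFile(), os);
  VarLongCoder.of().encode(shard.getRange().getFrom(), os);
  VarLongCoder.of().encode(shard.getRange().getTo(), os);
}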
 
Example #15
Source File: DLPTextToBigQueryStreaming.java    From DataflowTemplates with Apache License 2.0
/**
 * SDF needs to define a @SplitRestriction method that can split the initial restriction into a
 * number of smaller restrictions. For example: an initial restriction of (x, N) as input
 * produces pairs (x, 0), (x, 1), …, (x, N-1) as output.
 */
@SplitRestriction
public void splitRestriction(
    @Element KV<String, ReadableFile> csvFile,
    @Restriction OffsetRange range,
    OutputReceiver<OffsetRange> out) {
  /** split the initial restriction by 1 */
  for (final OffsetRange p : range.split(1, 1)) {
    out.output(p);
  }
}
 
Example #16
Source File: DLPTextToBigQueryStreaming.java    From DataflowTemplates with Apache License 2.0
/**
 * SDF needs to define a @GetInitialRestriction method that can create a restriction describing
 * the complete work for a given element. In our case this is the total number of rows in
 * each CSV file. We calculate the number of splits required based on the total number of
 * rows and the batch size provided.
 *
 * @throws IOException
 */
@GetInitialRestriction
public OffsetRange getInitialRestriction(@Element KV<String, ReadableFile> csvFile) throws IOException {

  int rowCount = 0;
  int totalSplit = 0;
  try (BufferedReader br = getReader(csvFile.getValue())) {
    /** assume first row is header */
    int checkRowCount = (int) br.lines().count() - 1;
    rowCount = (checkRowCount < 1) ? 1 : checkRowCount;
    totalSplit = rowCount / batchSize.get().intValue();
    int remaining = rowCount % batchSize.get().intValue();
    /**
     * Adjust the total number of splits based on the remaining rows. For example: a batch size
     * of 15 for 100 rows gives 7 splits in total. Because the restriction is a range, the last
     * split will have offset range {7, 8}.
     */
    if (remaining > 0) {
      totalSplit = totalSplit + 2;

    } else {
      totalSplit = totalSplit + 1;
    }
  }

  LOG.debug("Initial Restriction range from 1 to: {}", totalSplit);
  return new OffsetRange(1, totalSplit);
}
 
Example #17
Source File: DLPTextToBigQueryStreaming.java    From dlp-dataflow-deidentification with Apache License 2.0
/**
 * SDF needs to define a @SplitRestriction method that can split the initial restriction into a
 * number of smaller restrictions. For example: an initial restriction of (x, N) as input
 * produces pairs (x, 0), (x, 1), …, (x, N-1) as output.
 */
@SplitRestriction
public void splitRestriction(
    KV<String, ReadableFile> csvFile, OffsetRange range, OutputReceiver<OffsetRange> out) {
  /** split the initial restriction by 1 */
  for (final OffsetRange p : range.split(1, 1)) {
    out.output(p);
  }
}
 
Example #18
Source File: DLPTextToBigQueryStreaming.java    From dlp-dataflow-deidentification with Apache License 2.0
/**
 * SDF needs to define a @GetInitialRestriction method that can create a restriction describing
 * the complete work for a given element. In our case this is the total number of rows in
 * each CSV file. We calculate the number of splits required based on the total number of
 * rows and the batch size provided.
 *
 * @throws IOException
 */
@GetInitialRestriction
public OffsetRange getInitialRestriction(KV<String, ReadableFile> csvFile) throws IOException {

  int rowCount = 0;
  int totalSplit = 0;
  try (BufferedReader br = getReader(csvFile.getValue())) {
    /** assume first row is header */
    int checkRowCount = (int) br.lines().count() - 1;
    rowCount = (checkRowCount < 1) ? 1 : checkRowCount;
    totalSplit = rowCount / batchSize.get().intValue();
    int remaining = rowCount % batchSize.get().intValue();
    /**
     * Adjust the total number of splits based on the remaining rows. For example: a batch size
     * of 15 for 100 rows gives 7 splits in total. Because the restriction is a range, the last
     * split will have offset range {7, 8}.
     */
    if (remaining > 0) {
      totalSplit = totalSplit + 2;

    } else {
      totalSplit = totalSplit + 1;
    }
  }

  LOG.debug("Initial Restriction range from 1 to: {}", totalSplit);
  return new OffsetRange(1, totalSplit);
}
 
Example #19
Source File: S3Import.java    From dlp-dataflow-deidentification with Apache License 2.0
@SplitRestriction
public void splitRestriction(
    KV<String, ReadableFile> file, OffsetRange range, OutputReceiver<OffsetRange> out) {

  for (final OffsetRange p : range.split(1, 1)) {
    out.output(p);
  }
}
 
Example #20
Source File: FileShard.java    From DataflowTemplates with Apache License 2.0
static FileShard create(String tableName, ReadableFile file, OffsetRange range) {
  Preconditions.checkNotNull(tableName);
  Preconditions.checkNotNull(file);
  Preconditions.checkNotNull(range);
  return new AutoValue_FileShard(tableName, file, range);
}
 
Example #21
Source File: FailureMessage.java    From gcp-ingestion with Mozilla Public License 2.0
/**
 * Return a PubsubMessage corresponding to an error reading from a file.
 */
public static PubsubMessage of(Object caller, ReadableFile readableFile, Throwable e) {
  Map<String, String> attributes = errorAttributes(caller, e);
  attributes.put("readable_file", readableFile.toString());
  return new PubsubMessage("{}".getBytes(StandardCharsets.UTF_8), attributes);
}
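A hedged sketch of how such a failure message might be emitted from a file-reading DoFn; the errorTag output tag and the surrounding DoFn are hypothetical, not taken from the project.

@ProcessElement
public void processElement(ProcessContext c) {
  ReadableFile file = c.element();
  try {
    c.output(file.readFullyAsUTF8String());
  } catch (IOException e) {
    // Route the failure to a (hypothetical) error output tag instead of failing the bundle.
    c.output(errorTag, FailureMessage.of(this, file, e));
  }
}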
 
Example #22
Source File: TikaIO.java    From beam with Apache License 2.0
@Override
public PCollection<ParseResult> expand(PCollection<ReadableFile> input) {
  return input.apply(ParDo.of(new ParseToStringFn(this)));
}
 
Example #23
Source File: S3Import.java    From dlp-dataflow-deidentification with Apache License 2.0
private static SeekableByteChannel getReader(ReadableFile eventFile) throws IOException {
  // openSeekable() returns a seekable channel; it requires the file's metadata to report
  // isReadSeekEfficient.
  return eventFile.openSeekable();
}
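For callers that want line-oriented access rather than a raw channel, a common companion pattern (a sketch assuming UTF-8 content, not code from the original project) wraps the file in a BufferedReader:

private static BufferedReader getBufferedReader(ReadableFile file) throws IOException {
  // open() decompresses according to the file's compression and works for any readable file;
  // openSeekable() additionally requires the metadata to report isReadSeekEfficient.
  return new BufferedReader(Channels.newReader(file.open(), StandardCharsets.UTF_8.name()));
}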
 
Example #24
Source File: Transforms.java    From nomulus with Apache License 2.0
private BackupFileReader(ProcessFunction<ReadableFile, Iterator<VersionedEntity>> reader) {
  this.reader = reader;
}
 
Example #25
Source File: S3Import.java    From dlp-dataflow-deidentification with Apache License 2.0
public static void main(String[] args) {
  S3ImportOptions options =
      PipelineOptionsFactory.fromArgs(args).withValidation().as(S3ImportOptions.class);

  AWSOptionParser.formatOptions(options);

  Pipeline p = Pipeline.create(options);
  // s3
  PCollection<KV<String, ReadableFile>> s3Files =
      p.apply(
              "Poll S3 Files",
              FileIO.match()
                  .filepattern(options.getS3BucketUrl())
                  .continuously(DEFAULT_POLL_INTERVAL, Watch.Growth.never()))
          .apply("S3 File Match", FileIO.readMatches().withCompression(Compression.AUTO))
          .apply(
              "Add S3 File Name as Key",
              WithKeys.of(file -> file.getMetadata().resourceId().getFilename().toString()))
          .setCoder(KvCoder.of(StringUtf8Coder.of(), ReadableFileCoder.of()));

  // gcs files
  PCollection<KV<String, ReadableFile>> gcsFiles =
      p.apply(
              "Poll GCS Files",
              FileIO.match()
                  .filepattern(options.getGcsBucketUrl())
                  .continuously(DEFAULT_POLL_INTERVAL, Watch.Growth.never()))
          .apply("GCS File Match", FileIO.readMatches().withCompression(Compression.AUTO))
          .apply(
              "Add GCS File Name as Key",
              WithKeys.of(file -> file.getMetadata().resourceId().getFilename().toString()))
          .setCoder(KvCoder.of(StringUtf8Coder.of(), ReadableFileCoder.of()));

  PCollection<KV<String, ReadableFile>> files =
      PCollectionList.of(ImmutableList.of(gcsFiles, s3Files))
          .apply("File List", Flatten.pCollections())
          .apply(
              "Fixed Window",
              Window.<KV<String, ReadableFile>>into(FixedWindows.of(WINDOW_INTERVAL))
                  .triggering(AfterWatermark.pastEndOfWindow())
                  .discardingFiredPanes()
                  .withAllowedLateness(Duration.ZERO));

  PCollectionTuple contents =
      files.apply(
          "Read File Contents",
          ParDo.of(new TextFileReader())
              .withOutputTags(
                  textReaderSuccessElements, TupleTagList.of(textReaderFailedElements)));

  PCollectionTuple inspectedContents =
      contents
          .get(textReaderSuccessElements)
          .apply(
              "DLP Inspection",
              ParDo.of(new TokenizeData(options.getProject(), options.getInspectTemplateName()))
                  .withOutputTags(
                      apiResponseSuccessElements, TupleTagList.of(apiResponseFailedElements)));

  inspectedContents
      .get(apiResponseSuccessElements)
      .apply(
          "BQ Write",
          BigQueryIO.<KV<String, TableRow>>write()
              .to(new BQDestination(options.getDataSetId(), options.getProject()))
              .withFormatFunction(
                  element -> {
                    return element.getValue();
                  })
              .withWriteDisposition(BigQueryIO.Write.WriteDisposition.WRITE_APPEND)
              .withoutValidation()
              .withFailedInsertRetryPolicy(InsertRetryPolicy.retryTransientErrors())
              .withCreateDisposition(BigQueryIO.Write.CreateDisposition.CREATE_IF_NEEDED));

  PCollectionList.of(
          ImmutableList.of(
              contents.get(textReaderFailedElements),
              inspectedContents.get(apiResponseFailedElements)))
      .apply("Combine Error Logs", Flatten.pCollections())
      .apply(
          "Write Error Logs",
          ParDo.of(
              new DoFn<String, String>() {
                @ProcessElement
                public void processElement(ProcessContext c) {
                  LOG.error("***ERROR*** {}", c.element().toString());
                  c.output(c.element());
                }
              }));

  p.run();
}
 
Example #26
Source File: FileShard.java    From DataflowTemplates with Apache License 2.0
abstract ReadableFile getFile();