Java Code Examples for org.apache.beam.sdk.io.FileIO.ReadableFile

The following examples show how to use org.apache.beam.sdk.io.FileIO.ReadableFile. These examples are extracted from open source projects; the project, source file, and license are listed above each example.
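Most of the examples below obtain a PCollection<ReadableFile> by chaining FileIO.match() and FileIO.readMatches(). As a quick orientation, here is a minimal sketch of that pattern, adapted from the FileIO documentation rather than from any of the projects listed below; the pipeline p and the file pattern gs://my-bucket/*.csv are hypothetical placeholders.

PCollection<KV<String, String>> filesAndContents =
    p.apply(FileIO.match().filepattern("gs://my-bucket/*.csv"))
        .apply(FileIO.readMatches().withCompression(Compression.AUTO))
        .apply(
            MapElements
                // key: the file's resource id, value: the whole file read as a UTF-8 string
                .into(TypeDescriptors.kvs(TypeDescriptors.strings(), TypeDescriptors.strings()))
                .via(
                    (FileIO.ReadableFile f) -> {
                      try {
                        return KV.of(
                            f.getMetadata().resourceId().toString(), f.readFullyAsUTF8String());
                      } catch (IOException e) {
                        throw new RuntimeException(
                            "Failed to read " + f.getMetadata().resourceId(), e);
                      }
                    }));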
Example 1
Source Project: dlp-dataflow-deidentification   Source File: S3Import.java    License: Apache License 2.0
@GetInitialRestriction
public OffsetRange getInitialRestriction(KV<String, ReadableFile> file) throws IOException {
  long totalBytes = file.getValue().getMetadata().sizeBytes();
  long totalSplit = 0;
  if (totalBytes < BATCH_SIZE) {
    totalSplit = 2;
  } else {
    totalSplit = totalSplit + (totalBytes / BATCH_SIZE);
    long remaining = totalBytes % BATCH_SIZE;
    if (remaining > 0) {
      totalSplit = totalSplit + 2;
    }
  }

  LOG.debug(
      "Total Bytes {} for File {} -Initial Restriction range from 1 to: {}",
      totalBytes,
      file.getKey(),
      totalSplit);
  return new OffsetRange(1, totalSplit);
}
 
Example 2
private static String getFileName(ReadableFile file) {
  String csvFileName = file.getMetadata().resourceId().getFilename().toString();
  /** removing the .csv extension from the file name, e.g. fileName.csv -> fileName */
  String[] fileKey = csvFileName.split("\\.", 2);

  if (!fileKey[1].equals(ALLOWED_FILE_EXTENSION) || !fileKey[0].matches(TABLE_REGEXP)) {
    throw new RuntimeException(
        "[Filename must contain a CSV extension "
            + " BQ table name must contain only letters, numbers, or underscores ["
            + fileKey[1]
            + "], ["
            + fileKey[0]
            + "]");
  }
  /** returning file name without extension */
  return fileKey[0];
}
 
Example 3
Source Project: DataflowTemplates   Source File: CsvConverters.java    License: Apache License 2.0
@ProcessElement
public void processElement(ProcessContext context, MultiOutputReceiver outputReceiver) {
  ReadableFile f = context.element();
  String headers;
  List<String> records = null;
  String delimiter = String.valueOf(this.csvFormat.getDelimiter());
  try {
    String csvFileString = f.readFullyAsUTF8String();
    StringReader reader = new StringReader(csvFileString);
    CSVParser parser = CSVParser.parse(reader, this.csvFormat.withFirstRecordAsHeader());
    records =
        parser.getRecords().stream()
            .map(i -> String.join(delimiter, i))
            .collect(Collectors.toList());
    headers = String.join(delimiter, parser.getHeaderNames());
  } catch (IOException ioe) {
    LOG.error("Headers do not match, consistency cannot be guaranteed");
    throw new RuntimeException("Could not read Csv headers: " + ioe.getMessage());
  }
  outputReceiver.get(this.headerTag).output(headers);
  records.forEach(r -> outputReceiver.get(this.linesTag).output(r));
}
 
Example 4
private static String getFileName(ReadableFile file) {
  String csvFileName = file.getMetadata().resourceId().getFilename().toString();
  /** removing the .csv extension from the file name, e.g. fileName.csv -> fileName */
  String[] fileKey = csvFileName.split("\\.", 2);

  if (!fileKey[1].equals(ALLOWED_FILE_EXTENSION) || !fileKey[0].matches(TABLE_REGEXP)) {
    throw new RuntimeException(
        "[Filename must contain a CSV extension "
            + " BQ table name must contain only letters, numbers, or underscores ["
            + fileKey[1]
            + "], ["
            + fileKey[0]
            + "]");
  }
  /** returning file name without extension */
  return fileKey[0];
}
 
Example 5
Source Project: DataflowTemplates   Source File: TextSourceTest.java    License: Apache License 2.0
@ProcessElement
public void processElement(ProcessContext c) {
  ReadableFile file = c.element();

  // Create a TextSource, passing null as the delimiter to use the default
  // delimiters ('\n', '\r', or '\r\n').
  TextSource textSource =
      new TextSource(file.getMetadata(), 0, file.getMetadata().sizeBytes(), null);
  String line;
  try {
    BoundedSource.BoundedReader<String> reader =
        textSource
            .createForSubrangeOfFile(file.getMetadata(), 0, file.getMetadata().sizeBytes())
            .createReader(c.getPipelineOptions());
    for (boolean more = reader.start(); more; more = reader.advance()) {
      c.output(reader.getCurrent());
    }
  } catch (IOException e) {
    throw new RuntimeException(
        "Unable to readFile: " + file.getMetadata().resourceId().toString());
  }
}
 
Example 6
Source Project: beam   Source File: ReadAllViaFileBasedSource.java    License: Apache License 2.0
@ProcessElement
public void process(ProcessContext c) throws IOException {
  ReadableFile file = c.element().getKey();
  OffsetRange range = c.element().getValue();
  FileBasedSource<T> source =
      CompressedSource.from(createSource.apply(file.getMetadata().resourceId().toString()))
          .withCompression(file.getCompression());
  try (BoundedSource.BoundedReader<T> reader =
      source
          .createForSubrangeOfFile(file.getMetadata(), range.getFrom(), range.getTo())
          .createReader(c.getPipelineOptions())) {
    for (boolean more = reader.start(); more; more = reader.advance()) {
      c.output(reader.getCurrent());
    }
  }
}
 
Example 7
Source Project: beam   Source File: TFRecordIOTest.java    License: Apache License 2.0
@Test
public void testReadFilesNamed() {
  readPipeline.enableAbandonedNodeEnforcement(false);

  Metadata metadata =
      Metadata.builder()
          .setResourceId(FileSystems.matchNewResource("file", false /* isDirectory */))
          .setIsReadSeekEfficient(true)
          .setSizeBytes(1024)
          .build();
  Create.Values<ReadableFile> create = Create.of(new ReadableFile(metadata, Compression.AUTO));

  assertEquals(
      "TFRecordIO.ReadFiles/Read all via FileBasedSource/Read ranges/ParMultiDo(ReadFileRanges).output",
      readPipeline.apply(create).apply(TFRecordIO.readFiles()).getName());
  assertEquals(
      "MyRead/Read all via FileBasedSource/Read ranges/ParMultiDo(ReadFileRanges).output",
      readPipeline.apply(create).apply("MyRead", TFRecordIO.readFiles()).getName());
}
 
Example 8
Source Project: beam   Source File: TextSourceTest.java    License: Apache License 2.0
@ProcessElement
public void processElement(ProcessContext c) {
  ReadableFile file = c.element();
  ValueProvider<String> filenameProvider =
      ValueProvider.StaticValueProvider.of(file.getMetadata().resourceId().getFilename());
  // Create a TextSource, passing null as the delimiter to use the default
  // delimiters ('\n', '\r', or '\r\n').
  TextSource textSource = new TextSource(filenameProvider, null, null);
  try {
    BoundedSource.BoundedReader<String> reader =
        textSource
            .createForSubrangeOfFile(file.getMetadata(), 0, file.getMetadata().sizeBytes())
            .createReader(c.getPipelineOptions());
    for (boolean more = reader.start(); more; more = reader.advance()) {
      c.output(reader.getCurrent());
    }
  } catch (IOException e) {
    throw new RuntimeException(
        "Unable to readFile: " + file.getMetadata().resourceId().toString());
  }
}
 
Example 9
Source Project: dlp-dataflow-deidentification   Source File: S3Import.java    License: Apache License 2.0
@SplitRestriction
public void splitRestriction(
    KV<String, ReadableFile> file, OffsetRange range, OutputReceiver<OffsetRange> out) {

  for (final OffsetRange p : range.split(1, 1)) {
    out.output(p);
  }
}
 
Example 10
/**
 * SDF needs to define a @GetInitialRestriction method that can create a restriction describing
 * the complete work for a given element. In our case this is the total number of rows in each
 * CSV file. We calculate the number of splits required based on the total number of rows and
 * the batch size provided.
 *
 * @throws IOException
 */
@GetInitialRestriction
public OffsetRange getInitialRestriction(KV<String, ReadableFile> csvFile) throws IOException {

  int rowCount = 0;
  int totalSplit = 0;
  try (BufferedReader br = getReader(csvFile.getValue())) {
    /** assume first row is header */
    int checkRowCount = (int) br.lines().count() - 1;
    rowCount = (checkRowCount < 1) ? 1 : checkRowCount;
    totalSplit = rowCount / batchSize.get().intValue();
    int remaining = rowCount % batchSize.get().intValue();
    /**
     * Adjusting the total number of splits based on the remaining rows. For example, a batch
     * size of 15 for 100 rows gives a total of 7 splits. As it's a range, the last split will
     * have offset range {7,8}.
     */
    if (remaining > 0) {
      totalSplit = totalSplit + 2;

    } else {
      totalSplit = totalSplit + 1;
    }
  }

  LOG.debug("Initial Restriction range from 1 to: {}", totalSplit);
  return new OffsetRange(1, totalSplit);
}
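
The getReader helper called in the try-with-resources block above is not defined anywhere on this page. A minimal sketch of what such a helper might look like, assuming the CSV file should be wrapped as a UTF-8 character stream so its rows can be counted:

// Hypothetical helper, not part of the original source file shown above.
private static BufferedReader getReader(ReadableFile csvFile) throws IOException {
  // openSeekable() returns a SeekableByteChannel over the file's contents
  ReadableByteChannel channel = csvFile.openSeekable();
  return new BufferedReader(Channels.newReader(channel, StandardCharsets.UTF_8.name()));
}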
 
Example 11
/**
 * SDF needs to define a @SplitRestriction method that can split the initial restriction into a
 * number of smaller restrictions. For example, it takes an initial restriction of (x, N) as
 * input and produces the pairs (x, 0), (x, 1), …, (x, N-1) as output.
 */
@SplitRestriction
public void splitRestriction(
    KV<String, ReadableFile> csvFile, OffsetRange range, OutputReceiver<OffsetRange> out) {
  /** split the initial restriction by 1 */
  for (final OffsetRange p : range.split(1, 1)) {
    out.output(p);
  }
}
 
Example 12
/**
 * SDF needs to define a @GetInitialRestriction method that can create a restriction describing
 * the complete work for a given element. In our case this is the total number of rows in each
 * CSV file. We calculate the number of splits required based on the total number of rows and
 * the batch size provided.
 *
 * @throws IOException
 */
@GetInitialRestriction
public OffsetRange getInitialRestriction(@Element KV<String, ReadableFile> csvFile) throws IOException {

  int rowCount = 0;
  int totalSplit = 0;
  try (BufferedReader br = getReader(csvFile.getValue())) {
    /** assume first row is header */
    int checkRowCount = (int) br.lines().count() - 1;
    rowCount = (checkRowCount < 1) ? 1 : checkRowCount;
    totalSplit = rowCount / batchSize.get().intValue();
    int remaining = rowCount % batchSize.get().intValue();
    /**
     * Adjusting the total number of splits based on the remaining rows. For example, a batch
     * size of 15 for 100 rows gives a total of 7 splits. As it's a range, the last split will
     * have offset range {7,8}.
     */
    if (remaining > 0) {
      totalSplit = totalSplit + 2;

    } else {
      totalSplit = totalSplit + 1;
    }
  }

  LOG.debug("Initial Restriction range from 1 to: {}", totalSplit);
  return new OffsetRange(1, totalSplit);
}
 
Example 13
/**
 * SDF needs to define a @SplitRestriction method that can split the initial restriction into a
 * number of smaller restrictions. For example, it takes an initial restriction of (x, N) as
 * input and produces the pairs (x, 0), (x, 1), …, (x, N-1) as output.
 */
@SplitRestriction
public void splitRestriction(
    @Element KV<String, ReadableFile> csvFile, @Restriction OffsetRange range, OutputReceiver<OffsetRange> out) {
  /** split the initial restriction by 1 */
  for (final OffsetRange p : range.split(1, 1)) {
    out.output(p);
  }
}
 
Example 14
Source Project: DataflowTemplates   Source File: FileShard.java    License: Apache License 2.0
@Override
public FileShard decode(InputStream is) throws IOException {
  String tableName = StringUtf8Coder.of().decode(is);
  ReadableFile file = ReadableFileCoder.of().decode(is);
  long from = VarLongCoder.of().decode(is);
  long to = VarLongCoder.of().decode(is);
  return new AutoValue_FileShard(tableName, file, new OffsetRange(from, to));
}
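
The snippet above shows only the decode half of the coder. For orientation, a hedged sketch of the symmetric encode method follows; it is not part of the original source file, and the getTableName() and getRange() accessors are assumed to exist alongside the getFile() accessor shown in Example 26.

// Hypothetical encode counterpart; the field order must mirror decode() above.
@Override
public void encode(FileShard value, OutputStream os) throws IOException {
  StringUtf8Coder.of().encode(value.getTableName(), os);
  ReadableFileCoder.of().encode(value.getFile(), os);
  VarLongCoder.of().encode(value.getRange().getFrom(), os);
  VarLongCoder.of().encode(value.getRange().getTo(), os);
}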
 
Example 15
Source Project: beam   Source File: ReadAllViaFileBasedSource.java    License: Apache License 2.0
@Override
public PCollection<T> expand(PCollection<ReadableFile> input) {
  return input
      .apply("Split into ranges", ParDo.of(new SplitIntoRangesFn(desiredBundleSizeBytes)))
      .apply("Reshuffle", Reshuffle.viaRandomKey())
      .apply("Read ranges", ParDo.of(new ReadFileRangesFn<>(createSource)))
      .setCoder(coder);
}
 
Example 16
Source Project: beam   Source File: XmlIO.java    License: Apache License 2.0
@Override
public PCollection<T> expand(PCollection<ReadableFile> input) {
  return input.apply(
      new ReadAllViaFileBasedSource<>(
          64 * 1024L * 1024L,
          new CreateSourceFn<>(getConfiguration()),
          JAXBCoder.of(getConfiguration().getRecordClass())));
}
 
Example 17
Source Project: beam   Source File: TikaIO.java    License: Apache License 2.0
@ProcessElement
public void processElement(ProcessContext c) throws Exception {
  ReadableFile file = c.element();
  InputStream stream = Channels.newInputStream(file.open());
  try (InputStream tikaStream = TikaInputStream.get(stream)) {
    Parser parser =
        tikaConfig == null ? new AutoDetectParser() : new AutoDetectParser(tikaConfig);

    ParseContext context = new ParseContext();
    context.set(Parser.class, parser);
    Metadata tikaMetadata =
        spec.getInputMetadata() != null ? spec.getInputMetadata() : new Metadata();
    if (spec.getContentTypeHint() != null) {
      tikaMetadata.set(Metadata.CONTENT_TYPE, spec.getContentTypeHint());
    }

    String location = file.getMetadata().resourceId().toString();
    ParseResult res;
    ContentHandler tikaHandler = new ToTextContentHandler();
    try {
      parser.parse(tikaStream, tikaHandler, tikaMetadata, context);
      res = ParseResult.success(location, tikaHandler.toString(), tikaMetadata);
    } catch (Exception e) {
      res = ParseResult.failure(location, tikaHandler.toString(), tikaMetadata, e);
    }

    c.output(res);
  }
}
 
Example 18
Source Project: nomulus   Source File: Transforms.java    License: Apache License 2.0
/**
 * Returns a {@link PTransform} from file {@link Metadata} to {@link VersionedEntity} using
 * caller-provided {@code transformer}.
 */
static PTransform<PCollection<Metadata>, PCollection<VersionedEntity>> processFiles(
    DoFn<ReadableFile, VersionedEntity> transformer) {
  return new PTransform<PCollection<Metadata>, PCollection<VersionedEntity>>() {
    @Override
    public PCollection<VersionedEntity> expand(PCollection<Metadata> input) {
      return input
          .apply(FileIO.readMatches().withCompression(Compression.UNCOMPRESSED))
          .apply(transformer.getClass().getSimpleName(), ParDo.of(transformer));
      // TODO(weiminyu): reshuffle to enable dynamic work rebalance per beam dev guide
    }
  };
}
 
Example 19
Source Project: nomulus   Source File: Transforms.java    License: Apache License 2.0
@ProcessElement
public void processElement(@Element ReadableFile file, OutputReceiver<VersionedEntity> out) {
  try {
    reader.apply(file).forEachRemaining(out::output);
  } catch (Exception e) {
    // Let the pipeline use default retry strategy on the whole file. For GCP Dataflow this
    // means retrying up to 4 times (may include other files grouped with this one), and failing
    // the pipeline if no success.
    throw new RuntimeException(e);
  }
}
 
Example 20
Source Project: dlp-dataflow-deidentification   Source File: S3Import.java    License: Apache License 2.0
public static void main(String[] args) {
  S3ImportOptions options =
      PipelineOptionsFactory.fromArgs(args).withValidation().as(S3ImportOptions.class);

  AWSOptionParser.formatOptions(options);

  Pipeline p = Pipeline.create(options);
  // s3
  PCollection<KV<String, ReadableFile>> s3Files =
      p.apply(
              "Poll S3 Files",
              FileIO.match()
                  .filepattern(options.getS3BucketUrl())
                  .continuously(DEFAULT_POLL_INTERVAL, Watch.Growth.never()))
          .apply("S3 File Match", FileIO.readMatches().withCompression(Compression.AUTO))
          .apply(
              "Add S3 File Name as Key",
              WithKeys.of(file -> file.getMetadata().resourceId().getFilename().toString()))
          .setCoder(KvCoder.of(StringUtf8Coder.of(), ReadableFileCoder.of()));

  // gcs files
  PCollection<KV<String, ReadableFile>> gcsFiles =
      p.apply(
              "Poll GCS Files",
              FileIO.match()
                  .filepattern(options.getGcsBucketUrl())
                  .continuously(DEFAULT_POLL_INTERVAL, Watch.Growth.never()))
          .apply("GCS File Match", FileIO.readMatches().withCompression(Compression.AUTO))
          .apply(
              "Add GCS File Name as Key",
              WithKeys.of(file -> file.getMetadata().resourceId().getFilename().toString()))
          .setCoder(KvCoder.of(StringUtf8Coder.of(), ReadableFileCoder.of()));

  PCollection<KV<String, ReadableFile>> files =
      PCollectionList.of(ImmutableList.of(gcsFiles, s3Files))
          .apply("File List", Flatten.pCollections())
          .apply(
              "Fixed Window",
              Window.<KV<String, ReadableFile>>into(FixedWindows.of(WINDOW_INTERVAL))
                  .triggering(AfterWatermark.pastEndOfWindow())
                  .discardingFiredPanes()
                  .withAllowedLateness(Duration.ZERO));

  PCollectionTuple contents =
      files.apply(
          "Read File Contents",
          ParDo.of(new TextFileReader())
              .withOutputTags(
                  textReaderSuccessElements, TupleTagList.of(textReaderFailedElements)));

  PCollectionTuple inspectedContents =
      contents
          .get(textReaderSuccessElements)
          .apply(
              "DLP Inspection",
              ParDo.of(new TokenizeData(options.getProject(), options.getInspectTemplateName()))
                  .withOutputTags(
                      apiResponseSuccessElements, TupleTagList.of(apiResponseFailedElements)));

  inspectedContents
      .get(apiResponseSuccessElements)
      .apply(
          "BQ Write",
          BigQueryIO.<KV<String, TableRow>>write()
              .to(new BQDestination(options.getDataSetId(), options.getProject()))
              .withFormatFunction(
                  element -> {
                    return element.getValue();
                  })
              .withWriteDisposition(BigQueryIO.Write.WriteDisposition.WRITE_APPEND)
              .withoutValidation()
              .withFailedInsertRetryPolicy(InsertRetryPolicy.retryTransientErrors())
              .withCreateDisposition(BigQueryIO.Write.CreateDisposition.CREATE_IF_NEEDED));

  PCollectionList.of(
          ImmutableList.of(
              contents.get(textReaderFailedElements),
              inspectedContents.get(apiResponseFailedElements)))
      .apply("Combine Error Logs", Flatten.pCollections())
      .apply(
          "Write Error Logs",
          ParDo.of(
              new DoFn<String, String>() {
                @ProcessElement
                public void processElement(ProcessContext c) {
                  LOG.error("***ERROR*** {}", c.element().toString());
                  c.output(c.element());
                }
              }));

  p.run();
}
 
Example 21
Source Project: dlp-dataflow-deidentification   Source File: S3Import.java    License: Apache License 2.0
private static SeekableByteChannel getReader(ReadableFile eventFile) throws IOException {
  SeekableByteChannel channel = null;
  channel = eventFile.openSeekable();
  return channel;
}
 
Example 22
Source Project: gcp-ingestion   Source File: FailureMessage.java    License: Mozilla Public License 2.0
/**
 * Return a PubsubMessage corresponding to an error reading from a file.
 */
public static PubsubMessage of(Object caller, ReadableFile readableFile, Throwable e) {
  Map<String, String> attributes = errorAttributes(caller, e);
  attributes.put("readable_file", readableFile.toString());
  return new PubsubMessage("{}".getBytes(StandardCharsets.UTF_8), attributes);
}
 
Example 23
Source Project: DataflowTemplates   Source File: FileShard.java    License: Apache License 2.0
static FileShard create(String tableName, ReadableFile file, OffsetRange range) {
  Preconditions.checkNotNull(tableName);
  Preconditions.checkNotNull(file);
  Preconditions.checkNotNull(range);
  return new AutoValue_FileShard(tableName, file, range);
}
 
Example 24
Source Project: beam   Source File: TikaIO.java    License: Apache License 2.0
@Override
public PCollection<ParseResult> expand(PCollection<ReadableFile> input) {
  return input.apply(ParDo.of(new ParseToStringFn(this)));
}
 
Example 25
Source Project: nomulus   Source File: Transforms.java    License: Apache License 2.0
private BackupFileReader(ProcessFunction<ReadableFile, Iterator<VersionedEntity>> reader) {
  this.reader = reader;
}
 
Example 26
Source Project: DataflowTemplates   Source File: FileShard.java    License: Apache License 2.0
abstract ReadableFile getFile();