Java Code Examples for org.apache.parquet.hadoop.Footer

The following examples show how to use org.apache.parquet.hadoop.Footer. These examples are extracted from open source projects. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source Project: tajo   Source File: ParquetReader.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Creates a reader over the Parquet footers found under {@code file}.
 *
 * <p>Lists {@code file} with {@code HiddenFileFilter.INSTANCE}, reads the footers of
 * the listed entries, keeps an iterator over them for later use, and adds the row
 * count of every block of every footer to {@code totalRowCount}.
 *
 * @param conf        Hadoop configuration used to resolve the file system and read footers
 * @param file        path to list
 * @param readSupport read support stored on this reader
 * @param filter      filter stored on this reader; passed through {@code checkNotNull}
 * @throws IOException declared by the signature; listing and footer reading are the only I/O here
 */
private ParquetReader(Configuration conf,
                      Path file,
                      ReadSupport<T> readSupport,
                      Filter filter) throws IOException {
  this.readSupport = readSupport;
  this.filter = checkNotNull(filter, "filter");
  this.conf = conf;

  final FileStatus[] listed = file.getFileSystem(conf).listStatus(file, HiddenFileFilter.INSTANCE);
  final List<Footer> allFooters =
      ParquetFileReader.readAllFootersInParallelUsingSummaryFiles(conf, Arrays.asList(listed), false);
  this.footersIterator = allFooters.iterator();

  // Pre-compute the total number of rows across every row group of every file.
  for (Footer current : allFooters) {
    final List<BlockMetaData> rowGroups = current.getParquetMetadata().getBlocks();
    for (BlockMetaData rowGroup : rowGroups) {
      totalRowCount += rowGroup.getRowCount();
    }
  }
}
 
Example 2
Source Project: tajo   Source File: ParquetReader.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Closes the current record reader, if any, and opens one for the next footer.
 *
 * <p>When the footer iterator is exhausted, {@code reader} is left {@code null}.
 *
 * @throws IOException declared by the signature; closing and initializing readers are the I/O here
 */
private void initReader() throws IOException {
  if (reader != null) {
    reader.close();
    reader = null;
  }
  if (!footersIterator.hasNext()) {
    return;
  }

  final Footer next = footersIterator.next();

  // Keep only the row groups that pass the configured filter.
  final List<BlockMetaData> selected = RowGroupFilter.filterRowGroups(
      filter,
      next.getParquetMetadata().getBlocks(),
      next.getParquetMetadata().getFileMetaData().getSchema());

  reader = new InternalParquetRecordReader<T>(readSupport, filter);
  reader.initialize(
      next.getParquetMetadata().getFileMetaData(),
      next.getFile(),
      selected,
      conf);
}
 
Example 3
Source Project: parquet-mr   Source File: ShowMetaCommand.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Prints the footer details of the input given as the first positional argument.
 *
 * <p>For every footer read from the input, writes a {@code file:} line followed by
 * the output of {@code MetadataUtils.showDetails}; the {@code 'o'} option is
 * forwarded to that call.
 *
 * @param options parsed command line; the first positional argument is the input path
 * @throws Exception declared by the signature
 */
@Override
public void execute(CommandLine options) throws Exception {
  super.execute(options);

  final String input = options.getArgs()[0];
  final boolean withOriginalTypes = options.hasOption('o');

  final Configuration conf = new Configuration();
  final Path inputPath = new Path(input);
  final FileStatus status = inputPath.getFileSystem(conf).getFileStatus(inputPath);
  final List<Footer> footers = ParquetFileReader.readFooters(conf, status, false);

  final PrettyPrintWriter out = PrettyPrintWriter.stdoutPrettyPrinter()
      .withAutoColumn()
      .withWhitespaceHandler(WhiteSpaceHandler.COLLAPSE_WHITESPACE)
      .withColumnPadding(1)
      .build();

  for (Footer footer : footers) {
    out.format("file: %s%n", footer.getFile());
    MetadataUtils.showDetails(out, footer.getParquetMetadata(), withOriginalTypes);
    out.flushColumns();
  }
}
 
Example 4
Source Project: parquet-mr   Source File: DeprecatedParquetInputFormat.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Computes the input splits for {@code job}.
 *
 * <p>When {@code isTaskSideMetaData(job)} is true the work is delegated to the
 * superclass. Otherwise the footers are read, the wrapped input format computes
 * the splits from them, and each one is wrapped in a {@code ParquetInputSplitWrapper}.
 *
 * @param job       job configuration
 * @param numSplits split-count hint; only forwarded to the superclass path
 * @return the wrapped splits, or {@code null} when the wrapped format returns {@code null}
 * @throws IOException declared by the signature
 */
@Override
public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
  if (isTaskSideMetaData(job)) {
    return super.getSplits(job, numSplits);
  }

  final List<Footer> jobFooters = getFooters(job);
  final List<ParquetInputSplit> parquetSplits = realInputFormat.getSplits(job, jobFooters);
  if (parquetSplits == null) {
    return null;
  }

  final InputSplit[] wrapped = new InputSplit[parquetSplits.size()];
  int next = 0;
  for (ParquetInputSplit parquetSplit : parquetSplits) {
    wrapped[next] = new ParquetInputSplitWrapper(parquetSplit);
    next++;
  }
  return wrapped;
}
 
Example 5
/**
 * Reads the schema of the Parquet file at {@code file} and converts it to input fields.
 *
 * <p>The work runs inside {@code inClassloader}. The schema is taken from the first
 * footer only.
 *
 * @param file location of the file; passed through the {@code S3NCredentialUtils} helpers first
 * @return the fields built from the schema, or an empty list when no footer was read
 * @throws Exception declared by the signature
 */
@Override
public List<IParquetInputField> readSchema( String file ) throws Exception {
  return inClassloader( () -> {
    Configuration conf = job.getConfiguration();
    S3NCredentialUtils.applyS3CredentialsToHadoopConfigurationIfNecessary( file, conf );
    Path scrubbedPath = new Path( S3NCredentialUtils.scrubFilePathIfNecessary( file ) );
    FileSystem fileSystem = FileSystem.get( scrubbedPath.toUri(), conf );
    List<Footer> footers =
      ParquetFileReader.readFooters( conf, fileSystem.getFileStatus( scrubbedPath ), true );

    // Nothing to describe when no footer could be read.
    if ( footers.isEmpty() ) {
      return new ArrayList<>();
    }

    return ParquetConverter.buildInputFields(
      footers.get( 0 ).getParquetMetadata().getFileMetaData().getSchema() );
  } );
}
 
Example 6
Source Project: Bats   Source File: FooterGatherer.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Collects the footers for the given file statuses.
 *
 * <p>For a directory, a summary file named {@code ParquetFileWriter.PARQUET_METADATA_FILE}
 * is used when it exists. Otherwise a {@code FooterReader} is queued for each file listed
 * in the directory. A non-directory status gets a {@code FooterReader} of its own. The
 * queued readers are then run through {@code TimedCallable.run}.
 *
 * @param conf        Hadoop configuration
 * @param statuses    files and/or directories to inspect
 * @param parallelism forwarded to {@code TimedCallable.run}
 * @return footers from summary files first, followed by those returned by the queued readers
 * @throws IOException declared by the signature
 */
public static List<Footer> getFooters(final Configuration conf, List<FileStatus> statuses, int parallelism) throws IOException {
  final List<TimedCallable<Footer>> pendingReads = new ArrayList<>();
  final List<Footer> collected = new ArrayList<>();

  for (FileStatus status : statuses) {
    if (!status.isDirectory()) {
      pendingReads.add(new FooterReader(conf, status));
      continue;
    }

    final Path dir = status.getPath();
    final FileSystem fs = dir.getFileSystem(conf);

    // A summary file, when present, replaces reading the individual files.
    final Path summaryPath = new Path(dir, ParquetFileWriter.PARQUET_METADATA_FILE);
    if (fs.exists(summaryPath)) {
      collected.addAll(ParquetFileReader.readSummaryFile(conf, fs.getFileStatus(summaryPath)));
    } else {
      for (FileStatus child : DrillFileSystemUtil.listFiles(fs, dir, false)) {
        pendingReads.add(new FooterReader(conf, child));
      }
    }
  }

  if (!pendingReads.isEmpty()) {
    collected.addAll(TimedCallable.run("Fetch Parquet Footers", logger, pendingReads, parallelism));
  }
  return collected;
}
 
Example 7
Source Project: Bats   Source File: FooterGatherer.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * An updated footer reader that tries to read the entire footer without knowing the length.
 * This should reduce the amount of seek/read roundtrips in most workloads.
 *
 * <p>It reads up to {@code DEFAULT_READ_SIZE} bytes from the end of the file in one
 * call, checks the trailing magic bytes, and reads the footer length from that buffer.
 * A second read is issued only when the footer does not fit in the first buffer.
 *
 * @param config Hadoop configuration used to resolve the file system of {@code status}
 * @param status status of the file to read; supplies both its path and its length
 * @return a {@link Footer} pairing the file path with the parsed metadata
 * @throws IOException declared by the signature; opening and reading the file are the I/O here
 */
public static Footer readFooter(final Configuration config, final FileStatus status) throws IOException {
  final FileSystem fs = status.getPath().getFileSystem(config);
  try(FSDataInputStream file = fs.open(status.getPath())) {

    final long fileLength = status.getLen();
    // NOTE(review): presumably throws IllegalArgumentException on failure - confirm which Preconditions class this is.
    Preconditions.checkArgument(fileLength >= MIN_FILE_SIZE, "%s is not a Parquet file (too small)", status.getPath());

    // Read the last `len` bytes of the file: the whole file when it is shorter than DEFAULT_READ_SIZE.
    int len = (int) Math.min( fileLength, (long) DEFAULT_READ_SIZE);
    byte[] footerBytes = new byte[len];
    readFully(file, fileLength - len, footerBytes, 0, len);

    // The magic is checked at the very end of the buffer; the footer length is the
    // little-endian int that starts FOOTER_METADATA_SIZE bytes before the end.
    checkMagicBytes(status, footerBytes, footerBytes.length - ParquetFileWriter.MAGIC.length);
    final int size = BytesUtils.readIntLittleEndian(footerBytes, footerBytes.length - FOOTER_METADATA_SIZE);

    if(size > footerBytes.length - FOOTER_METADATA_SIZE){
      // if the footer is larger than our initial read, we need to read the rest.
      byte[] origFooterBytes = footerBytes;
      // Number of footer bytes already in hand (the buffer minus the trailing length/magic section).
      int origFooterRead = origFooterBytes.length - FOOTER_METADATA_SIZE;

      footerBytes = new byte[size];

      // Fetch only the missing leading part of the footer, then append the bytes already read.
      readFully(file, fileLength - size - FOOTER_METADATA_SIZE, footerBytes, 0, size - origFooterRead);
      System.arraycopy(origFooterBytes, 0, footerBytes, size - origFooterRead, origFooterRead);
    }else{
      // The whole footer is already in the buffer: slice out the `size` bytes that
      // end just before the trailing FOOTER_METADATA_SIZE bytes.
      int start = footerBytes.length - (size + FOOTER_METADATA_SIZE);
      footerBytes = ArrayUtils.subarray(footerBytes, start, start + size);
    }

    final ByteArrayInputStream from = new ByteArrayInputStream(footerBytes);
    ParquetMetadata metadata = ParquetFormatPlugin.parquetMetadataConverter.readParquetMetadata(from, NO_FILTER);
    Footer footer = new Footer(status.getPath(), metadata);
    return footer;
  }
}
 
Example 8
Source Project: dremio-oss   Source File: ParquetRecordReaderTest.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Asserts the expected shape of the footers produced by the test fixture.
 *
 * <p>Expects exactly three footers. Each must point to an existing local file whose
 * name starts with {@code "part"}, contain two blocks, and carry the key/value
 * entries {@code foo=bar} and {@code <file name>=<file name>}.
 *
 * @param metadata footers to validate
 */
private void validateFooters(final List<Footer> metadata) {
  logger.debug(metadata.toString());
  assertEquals(3, metadata.size());

  for (Footer footer : metadata) {
    final File localFile = new File(footer.getFile().toUri());
    assertTrue(localFile.getName(), localFile.getName().startsWith("part"));
    assertTrue(localFile.getPath(), localFile.exists());

    final ParquetMetadata parsed = footer.getParquetMetadata();
    assertEquals(2, parsed.getBlocks().size());

    final Map<String, String> extras = parsed.getFileMetaData().getKeyValueMetaData();
    assertEquals("bar", extras.get("foo"));

    // Each file is expected to store its own name under a key equal to that name.
    final String footerFileName = footer.getFile().getName();
    assertEquals(footerFileName, extras.get(footer.getFile().getName()));
  }
}
 
Example 9
Source Project: parquet-mr   Source File: RowCountCommand.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Prints the total number of rows in the Parquet file(s) matched by the input path.
 *
 * <p>The first positional argument is treated as a path or glob. Row counts are summed
 * over every block of every footer of every matched file. With the {@code 'd'} option a
 * per-file line is printed as well.
 *
 * @param options parsed command line; the first positional argument is the input path/glob
 * @throws java.io.FileNotFoundException when the input is a plain (non-glob) path that does
 *         not exist, in which case {@code globStatus} returns {@code null}
 * @throws Exception declared by the signature
 */
@Override
public void execute(CommandLine options) throws Exception {
  super.execute(options);

  String[] args = options.getArgs();
  String input = args[0];
  out = new PrintWriter(Main.out, true);
  inputPath = new Path(input);
  conf = new Configuration();
  inputFileStatuses = inputPath.getFileSystem(conf).globStatus(inputPath);
  // globStatus returns null (not an empty array) for a non-glob path that does not
  // exist; fail with a clear message instead of a NullPointerException in the loop.
  if (inputFileStatuses == null) {
    throw new java.io.FileNotFoundException("Input path does not exist: " + inputPath);
  }

  final boolean detailed = options.hasOption('d');
  long rowCount = 0;

  for (FileStatus fs : inputFileStatuses) {
    long fileRowCount=0;
    for (Footer f : ParquetFileReader.readFooters(conf, fs, false)) {
      for (BlockMetaData b : f.getParquetMetadata().getBlocks()) {
        rowCount += b.getRowCount();
        fileRowCount += b.getRowCount();
      }
    }
    if (detailed) {
      out.format("%s row count: %d\n", fs.getPath().getName(), fileRowCount);
    }
  }

  out.format("Total RowCount: %d", rowCount);
  out.println();
}
 
Example 10
Source Project: parquet-mr   Source File: SizeCommand.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Prints the total size of the Parquet file(s) matched by the input path.
 *
 * <p>The first positional argument is treated as a path or glob. Sizes are summed over
 * every block of every footer of every matched file. Options read here:
 * <ul>
 *   <li>{@code 'u'} - sum {@code getTotalByteSize()} instead of {@code getCompressedSize()}</li>
 *   <li>{@code 'd'} - also print one line per file</li>
 *   <li>{@code 'p'} - format sizes with {@code getPrettySize} instead of raw byte counts</li>
 * </ul>
 *
 * @param options parsed command line; the first positional argument is the input path/glob
 * @throws java.io.FileNotFoundException when the input is a plain (non-glob) path that does
 *         not exist, in which case {@code globStatus} returns {@code null}
 * @throws Exception declared by the signature
 */
@Override
public void execute(CommandLine options) throws Exception {
  super.execute(options);

  String[] args = options.getArgs();
  String input = args[0];
  out = new PrintWriter(Main.out, true);
  inputPath = new Path(input);
  conf = new Configuration();
  inputFileStatuses = inputPath.getFileSystem(conf).globStatus(inputPath);
  // globStatus returns null (not an empty array) for a non-glob path that does not
  // exist; fail with a clear message instead of a NullPointerException in the loop.
  if (inputFileStatuses == null) {
    throw new java.io.FileNotFoundException("Input path does not exist: " + inputPath);
  }

  // The options do not change while iterating, so query them once.
  final boolean uncompressed = options.hasOption('u');
  final boolean detailed = options.hasOption('d');
  final boolean pretty = options.hasOption('p');

  long size = 0;
  for (FileStatus fs : inputFileStatuses) {
    long fileSize = 0;
    for (Footer f : ParquetFileReader.readFooters(conf, fs, false)) {
      for (BlockMetaData b : f.getParquetMetadata().getBlocks()) {
        // Compute the block's contribution once and add it to both totals.
        final long blockSize = uncompressed ? b.getTotalByteSize() : b.getCompressedSize();
        size += blockSize;
        fileSize += blockSize;
      }
    }
    if (detailed) {
      if (pretty) {
        out.format("%s: %s\n", fs.getPath().getName(), getPrettySize(fileSize));
      }
      else {
        out.format("%s: %d bytes\n", fs.getPath().getName(), fileSize);
      }
    }
  }

  if (pretty) {
    out.format("Total Size: %s", getPrettySize(size));
  }
  else {
    out.format("Total Size: %d bytes", size);
  }
  out.println();
}
 
Example 11
Source Project: Bats   Source File: FooterGatherer.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Reads the footer for this task by delegating to {@code readFooter(conf, status)}.
 *
 * @return the footer returned by {@code readFooter}
 * @throws Exception declared by the signature
 */
@Override
protected Footer runInner() throws Exception {
  final Footer result = readFooter(conf, status);
  return result;
}
 
Example 12
Source Project: parquet-mr   Source File: ParquetTupleScheme.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Reads the footers for the path of {@code hfs}.
 *
 * <p>Works on a copy of the flow's configuration: the path is added as an input path on
 * a fresh {@code DeprecatedParquetInputFormat}, which is then asked for the footers.
 *
 * @param flowProcess source of the configuration copy
 * @param hfs         tap whose path is read
 * @return the footers reported by the input format
 * @throws IOException declared by the signature
 */
private List<Footer> getFooters(FlowProcess<JobConf> flowProcess, Hfs hfs) throws IOException {
  final JobConf configCopy = flowProcess.getConfigCopy();
  final DeprecatedParquetInputFormat inputFormat = new DeprecatedParquetInputFormat();
  inputFormat.addInputPath(configCopy, hfs.getPath());
  final List<Footer> footers = inputFormat.getFooters(configCopy);
  return footers;
}
 
Example 13
Source Project: parquet-mr   Source File: DeprecatedParquetInputFormat.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Returns the footers for the inputs of {@code job}.
 *
 * <p>The statuses from the superclass {@code listStatus} are wrapped in a list and
 * handed to the wrapped input format.
 *
 * @param job job configuration
 * @return the footers reported by the wrapped input format
 * @throws IOException declared by the signature
 */
public List<Footer> getFooters(JobConf job) throws IOException {
  final List<Footer> footers =
      realInputFormat.getFooters(job, asList(super.listStatus(job)));
  return footers;
}
 
Example 14
Source Project: parquet-mr   Source File: ParquetTupleScheme.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Reads the footers for the path of {@code hfs}.
 *
 * <p>Works on a copy of the flow's configuration: the path is added as an input path on
 * a fresh {@code DeprecatedParquetInputFormat}, which is then asked for the footers.
 *
 * @param flowProcess source of the configuration copy
 * @param hfs         tap whose path is read
 * @return the footers reported by the input format
 * @throws IOException declared by the signature
 */
private List<Footer> getFooters(FlowProcess<? extends JobConf> flowProcess, Hfs hfs) throws IOException {
  final JobConf configCopy = flowProcess.getConfigCopy();
  final DeprecatedParquetInputFormat inputFormat = new DeprecatedParquetInputFormat();
  inputFormat.addInputPath(configCopy, hfs.getPath());
  final List<Footer> footers = inputFormat.getFooters(configCopy);
  return footers;
}
 
Example 15
/**
 * Returns the footers for the inputs of {@code job}.
 *
 * <p>The statuses from the superclass {@code listStatus} are wrapped with
 * {@code Arrays.asList} and handed to the wrapped input format.
 *
 * @param job job configuration
 * @return the footers reported by the wrapped input format
 * @throws IOException declared by the signature
 */
public List<Footer> getFooters(JobConf job) throws IOException {
  final List<Footer> footers =
      realInputFormat.getFooters(job, Arrays.asList(super.listStatus(job)));
  return footers;
}