org.apache.parquet.hadoop.Footer Java Examples

The following examples show how to use org.apache.parquet.hadoop.Footer. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: ParquetReader.java    From tajo with Apache License 2.0 6 votes vote down vote up
/**
 * Builds a reader over every entry that {@code listStatus} returns for {@code file}
 * when filtered with {@code HiddenFileFilter.INSTANCE}.
 *
 * <p>All footers are loaded up front so that the total row count can be summed from
 * the row-group metadata before any record is read.
 *
 * @param conf        configuration used to resolve the file system and read the footers
 * @param file        path whose entries are listed and read
 * @param readSupport read support stored for later use by this reader
 * @param filter      filter to apply; must not be null
 * @throws IOException declared for the file listing and footer reading calls
 */
private ParquetReader(Configuration conf,
                      Path file,
                      ReadSupport<T> readSupport,
                      Filter filter) throws IOException {
  this.readSupport = readSupport;
  this.filter = checkNotNull(filter, "filter");
  this.conf = conf;

  final List<FileStatus> visibleEntries =
      Arrays.asList(file.getFileSystem(conf).listStatus(file, HiddenFileFilter.INSTANCE));
  final List<Footer> allFooters =
      ParquetFileReader.readAllFootersInParallelUsingSummaryFiles(conf, visibleEntries, false);
  this.footersIterator = allFooters.iterator();

  // Accumulate the row count of every row group of every footer.
  for (Footer current : allFooters) {
    final List<BlockMetaData> rowGroups = current.getParquetMetadata().getBlocks();
    for (BlockMetaData rowGroup : rowGroups) {
      totalRowCount += rowGroup.getRowCount();
    }
  }
}
 
Example #2
Source File: ParquetReader.java    From tajo with Apache License 2.0 6 votes vote down vote up
/**
 * Closes the current internal reader, if any, and opens one for the next footer.
 *
 * <p>When no footer is left, {@code reader} stays {@code null}.
 *
 * @throws IOException declared for closing the previous reader and initializing the new one
 */
private void initReader() throws IOException {
  if (reader != null) {
    reader.close();
    reader = null;
  }

  if (!footersIterator.hasNext()) {
    return;
  }

  final Footer current = footersIterator.next();
  final List<BlockMetaData> allRowGroups = current.getParquetMetadata().getBlocks();
  final MessageType schema = current.getParquetMetadata().getFileMetaData().getSchema();

  // Only the row groups that pass the filter are handed to the record reader.
  final List<BlockMetaData> keptRowGroups =
      RowGroupFilter.filterRowGroups(filter, allRowGroups, schema);

  reader = new InternalParquetRecordReader<T>(readSupport, filter);
  reader.initialize(
      current.getParquetMetadata().getFileMetaData(),
      current.getFile(),
      keptRowGroups,
      conf);
}
 
Example #3
Source File: ShowMetaCommand.java    From parquet-mr with Apache License 2.0 6 votes vote down vote up
/**
 * Prints the metadata of every footer read for the input path given as the first argument.
 *
 * <p>The presence of option {@code 'o'} is forwarded to {@code MetadataUtils.showDetails}
 * as its {@code showOriginalTypes} argument.
 *
 * @param options parsed command line; its first positional argument is the input path
 * @throws Exception declared for the file system and footer reading calls
 */
@Override
public void execute(CommandLine options) throws Exception {
  super.execute(options);

  final String[] positional = options.getArgs();
  final String inputLocation = positional[0];
  final boolean showOriginalTypes = options.hasOption('o');

  final Configuration conf = new Configuration();
  final Path inputPath = new Path(inputLocation);
  final FileStatus inputStatus = inputPath.getFileSystem(conf).getFileStatus(inputPath);
  final List<Footer> footers = ParquetFileReader.readFooters(conf, inputStatus, false);

  final PrettyPrintWriter printer = PrettyPrintWriter.stdoutPrettyPrinter()
      .withAutoColumn()
      .withWhitespaceHandler(WhiteSpaceHandler.COLLAPSE_WHITESPACE)
      .withColumnPadding(1)
      .build();

  for (Footer footer : footers) {
    printer.format("file: %s%n", footer.getFile());
    MetadataUtils.showDetails(printer, footer.getParquetMetadata(), showOriginalTypes);
    printer.flushColumns();
  }
}
 
Example #4
Source File: DeprecatedParquetInputFormat.java    From parquet-mr with Apache License 2.0 6 votes vote down vote up
/**
 * Computes the input splits for the job.
 *
 * <p>When {@code isTaskSideMetaData(job)} is true the superclass computes the splits.
 * Otherwise the footers are read, the wrapped input format computes the Parquet splits,
 * and each one is wrapped in a {@code ParquetInputSplitWrapper}.
 *
 * @param job       job configuration
 * @param numSplits split count hint, only used on the superclass path
 * @return the wrapped splits, or {@code null} when the wrapped input format returns {@code null}
 * @throws IOException declared for the footer reading and split computation calls
 */
@Override
public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
  if (isTaskSideMetaData(job)) {
    return super.getSplits(job, numSplits);
  }

  final List<Footer> footers = getFooters(job);
  final List<ParquetInputSplit> parquetSplits = realInputFormat.getSplits(job, footers);
  if (parquetSplits == null) {
    return null;
  }

  final InputSplit[] wrapped = new InputSplit[parquetSplits.size()];
  int slot = 0;
  for (ParquetInputSplit parquetSplit : parquetSplits) {
    wrapped[slot] = new ParquetInputSplitWrapper(parquetSplit);
    slot++;
  }
  return wrapped;
}
 
Example #5
Source File: PentahoApacheInputFormat.java    From pentaho-hadoop-shims with Apache License 2.0 6 votes vote down vote up
/**
 * Reads the schema of a Parquet file and converts it to input fields.
 *
 * <p>The work runs inside {@code inClassloader}. The schema is taken from the first
 * footer returned by {@code ParquetFileReader.readFooters}.
 *
 * @param file location of the file whose schema is read
 * @return the fields built from the file schema, or an empty list when no footer was read
 * @throws Exception declared by this method; presumably propagated from {@code inClassloader}
 */
@Override
public List<IParquetInputField> readSchema( String file ) throws Exception {
  return inClassloader( () -> {
    final Configuration hadoopConf = job.getConfiguration();
    S3NCredentialUtils.applyS3CredentialsToHadoopConfigurationIfNecessary( file, hadoopConf );

    final Path parquetPath = new Path( S3NCredentialUtils.scrubFilePathIfNecessary( file ) );
    final FileStatus parquetStatus =
      FileSystem.get( parquetPath.toUri(), hadoopConf ).getFileStatus( parquetPath );
    final List<Footer> footers = ParquetFileReader.readFooters( hadoopConf, parquetStatus, true );

    if ( footers.isEmpty() ) {
      return new ArrayList<>();
    }

    final MessageType schema =
      footers.get( 0 ).getParquetMetadata().getFileMetaData().getSchema();
    return ParquetConverter.buildInputFields( schema );
  } );
}
 
Example #6
Source File: FooterGatherer.java    From Bats with Apache License 2.0 5 votes vote down vote up
/**
 * Collects the footers for the given file statuses.
 *
 * <p>For a directory that contains a summary file ({@code ParquetFileWriter.PARQUET_METADATA_FILE})
 * the footers come from that summary. For any other directory one footer read is queued per
 * entry returned by {@code DrillFileSystemUtil.listFiles}. A plain file gets one queued read.
 * Queued reads are executed through {@code TimedCallable.run} with the given parallelism.
 *
 * @param conf        configuration used to resolve file systems and read footers
 * @param statuses    files and directories to inspect
 * @param parallelism value passed to {@code TimedCallable.run}
 * @return footers from summary files followed by the footers from the queued reads
 * @throws IOException declared for the file system and footer reading calls
 */
public static List<Footer> getFooters(final Configuration conf, List<FileStatus> statuses, int parallelism) throws IOException {
  final List<Footer> collected = new ArrayList<>();
  final List<TimedCallable<Footer>> pendingReads = new ArrayList<>();

  for (FileStatus entry : statuses) {
    if (!entry.isDirectory()) {
      pendingReads.add(new FooterReader(conf, entry));
      continue;
    }

    final Path directory = entry.getPath();
    final FileSystem fs = directory.getFileSystem(conf);
    final Path summaryPath = new Path(directory, ParquetFileWriter.PARQUET_METADATA_FILE);

    if (fs.exists(summaryPath)) {
      // A summary file covers the whole directory, so no per-file reads are queued.
      collected.addAll(ParquetFileReader.readSummaryFile(conf, fs.getFileStatus(summaryPath)));
    } else {
      for (FileStatus child : DrillFileSystemUtil.listFiles(fs, directory, false)) {
        pendingReads.add(new FooterReader(conf, child));
      }
    }
  }

  if (pendingReads.isEmpty()) {
    return collected;
  }

  collected.addAll(TimedCallable.run("Fetch Parquet Footers", logger, pendingReads, parallelism));
  return collected;
}
 
Example #7
Source File: FooterGatherer.java    From Bats with Apache License 2.0 5 votes vote down vote up
/**
 * An updated footer reader that tries to read the entire footer without knowing the length.
 * This should reduce the amount of seek/read roundtrips in most workloads.
 *
 * <p>The last {@code min(fileLength, DEFAULT_READ_SIZE)} bytes of the file are read in one call.
 * A second read is issued only when the footer length stored in the file is larger than the
 * part of the footer captured by that first read.
 *
 * @param config configuration used to resolve the file system of the file's path
 * @param status status of the file to read; supplies both its path and its length
 * @return a {@link Footer} pairing the file path with the metadata parsed from the footer bytes
 * @throws IOException declared for the open and read calls on the file
 */
public static Footer readFooter(final Configuration config, final FileStatus status) throws IOException {
  final FileSystem fs = status.getPath().getFileSystem(config);
  try(FSDataInputStream file = fs.open(status.getPath())) {

    final long fileLength = status.getLen();
    // Rejects files shorter than MIN_FILE_SIZE before any read is attempted.
    Preconditions.checkArgument(fileLength >= MIN_FILE_SIZE, "%s is not a Parquet file (too small)", status.getPath());

    // First read: the tail of the file, capped at DEFAULT_READ_SIZE bytes.
    int len = (int) Math.min( fileLength, (long) DEFAULT_READ_SIZE);
    byte[] footerBytes = new byte[len];
    readFully(file, fileLength - len, footerBytes, 0, len);

    // The magic bytes are checked at the very end of the buffer; the footer length is the
    // little-endian int that starts FOOTER_METADATA_SIZE bytes before the end of the buffer.
    checkMagicBytes(status, footerBytes, footerBytes.length - ParquetFileWriter.MAGIC.length);
    final int size = BytesUtils.readIntLittleEndian(footerBytes, footerBytes.length - FOOTER_METADATA_SIZE);

    if(size > footerBytes.length - FOOTER_METADATA_SIZE){
      // if the footer is larger than our initial read, we need to read the rest.
      byte[] origFooterBytes = footerBytes;
      // Footer bytes already held in the first buffer (everything except the trailing
      // FOOTER_METADATA_SIZE bytes).
      int origFooterRead = origFooterBytes.length - FOOTER_METADATA_SIZE;

      footerBytes = new byte[size];

      // Second read: only the leading part of the footer that the first read missed,
      // then the already-read part is appended behind it.
      readFully(file, fileLength - size - FOOTER_METADATA_SIZE, footerBytes, 0, size - origFooterRead);
      System.arraycopy(origFooterBytes, 0, footerBytes, size - origFooterRead, origFooterRead);
    }else{
      // The whole footer is inside the first buffer: slice out exactly `size` bytes ending
      // right before the trailing FOOTER_METADATA_SIZE bytes.
      int start = footerBytes.length - (size + FOOTER_METADATA_SIZE);
      footerBytes = ArrayUtils.subarray(footerBytes, start, start + size);
    }

    final ByteArrayInputStream from = new ByteArrayInputStream(footerBytes);
    ParquetMetadata metadata = ParquetFormatPlugin.parquetMetadataConverter.readParquetMetadata(from, NO_FILTER);
    Footer footer = new Footer(status.getPath(), metadata);
    return footer;
  }
}
 
Example #8
Source File: ParquetRecordReaderTest.java    From dremio-oss with Apache License 2.0 5 votes vote down vote up
/**
 * Asserts the expected shape of the footers read by the test.
 *
 * <p>Expects exactly three footers. Each must point to an existing local file whose name
 * starts with {@code "part"}, hold two row groups, map {@code "foo"} to {@code "bar"} in its
 * key/value metadata, and map its own file name to itself.
 *
 * @param metadata footers to validate
 */
private void validateFooters(final List<Footer> metadata) {
  logger.debug(metadata.toString());
  assertEquals(3, metadata.size());

  for (Footer current : metadata) {
    final File localFile = new File(current.getFile().toUri());
    final String localName = localFile.getName();
    assertTrue(localName, localName.startsWith("part"));
    assertTrue(localFile.getPath(), localFile.exists());

    final ParquetMetadata parsed = current.getParquetMetadata();
    assertEquals(2, parsed.getBlocks().size());

    final Map<String, String> keyValues = parsed.getFileMetaData().getKeyValueMetaData();
    assertEquals("bar", keyValues.get("foo"));

    final String footerFileName = current.getFile().getName();
    assertEquals(footerFileName, keyValues.get(footerFileName));
  }
}
 
Example #9
Source File: RowCountCommand.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Prints the total row count of all files matched by the input path or glob.
 *
 * <p>With option {@code 'd'} the row count of each matched file is printed as well.
 *
 * @param options parsed command line; its first positional argument is the input path or glob
 * @throws java.io.FileNotFoundException if the input path matches nothing and
 *         {@code globStatus} reports that by returning {@code null}
 * @throws Exception declared for the file system and footer reading calls
 */
@Override
public void execute(CommandLine options) throws Exception {
  super.execute(options);

  String[] args = options.getArgs();
  String input = args[0];
  out = new PrintWriter(Main.out, true);
  inputPath = new Path(input);
  conf = new Configuration();
  inputFileStatuses = inputPath.getFileSystem(conf).globStatus(inputPath);
  // globStatus returns null (not an empty array) for a non-existent path without glob
  // characters; fail with a clear message instead of a NullPointerException in the loop.
  if (inputFileStatuses == null) {
    throw new java.io.FileNotFoundException("Input path does not exist: " + inputPath);
  }

  final boolean detailed = options.hasOption('d');
  long rowCount = 0;

  for (FileStatus fs : inputFileStatuses) {
    long fileRowCount = 0;
    for (Footer f : ParquetFileReader.readFooters(conf, fs, false)) {
      for (BlockMetaData b : f.getParquetMetadata().getBlocks()) {
        fileRowCount += b.getRowCount();
      }
    }
    rowCount += fileRowCount;
    if (detailed) {
      out.format("%s row count: %d\n", fs.getPath().getName(), fileRowCount);
    }
  }

  out.format("Total RowCount: %d", rowCount);
  out.println();
}
 
Example #10
Source File: SizeCommand.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Prints the total size of all files matched by the input path or glob.
 *
 * <p>Sizes are summed from the row-group metadata. Option {@code 'u'} selects
 * {@code getTotalByteSize()} instead of {@code getCompressedSize()}, option {@code 'd'}
 * additionally prints one line per file, and option {@code 'p'} formats sizes with
 * {@code getPrettySize}.
 *
 * @param options parsed command line; its first positional argument is the input path or glob
 * @throws java.io.FileNotFoundException if the input path matches nothing and
 *         {@code globStatus} reports that by returning {@code null}
 * @throws Exception declared for the file system and footer reading calls
 */
@Override
public void execute(CommandLine options) throws Exception {
  super.execute(options);

  String[] args = options.getArgs();
  String input = args[0];
  out = new PrintWriter(Main.out, true);
  inputPath = new Path(input);
  conf = new Configuration();
  inputFileStatuses = inputPath.getFileSystem(conf).globStatus(inputPath);
  // globStatus returns null (not an empty array) for a non-existent path without glob
  // characters; fail with a clear message instead of a NullPointerException in the loop.
  if (inputFileStatuses == null) {
    throw new java.io.FileNotFoundException("Input path does not exist: " + inputPath);
  }

  // The options do not change while iterating, so look them up once.
  final boolean uncompressed = options.hasOption('u');
  final boolean detailed = options.hasOption('d');
  final boolean pretty = options.hasOption('p');

  long size = 0;
  for (FileStatus fs : inputFileStatuses) {
    long fileSize = 0;
    for (Footer f : ParquetFileReader.readFooters(conf, fs, false)) {
      for (BlockMetaData b : f.getParquetMetadata().getBlocks()) {
        fileSize += uncompressed ? b.getTotalByteSize() : b.getCompressedSize();
      }
    }
    size += fileSize;

    if (detailed) {
      if (pretty) {
        out.format("%s: %s\n", fs.getPath().getName(), getPrettySize(fileSize));
      } else {
        out.format("%s: %d bytes\n", fs.getPath().getName(), fileSize);
      }
    }
  }

  if (pretty) {
    out.format("Total Size: %s", getPrettySize(size));
  } else {
    out.format("Total Size: %d bytes", size);
  }
  out.println();
}
 
Example #11
Source File: FooterGatherer.java    From Bats with Apache License 2.0 4 votes vote down vote up
/**
 * Reads the footer by calling {@code readFooter} with this instance's
 * {@code conf} and {@code status}.
 *
 * @return the footer returned by {@code readFooter}
 * @throws Exception declared by this method; {@code readFooter} is the only call made
 */
@Override
protected Footer runInner() throws Exception {
  final Footer footer = readFooter(conf, status);
  return footer;
}
 
Example #12
Source File: ParquetTupleScheme.java    From parquet-mr with Apache License 2.0 4 votes vote down vote up
/**
 * Reads the footers found under the path of the given tap.
 *
 * <p>The tap's path is registered as an input path on a copy of the flow configuration,
 * and a fresh {@code DeprecatedParquetInputFormat} reads the footers for it.
 *
 * @param flowProcess flow process that supplies the configuration copy
 * @param hfs         tap whose path is added as input path
 * @return the footers returned by the input format
 * @throws IOException declared for the footer reading call
 */
private List<Footer> getFooters(FlowProcess<JobConf> flowProcess, Hfs hfs) throws IOException {
  final JobConf copiedConf = flowProcess.getConfigCopy();
  final DeprecatedParquetInputFormat inputFormat = new DeprecatedParquetInputFormat();
  inputFormat.addInputPath(copiedConf, hfs.getPath());

  final List<Footer> footers = inputFormat.getFooters(copiedConf);
  return footers;
}
 
Example #13
Source File: DeprecatedParquetInputFormat.java    From parquet-mr with Apache License 2.0 4 votes vote down vote up
/**
 * Reads the footers of the job's input files.
 *
 * <p>The files come from the superclass {@code listStatus}; the footers are read by
 * {@code realInputFormat}.
 *
 * @param job job configuration
 * @return the footers returned by {@code realInputFormat}
 * @throws IOException declared for the listing and footer reading calls
 */
public List<Footer> getFooters(JobConf job) throws IOException {
  final List<Footer> footers = realInputFormat.getFooters(job, asList(super.listStatus(job)));
  return footers;
}
 
Example #14
Source File: ParquetTupleScheme.java    From parquet-mr with Apache License 2.0 4 votes vote down vote up
/**
 * Reads the footers found under the path of the given tap.
 *
 * <p>The tap's path is registered as an input path on a copy of the flow configuration,
 * and a fresh {@code DeprecatedParquetInputFormat} reads the footers for it.
 *
 * @param flowProcess flow process that supplies the configuration copy
 * @param hfs         tap whose path is added as input path
 * @return the footers returned by the input format
 * @throws IOException declared for the footer reading call
 */
private List<Footer> getFooters(FlowProcess<? extends JobConf> flowProcess, Hfs hfs) throws IOException {
  final JobConf copiedConf = flowProcess.getConfigCopy();
  final DeprecatedParquetInputFormat inputFormat = new DeprecatedParquetInputFormat();
  inputFormat.addInputPath(copiedConf, hfs.getPath());

  final List<Footer> footers = inputFormat.getFooters(copiedConf);
  return footers;
}
 
Example #15
Source File: ParquetAsTextInputFormat.java    From iow-hadoop-streaming with Apache License 2.0 4 votes vote down vote up
/**
 * Reads the footers of the job's input files.
 *
 * <p>The files come from the superclass {@code listStatus}; the footers are read by
 * {@code realInputFormat}.
 *
 * @param job job configuration
 * @return the footers returned by {@code realInputFormat}
 * @throws IOException declared for the listing and footer reading calls
 */
public List<Footer> getFooters(JobConf job) throws IOException {
  final List<Footer> footers =
      realInputFormat.getFooters(job, Arrays.asList(super.listStatus(job)));
  return footers;
}