org.apache.parquet.hadoop.Footer Java Examples

Source File: ParquetReader.java From tajo with Apache License 2.0

6 votes

/**
 * Creates a reader over the non-hidden entries found at {@code file}.
 *
 * <p>All footers are read up front so that the total row count across every
 * row group is known as soon as construction finishes.
 *
 * @param conf        Hadoop configuration used to resolve the file system and read footers
 * @param file        path whose status listing supplies the files to read
 * @param readSupport materializer handed to the underlying record readers
 * @param filter      record filter; must not be {@code null}
 * @throws IOException if listing the path or reading a footer fails
 */
private ParquetReader(Configuration conf,
                      Path file,
                      ReadSupport<T> readSupport,
                      Filter filter) throws IOException {
  this.readSupport = readSupport;
  this.filter = checkNotNull(filter, "filter");
  this.conf = conf;

  final FileSystem fileSystem = file.getFileSystem(conf);
  final List<FileStatus> visibleEntries =
      Arrays.asList(fileSystem.listStatus(file, HiddenFileFilter.INSTANCE));
  final List<Footer> allFooters =
      ParquetFileReader.readAllFootersInParallelUsingSummaryFiles(conf, visibleEntries, false);
  this.footersIterator = allFooters.iterator();

  // Accumulate the row count of every row group of every footer.
  for (Footer current : allFooters) {
    final List<BlockMetaData> rowGroups = current.getParquetMetadata().getBlocks();
    for (BlockMetaData rowGroup : rowGroups) {
      totalRowCount += rowGroup.getRowCount();
    }
  }
}

Source File: ParquetReader.java From tajo with Apache License 2.0

6 votes

/**
 * Closes the current record reader, if any, and opens one for the next footer.
 *
 * <p>When no footers remain, {@code reader} is left as {@code null}.
 *
 * @throws IOException if closing the previous reader or initializing the new one fails
 */
private void initReader() throws IOException {
  if (reader != null) {
    reader.close();
    reader = null;
  }

  if (!footersIterator.hasNext()) {
    return;
  }

  final Footer current = footersIterator.next();
  final List<BlockMetaData> allRowGroups = current.getParquetMetadata().getBlocks();
  final MessageType schema = current.getParquetMetadata().getFileMetaData().getSchema();

  // Let the filter decide which row groups are handed to the record reader.
  final List<BlockMetaData> selectedRowGroups =
      RowGroupFilter.filterRowGroups(filter, allRowGroups, schema);

  reader = new InternalParquetRecordReader<T>(readSupport, filter);
  reader.initialize(
      current.getParquetMetadata().getFileMetaData(),
      current.getFile(),
      selectedRowGroups,
      conf);
}

Source File: ShowMetaCommand.java From parquet-mr with Apache License 2.0

6 votes

/**
 * Prints the footer metadata of the input given as the first positional argument.
 *
 * <p>The {@code o} option is forwarded to {@code MetadataUtils.showDetails}
 * as the "show original types" flag.
 *
 * @param options parsed command line
 * @throws Exception if the input cannot be resolved or its footers cannot be read
 */
@Override
public void execute(CommandLine options) throws Exception {
  super.execute(options);

  final String input = options.getArgs()[0];
  final boolean showOriginalTypes = options.hasOption('o');

  final Configuration conf = new Configuration();
  final Path inputPath = new Path(input);
  final FileStatus inputFileStatus = inputPath.getFileSystem(conf).getFileStatus(inputPath);
  final List<Footer> footers = ParquetFileReader.readFooters(conf, inputFileStatus, false);

  final PrettyPrintWriter out = PrettyPrintWriter.stdoutPrettyPrinter()
      .withAutoColumn()
      .withWhitespaceHandler(WhiteSpaceHandler.COLLAPSE_WHITESPACE)
      .withColumnPadding(1)
      .build();

  // One section per footer: the file it belongs to, then its metadata.
  for (Footer footer : footers) {
    out.format("file: %s%n" , footer.getFile());
    MetadataUtils.showDetails(out, footer.getParquetMetadata(), showOriginalTypes);
    out.flushColumns();
  }
}

Source File: DeprecatedParquetInputFormat.java From parquet-mr with Apache License 2.0

6 votes

/**
 * Computes the input splits for {@code job}.
 *
 * <p>When task-side metadata is enabled the superclass computes the splits.
 * Otherwise the footers are read here, the wrapped input format produces the
 * splits, and each one is wrapped in a {@code ParquetInputSplitWrapper}.
 *
 * @param job       job configuration
 * @param numSplits split-count hint, only used on the task-side metadata path
 * @return the wrapped splits, or {@code null} if the wrapped format returned {@code null}
 * @throws IOException if footers or splits cannot be computed
 */
@Override
public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
  if (isTaskSideMetaData(job)) {
    return super.getSplits(job, numSplits);
  }

  final List<Footer> footers = getFooters(job);
  final List<ParquetInputSplit> parquetSplits = realInputFormat.getSplits(job, footers);
  if (parquetSplits == null) {
    return null;
  }

  final InputSplit[] wrapped = new InputSplit[parquetSplits.size()];
  int slot = 0;
  for (ParquetInputSplit parquetSplit : parquetSplits) {
    wrapped[slot] = new ParquetInputSplitWrapper(parquetSplit);
    slot++;
  }
  return wrapped;
}

Source File: PentahoApacheInputFormat.java From pentaho-hadoop-shims with Apache License 2.0

6 votes

/**
 * Reads the schema of the Parquet file at {@code file} and converts it to input fields.
 *
 * <p>The work runs inside {@code inClassloader}. Only the first footer returned
 * for the path is consulted.
 *
 * @param file location of the Parquet file
 * @return the input fields built from the file schema, or an empty list if no footer was found
 * @throws Exception if the file cannot be accessed or its footer cannot be read
 */
@Override
public List<IParquetInputField> readSchema( String file ) throws Exception {
  return inClassloader( () -> {
    Configuration hadoopConf = job.getConfiguration();
    S3NCredentialUtils.applyS3CredentialsToHadoopConfigurationIfNecessary( file, hadoopConf );

    Path parquetPath = new Path( S3NCredentialUtils.scrubFilePathIfNecessary( file ) );
    FileSystem fileSystem = FileSystem.get( parquetPath.toUri(), hadoopConf );
    FileStatus parquetStatus = fileSystem.getFileStatus( parquetPath );

    List<Footer> footers = ParquetFileReader.readFooters( hadoopConf, parquetStatus, true );
    if ( footers.isEmpty() ) {
      return new ArrayList<>();
    }

    // The schema is taken from the first footer only.
    MessageType schema = footers.get( 0 ).getParquetMetadata().getFileMetaData().getSchema();
    return ParquetConverter.buildInputFields( schema );
  } );
}

Source File: FooterGatherer.java From Bats with Apache License 2.0

5 votes

/**
 * Collects the footers for the given statuses.
 *
 * <p>A directory containing a summary file ({@code ParquetFileWriter.PARQUET_METADATA_FILE})
 * contributes the footers stored in that summary. Any other directory contributes
 * one footer read per listed file, and a plain file contributes one footer read.
 * Pending reads are executed through {@code TimedCallable.run}.
 *
 * @param conf        configuration used to resolve file systems and read footers
 * @param statuses    files and/or directories to inspect
 * @param parallelism parallelism passed to {@code TimedCallable.run}
 * @return summary-file footers followed by the individually read footers
 * @throws IOException if a file system operation or footer read fails
 */
public static List<Footer> getFooters(final Configuration conf, List<FileStatus> statuses, int parallelism) throws IOException {
  final List<TimedCallable<Footer>> pendingReads = new ArrayList<>();
  final List<Footer> collected = new ArrayList<>();

  for (FileStatus entry : statuses) {
    if (!entry.isDirectory()) {
      pendingReads.add(new FooterReader(conf, entry));
      continue;
    }

    final Path directory = entry.getPath();
    final FileSystem fs = directory.getFileSystem(conf);

    // Prefer the summary file when the directory has one.
    final Path summaryPath = new Path(directory, ParquetFileWriter.PARQUET_METADATA_FILE);
    if (fs.exists(summaryPath)) {
      final FileStatus summaryStatus = fs.getFileStatus(summaryPath);
      collected.addAll(ParquetFileReader.readSummaryFile(conf, summaryStatus));
    } else {
      // No summary: schedule a footer read for each listed file.
      for (FileStatus child : DrillFileSystemUtil.listFiles(fs, directory, false)) {
        pendingReads.add(new FooterReader(conf, child));
      }
    }
  }

  if (!pendingReads.isEmpty()) {
    collected.addAll(TimedCallable.run("Fetch Parquet Footers", logger, pendingReads, parallelism));
  }

  return collected;
}

Source File: FooterGatherer.java From Bats with Apache License 2.0

5 votes

/**
 * An updated footer reader that tries to read the entire footer without knowing the length.
 * This should reduce the amount of seek/read roundtrips in most workloads.
 *
 * <p>The tail of the file (at most {@code DEFAULT_READ_SIZE} bytes) is read in one call.
 * The footer length is decoded from that tail; a second read is issued only when the
 * footer is longer than what the first read already captured.
 *
 * @param config configuration used to resolve the file system of {@code status}
 * @param status status of the file to read; supplies both the path and the file length
 * @return the footer, pairing the file path with its parsed metadata
 * @throws IOException if the file cannot be opened or read
 */
public static Footer readFooter(final Configuration config, final FileStatus status) throws IOException {
  final FileSystem fs = status.getPath().getFileSystem(config);
  try(FSDataInputStream file = fs.open(status.getPath())) {

    final long fileLength = status.getLen();
    // Reject files shorter than MIN_FILE_SIZE before attempting any read.
    Preconditions.checkArgument(fileLength >= MIN_FILE_SIZE, "%s is not a Parquet file (too small)", status.getPath());

    // First read: the last `len` bytes of the file, capped at DEFAULT_READ_SIZE.
    int len = (int) Math.min( fileLength, (long) DEFAULT_READ_SIZE);
    byte[] footerBytes = new byte[len];
    readFully(file, fileLength - len, footerBytes, 0, len);

    // NOTE(review): presumably verifies the trailing magic bytes and fails otherwise — confirm in checkMagicBytes.
    checkMagicBytes(status, footerBytes, footerBytes.length - ParquetFileWriter.MAGIC.length);
    // The footer length is a little-endian int located FOOTER_METADATA_SIZE bytes before the end.
    final int size = BytesUtils.readIntLittleEndian(footerBytes, footerBytes.length - FOOTER_METADATA_SIZE);

    if(size > footerBytes.length - FOOTER_METADATA_SIZE){
      // if the footer is larger than our initial read, we need to read the rest.
      byte[] origFooterBytes = footerBytes;
      // Number of footer bytes the first read already holds (everything except the trailing FOOTER_METADATA_SIZE bytes).
      int origFooterRead = origFooterBytes.length - FOOTER_METADATA_SIZE;

      footerBytes = new byte[size];

      // Second read: only the leading part of the footer that the first read missed.
      readFully(file, fileLength - size - FOOTER_METADATA_SIZE, footerBytes, 0, size - origFooterRead);
      // Append the footer bytes captured by the first read after the newly read prefix.
      System.arraycopy(origFooterBytes, 0, footerBytes, size - origFooterRead, origFooterRead);
    }else{
      // The whole footer is already in the buffer: slice out exactly `size` bytes preceding the trailing FOOTER_METADATA_SIZE bytes.
      int start = footerBytes.length - (size + FOOTER_METADATA_SIZE);
      footerBytes = ArrayUtils.subarray(footerBytes, start, start + size);
    }

    // Parse the isolated footer bytes with NO_FILTER.
    final ByteArrayInputStream from = new ByteArrayInputStream(footerBytes);
    ParquetMetadata metadata = ParquetFormatPlugin.parquetMetadataConverter.readParquetMetadata(from, NO_FILTER);
    Footer footer = new Footer(status.getPath(), metadata);
    return footer;
  }
}

Source File: ParquetRecordReaderTest.java From dremio-oss with Apache License 2.0

5 votes

/**
 * Asserts the expected shape of the footers read by the test.
 *
 * <p>Expects exactly three footers. Each must point at an existing local file whose
 * name starts with {@code "part"}, contain two row groups, and carry the key/value
 * entries {@code foo=bar} and {@code <file name>=<file name>}.
 *
 * @param metadata footers to validate
 */
private void validateFooters(final List<Footer> metadata) {
  logger.debug(metadata.toString());
  assertEquals(3, metadata.size());

  for (Footer footer : metadata) {
    final File localFile = new File(footer.getFile().toUri());
    final String localName = localFile.getName();
    assertTrue(localName, localName.startsWith("part"));
    assertTrue(localFile.getPath(), localFile.exists());

    final ParquetMetadata parquetMetadata = footer.getParquetMetadata();
    assertEquals(2, parquetMetadata.getBlocks().size());

    final Map<String, String> extraMetadata = parquetMetadata.getFileMetaData().getKeyValueMetaData();
    assertEquals("bar", extraMetadata.get("foo"));

    final String footerFileName = footer.getFile().getName();
    assertEquals(footerFileName, extraMetadata.get(footerFileName));
  }
}

Source File: RowCountCommand.java From parquet-mr with Apache License 2.0

5 votes

/**
 * Prints the total number of rows in the Parquet files matched by the first
 * positional argument, which is treated as a path or glob pattern.
 *
 * <p>With the {@code d} option a per-file row count is printed before the total.
 *
 * @param options parsed command line
 * @throws java.io.FileNotFoundException if the input is a non-glob path that does not exist
 * @throws Exception if the input cannot be resolved or a footer cannot be read
 */
@Override
public void execute(CommandLine options) throws Exception {
  super.execute(options);

  String[] args = options.getArgs();
  String input = args[0];
  out = new PrintWriter(Main.out, true);
  inputPath = new Path(input);
  conf = new Configuration();
  inputFileStatuses = inputPath.getFileSystem(conf).globStatus(inputPath);
  // globStatus returns null (not an empty array) for a non-glob path that does
  // not exist; fail with a clear message instead of a NullPointerException.
  if (inputFileStatuses == null) {
    throw new java.io.FileNotFoundException("Input path does not exist: " + inputPath);
  }

  // The option does not change during the run, so look it up once.
  final boolean detailed = options.hasOption('d');
  long rowCount = 0;

  for (FileStatus fs : inputFileStatuses) {
    long fileRowCount = 0;
    for (Footer f : ParquetFileReader.readFooters(conf, fs, false)) {
      for (BlockMetaData b : f.getParquetMetadata().getBlocks()) {
        fileRowCount += b.getRowCount();
      }
    }
    rowCount += fileRowCount;
    if (detailed) {
      out.format("%s row count: %d\n", fs.getPath().getName(), fileRowCount);
    }
  }

  out.format("Total RowCount: %d", rowCount);
  out.println();
}

Source File: SizeCommand.java From parquet-mr with Apache License 2.0

5 votes

/**
 * Prints the total size of the Parquet files matched by the first positional
 * argument, which is treated as a path or glob pattern.
 *
 * <p>Sizes are summed over the row groups of every footer. Options read here:
 * <ul>
 *   <li>{@code u}: sum {@code getTotalByteSize()} instead of {@code getCompressedSize()}</li>
 *   <li>{@code d}: also print one line per file</li>
 *   <li>{@code p}: format sizes with {@code getPrettySize} instead of raw byte counts</li>
 * </ul>
 *
 * @param options parsed command line
 * @throws java.io.FileNotFoundException if the input is a non-glob path that does not exist
 * @throws Exception if the input cannot be resolved or a footer cannot be read
 */
@Override
public void execute(CommandLine options) throws Exception {
  super.execute(options);

  String[] args = options.getArgs();
  String input = args[0];
  out = new PrintWriter(Main.out, true);
  inputPath = new Path(input);
  conf = new Configuration();
  inputFileStatuses = inputPath.getFileSystem(conf).globStatus(inputPath);
  // globStatus returns null (not an empty array) for a non-glob path that does
  // not exist; fail with a clear message instead of a NullPointerException.
  if (inputFileStatuses == null) {
    throw new java.io.FileNotFoundException("Input path does not exist: " + inputPath);
  }

  // The options do not change during the run, so look them up once.
  final boolean uncompressed = options.hasOption('u');
  final boolean detailed = options.hasOption('d');
  final boolean pretty = options.hasOption('p');

  long size = 0;
  for (FileStatus fs : inputFileStatuses) {
    long fileSize = 0;
    for (Footer f : ParquetFileReader.readFooters(conf, fs, false)) {
      for (BlockMetaData b : f.getParquetMetadata().getBlocks()) {
        fileSize += uncompressed ? b.getTotalByteSize() : b.getCompressedSize();
      }
    }
    size += fileSize;
    if (detailed) {
      if (pretty) {
        out.format("%s: %s\n", fs.getPath().getName(), getPrettySize(fileSize));
      } else {
        out.format("%s: %d bytes\n", fs.getPath().getName(), fileSize);
      }
    }
  }

  if (pretty) {
    out.format("Total Size: %s", getPrettySize(size));
  } else {
    out.format("Total Size: %d bytes", size);
  }
  out.println();
}

Source File: FooterGatherer.java From Bats with Apache License 2.0

4 votes

/**
 * Reads the footer for this task's {@code status} using {@code conf}.
 *
 * @return the footer produced by {@code readFooter}
 * @throws Exception if the footer cannot be read
 */
@Override
protected Footer runInner() throws Exception {
  final Footer footer = readFooter(conf, status);
  return footer;
}

Source File: ParquetTupleScheme.java From parquet-mr with Apache License 2.0

4 votes

/**
 * Reads the footers found under the path of {@code hfs}.
 *
 * <p>Works on a copy of the flow's job configuration, so the input path is
 * added to the copy rather than to the flow's own configuration.
 *
 * @param flowProcess flow process supplying the job configuration copy
 * @param hfs         tap whose path is added as the input path
 * @return the footers returned by {@code DeprecatedParquetInputFormat.getFooters}
 * @throws IOException if the footers cannot be read
 */
private List<Footer> getFooters(FlowProcess<JobConf> flowProcess, Hfs hfs) throws IOException {
  final JobConf confCopy = flowProcess.getConfigCopy();
  final DeprecatedParquetInputFormat inputFormat = new DeprecatedParquetInputFormat();
  inputFormat.addInputPath(confCopy, hfs.getPath());

  final List<Footer> footers = inputFormat.getFooters(confCopy);
  return footers;
}

Source File: DeprecatedParquetInputFormat.java From parquet-mr with Apache License 2.0

4 votes

/**
 * Reads the footers of the input files listed for {@code job}.
 *
 * @param job job configuration used to list the input files
 * @return the footers returned by the wrapped input format for the listed files
 * @throws IOException if listing the inputs or reading a footer fails
 */
public List<Footer> getFooters(JobConf job) throws IOException {
  final List<Footer> footers = realInputFormat.getFooters(job, asList(super.listStatus(job)));
  return footers;
}

Source File: ParquetTupleScheme.java From parquet-mr with Apache License 2.0

4 votes

/**
 * Reads the footers found under the path of {@code hfs}.
 *
 * <p>Works on a copy of the flow's job configuration, so the input path is
 * added to the copy rather than to the flow's own configuration.
 *
 * @param flowProcess flow process supplying the job configuration copy
 * @param hfs         tap whose path is added as the input path
 * @return the footers returned by {@code DeprecatedParquetInputFormat.getFooters}
 * @throws IOException if the footers cannot be read
 */
private List<Footer> getFooters(FlowProcess<? extends JobConf> flowProcess, Hfs hfs) throws IOException {
  final JobConf confCopy = flowProcess.getConfigCopy();
  final DeprecatedParquetInputFormat inputFormat = new DeprecatedParquetInputFormat();
  inputFormat.addInputPath(confCopy, hfs.getPath());

  final List<Footer> footers = inputFormat.getFooters(confCopy);
  return footers;
}

Source File: ParquetAsTextInputFormat.java From iow-hadoop-streaming with Apache License 2.0

4 votes

/**
 * Reads the footers of the input files listed for {@code job}.
 *
 * @param job job configuration used to list the input files
 * @return the footers returned by the wrapped input format for the listed files
 * @throws IOException if listing the inputs or reading a footer fails
 */
public List<Footer> getFooters(JobConf job) throws IOException {
  final List<Footer> footers = realInputFormat.getFooters(job, Arrays.asList(super.listStatus(job)));
  return footers;
}

org.apache.parquet.hadoop.Footer Java Examples