Java Code Examples for org.apache.parquet.hadoop.metadata.BlockMetaData#getTotalByteSize()

The following examples show how to use org.apache.parquet.hadoop.metadata.BlockMetaData#getTotalByteSize(). You can vote up the examples you like or vote down the ones you don't, and follow the links above each example to go to the original project or source file. Related API usage is listed in the sidebar.
Example 1
Source File: MetadataUtils.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Writes a one-line summary of a row group (row count, total byte size and
 * starting position), then a '-' rule, then the details of its columns.
 *
 * @param out  writer that receives the formatted output
 * @param meta row-group metadata to describe
 * @param num  value printed after "row group", or {@code null} to print nothing there
 */
private static void showDetails(PrettyPrintWriter out, BlockMetaData meta, Long num) {
  // Only prefix the number with a space when there is a number to show.
  String label = "";
  if (num != null) {
    label = " " + num;
  }

  out.format("row group%s: RC:%d TS:%d OFFSET:%d%n",
      label, meta.getRowCount(), meta.getTotalByteSize(), meta.getStartingPos());
  out.rule('-');
  showDetails(out, meta.getColumns());
}
 
Example 2
Source File: MetadataUtils.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Prints the header line for one row group followed by its column details.
 *
 * <p>The header reports the row count (RC), the total byte size (TS) and the
 * starting position (OFFSET) read from {@code meta}.
 *
 * @param out  destination for the formatted text
 * @param meta metadata of the row group being printed
 * @param num  optional row-group number; when {@code null} no number is printed
 */
private static void showDetails(PrettyPrintWriter out, BlockMetaData meta, Long num) {
  final String suffix = (num != null) ? " " + num : "";
  final long rowCount = meta.getRowCount();
  final long totalByteSize = meta.getTotalByteSize();
  final long startingPos = meta.getStartingPos();

  out.format("row group%s: RC:%d TS:%d OFFSET:%d%n", suffix, rowCount, totalByteSize, startingPos);
  out.rule('-');

  // Delegate to the overload that prints the column chunks.
  showDetails(out, meta.getColumns());
}
 
Example 3
Source File: SizeCommand.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Prints the size of the Parquet file(s) matched by the first command-line
 * argument, which is resolved with {@code globStatus}.
 *
 * <p>Options read by this method:
 * <ul>
 *   <li>{@code u} - sum {@code getTotalByteSize()} of each row group instead of
 *       {@code getCompressedSize()}</li>
 *   <li>{@code d} - additionally print one line per matched file</li>
 *   <li>{@code p} - format sizes with {@code getPrettySize} instead of raw bytes</li>
 * </ul>
 *
 * @param options parsed command line; its first positional argument is the input path
 * @throws IllegalArgumentException if the input path matches nothing and
 *         {@code globStatus} reports that by returning {@code null}
 * @throws Exception if the parent command, the file system or the footer reader fails
 */
@Override
public void execute(CommandLine options) throws Exception {
  super.execute(options);

  String[] args = options.getArgs();
  String input = args[0];
  out = new PrintWriter(Main.out, true);
  inputPath = new Path(input);
  conf = new Configuration();
  inputFileStatuses = inputPath.getFileSystem(conf).globStatus(inputPath);
  if (inputFileStatuses == null) {
    // globStatus returns null (not an empty array) for a non-glob path that
    // does not exist; fail with a clear message instead of an NPE below.
    throw new IllegalArgumentException("Path does not exist: " + input);
  }

  // The options cannot change while we iterate, so look them up once.
  final boolean uncompressed = options.hasOption('u');
  final boolean detailed = options.hasOption('d');
  final boolean pretty = options.hasOption('p');

  long size = 0;
  for (FileStatus fs : inputFileStatuses) {
    long fileSize = 0;
    for (Footer f : ParquetFileReader.readFooters(conf, fs, false)) {
      for (BlockMetaData b : f.getParquetMetadata().getBlocks()) {
        fileSize += uncompressed ? b.getTotalByteSize() : b.getCompressedSize();
      }
    }
    // The grand total is the sum of the per-file totals.
    size += fileSize;

    if (detailed) {
      if (pretty) {
        out.format("%s: %s\n", fs.getPath().getName(), getPrettySize(fileSize));
      } else {
        out.format("%s: %d bytes\n", fs.getPath().getName(), fileSize);
      }
    }
  }

  if (pretty) {
    out.format("Total Size: %s", getPrettySize(size));
  } else {
    out.format("Total Size: %d bytes", size);
  }
  out.println();
}
 
Example 4
Source File: ParquetInputFormat.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Builds a {@code ParquetInputSplit} for the row groups held by this object.
 *
 * <p>The split's length is the sum of {@code getTotalSize()} over every column
 * chunk whose path is contained in the requested schema; its end is the
 * starting position of the last row group plus that row group's total byte size.
 *
 * @param fileStatus          status of the file the split belongs to; only its path is used
 * @param requestedSchema     textual message type parsed with {@code MessageTypeParser}
 * @param readSupportMetadata not read by this method
 * @return the split describing this object's row groups
 * @throws IOException declared by the signature
 */
public ParquetInputSplit getParquetInputSplit(FileStatus fileStatus, String requestedSchema, Map<String, String> readSupportMetadata) throws IOException {
  MessageType requested = MessageTypeParser.parseMessageType(requestedSchema);

  // Accumulate the size of the column chunks that the requested schema selects.
  long length = 0;
  for (BlockMetaData block : this.getRowGroups()) {
    for (ColumnChunkMetaData column : block.getColumns()) {
      if (!requested.containsPath(column.getPath().toArray())) {
        continue;
      }
      length += column.getTotalSize();
    }
  }

  // The split ends where the last row group ends.
  int lastIndex = this.getRowGroupCount() - 1;
  BlockMetaData lastRowGroup = this.getRowGroups().get(lastIndex);
  long end = lastRowGroup.getStartingPos() + lastRowGroup.getTotalByteSize();

  // Record the starting position of every row group, in order.
  long[] rowGroupOffsets = new long[this.getRowGroupCount()];
  int i = 0;
  while (i < rowGroupOffsets.length) {
    rowGroupOffsets[i] = this.getRowGroups().get(i).getStartingPos();
    i++;
  }

  return new ParquetInputSplit(
          fileStatus.getPath(),
          hdfsBlock.getOffset(),
          end,
          length,
          hdfsBlock.getHosts(),
          rowGroupOffsets
  );
}
 
Example 5
Source File: TestParquetWriterAppendBlocks.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Appends {@code file1} and {@code file2} into one file and checks that the
 * combined footer lists the row groups of both inputs, in order, with matching
 * row counts, sizes and columns, and that each row group starts where the
 * previous one ended.
 */
@Test
public void testMergedMetadata() throws IOException {
  Path combinedFile = newTemp();
  ParquetFileWriter writer = new ParquetFileWriter(CONF, FILE_SCHEMA, combinedFile);
  writer.start();
  writer.appendFile(CONF, file1);
  writer.appendFile(CONF, file2);
  writer.end(EMPTY_METADATA);

  ParquetMetadata combinedFooter = ParquetFileReader.readFooter(CONF, combinedFile, NO_FILTER);
  ParquetMetadata f1Footer = ParquetFileReader.readFooter(CONF, file1, NO_FILTER);
  ParquetMetadata f2Footer = ParquetFileReader.readFooter(CONF, file2, NO_FILTER);

  // Expected order: all row groups of file1, then all row groups of file2.
  LinkedList<BlockMetaData> expectedRowGroups = new LinkedList<BlockMetaData>(f1Footer.getBlocks());
  expectedRowGroups.addAll(f2Footer.getBlocks());

  Assert.assertEquals("Combined should have the right number of row groups",
      expectedRowGroups.size(),
      combinedFooter.getBlocks().size());

  // NOTE(review): 4 is presumably the length of the file's leading magic bytes - confirm.
  long nextStart = 4;
  for (BlockMetaData actual : combinedFooter.getBlocks()) {
    BlockMetaData expected = expectedRowGroups.removeFirst();

    Assert.assertEquals("Row count should match",
        expected.getRowCount(), actual.getRowCount());
    Assert.assertEquals("Compressed size should match",
        expected.getCompressedSize(), actual.getCompressedSize());
    Assert.assertEquals("Total size should match",
        expected.getTotalByteSize(), actual.getTotalByteSize());
    Assert.assertEquals("Start pos should be at the last row group's end",
        nextStart, actual.getStartingPos());
    assertColumnsEquivalent(expected.getColumns(), actual.getColumns());

    // The following row group is expected to begin right after this one.
    long start = actual.getStartingPos();
    long totalSize = actual.getTotalByteSize();
    nextStart = start + totalSize;
  }
}
 
Example 6
Source File: ParquetMetadataCommand.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Logs a summary line for one row group, a column header line, and then one
 * line per column chunk (via {@code printColumnChunk}).
 *
 * @param console  logger that receives the output
 * @param index    row-group number shown in the summary line
 * @param rowGroup metadata of the row group to print
 * @param schema   schema passed through to {@code printColumnChunk}
 */
private void printRowGroup(Logger console, int index, BlockMetaData rowGroup, MessageType schema) {
  long start = rowGroup.getStartingPos();
  long rowCount = rowGroup.getRowCount();
  long compressedSize = rowGroup.getCompressedSize();
  String filePath = rowGroup.getPath();

  // Both the per-record average and the "total" field are derived from the
  // compressed size; the path suffix is only printed when a path is set.
  console.info(String.format("\nRow group %d:  count: %d  %s records  start: %d  total: %s%s\n%s",
      index, rowCount,
      humanReadable(((float) compressedSize) / rowCount),
      start, humanReadable(compressedSize),
      filePath != null ? " path: " + filePath : "",
      new TextStringBuilder(80).appendPadding(80, '-')));

  // Width of the first table column, computed by maxSize over the dotted
  // column paths (a null column contributes the empty string).
  int size = maxSize(Iterables.transform(rowGroup.getColumns(),
      new Function<ColumnChunkMetaData, String>() {
        @Override
        public String apply(@Nullable ColumnChunkMetaData input) {
          return input == null ? "" : input.getPath().toDotString();
        }
      }));

  console.info(String.format("%-" + size + "s  %-9s %-9s %-9s %-10s %-7s %s",
      "", "type", "encodings", "count", "avg size", "nulls", "min / max"));
  for (ColumnChunkMetaData column : rowGroup.getColumns()) {
    printColumnChunk(console, size, column, schema);
  }
}
 
Example 7
Source File: ParquetMetadataConverter.java    From parquet-mr with Apache License 2.0 4 votes vote down vote up
/**
 * Converts one block's metadata into a format-level {@code RowGroup} and
 * appends it to {@code rowGroups}.
 *
 * <p>Each column chunk of the block becomes a {@code ColumnChunk} whose
 * {@code meta_data} is filled from the column metadata. The dictionary page
 * offset, statistics, encoding stats and column/offset index references are
 * only set when the source provides them.
 *
 * @param parquetMetadata not read by this method
 * @param rowGroups       list that receives the converted row group
 * @param block           block (row group) metadata to convert
 */
private void addRowGroup(ParquetMetadata parquetMetadata, List<RowGroup> rowGroups, BlockMetaData block) {
  List<ColumnChunkMetaData> blockColumns = block.getColumns();
  List<ColumnChunk> formatColumns = new ArrayList<ColumnChunk>();

  for (ColumnChunkMetaData source : blockColumns) {
    // verify this is the right offset
    ColumnChunk chunk = new ColumnChunk(source.getFirstDataPageOffset());
    chunk.file_path = block.getPath(); // they are in the same file for now

    ColumnMetaData formatMeta = new ColumnMetaData(
        getType(source.getType()),
        toFormatEncodings(source.getEncodings()),
        Arrays.asList(source.getPath().toArray()),
        toFormatCodec(source.getCodec()),
        source.getValueCount(),
        source.getTotalUncompressedSize(),
        source.getTotalSize(),
        source.getFirstDataPageOffset());
    chunk.meta_data = formatMeta;

    // The dictionary page offset is only written when the encoding stats
    // report that dictionary pages exist.
    if (source.getEncodingStats() != null && source.getEncodingStats().hasDictionaryPages()) {
      formatMeta.setDictionary_page_offset(source.getDictionaryPageOffset());
    }
    formatMeta.setBloom_filter_offset(source.getBloomFilterOffset());
    if (!source.getStatistics().isEmpty()) {
      formatMeta.setStatistics(toParquetStatistics(source.getStatistics(), this.statisticsTruncateLength));
    }
    if (source.getEncodingStats() != null) {
      formatMeta.setEncoding_stats(convertEncodingStats(source.getEncodingStats()));
    }
    // index_page_offset and key_value_metadata are not populated here (nothing yet).

    IndexReference columnIndexRef = source.getColumnIndexReference();
    if (columnIndexRef != null) {
      chunk.setColumn_index_offset(columnIndexRef.getOffset());
      chunk.setColumn_index_length(columnIndexRef.getLength());
    }

    IndexReference offsetIndexRef = source.getOffsetIndexReference();
    if (offsetIndexRef != null) {
      chunk.setOffset_index_offset(offsetIndexRef.getOffset());
      chunk.setOffset_index_length(offsetIndexRef.getLength());
    }

    formatColumns.add(chunk);
  }

  // The row group's total_byte_size comes straight from the block.
  rowGroups.add(new RowGroup(formatColumns, block.getTotalByteSize(), block.getRowCount()));
}