Java Code Examples for org.apache.parquet.hadoop.metadata.ColumnChunkMetaData#getTotalSize()

The following examples show how to use org.apache.parquet.hadoop.metadata.ColumnChunkMetaData#getTotalSize(), which returns the total compressed (on-disk) size of a column chunk in bytes. Each example comes from an open-source project; the original source file and license are noted above the code.
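As a quick orientation, here is a minimal, self-contained sketch (not taken from any of the projects below; the file path is a placeholder) that opens a Parquet footer and prints the starting position and on-disk size of every column chunk:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.metadata.BlockMetaData;
import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData;
import org.apache.parquet.hadoop.util.HadoopInputFile;

public class ColumnChunkSizes {
  public static void main(String[] args) throws Exception {
    Path path = new Path("/tmp/example.parquet"); // placeholder input file
    try (ParquetFileReader reader =
        ParquetFileReader.open(HadoopInputFile.fromPath(path, new Configuration()))) {
      for (BlockMetaData block : reader.getFooter().getBlocks()) {
        for (ColumnChunkMetaData column : block.getColumns()) {
          // getTotalSize() is the compressed (on-disk) size of the chunk;
          // getTotalUncompressedSize() is its size after decompression.
          System.out.printf("%s: starts at %d, %d bytes on disk%n",
              column.getPath().toDotString(), column.getStartingPos(), column.getTotalSize());
        }
      }
    }
  }
}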
Example 1
Source File: TestParquetWriterAppendBlocks.java    From parquet-mr with Apache License 2.0
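This test helper asserts that column chunks in an appended file are laid out contiguously: each chunk must begin at the previous chunk's getStartingPos() plus its getTotalSize().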
public void assertColumnsEquivalent(List<ColumnChunkMetaData> expected,
                                    List<ColumnChunkMetaData> actual) {
  Assert.assertEquals("Should have the expected columns",
      expected.size(), actual.size());
  for (int i = 0; i < actual.size(); i += 1) {
    ColumnChunkMetaData current = actual.get(i);
    if (i != 0) {
      ColumnChunkMetaData previous = actual.get(i - 1);
      long expectedStart = previous.getStartingPos() + previous.getTotalSize();
      Assert.assertEquals("Should start after the previous column",
          expectedStart, current.getStartingPos());
    }

    assertColumnMetadataEquivalent(expected.get(i), current);
  }
}
 
Example 2
Source File: PageReader.java    From Bats with Apache License 2.0
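Drill's PageReader passes getStartingPos() and getTotalSize() to its direct-buffer input streams so that reads are bounded to exactly the bytes of the column chunk.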
PageReader(org.apache.drill.exec.store.parquet.columnreaders.ColumnReader<?> parentStatus, FileSystem fs, Path path, ColumnChunkMetaData columnChunkMetaData)
  throws ExecutionSetupException {
  this.parentColumnReader = parentStatus;
  allocatedDictionaryBuffers = new ArrayList<ByteBuf>();
  codecFactory = parentColumnReader.parentReader.getCodecFactory();
  this.stats = parentColumnReader.parentReader.parquetReaderStats;
  this.fileName = path.toString();
  debugName = new StringBuilder()
     .append(this.parentColumnReader.parentReader.getFragmentContext().getFragIdString())
     .append(":")
     .append(this.parentColumnReader.parentReader.getOperatorContext().getStats().getId())
     .append(this.parentColumnReader.columnChunkMetaData.toString())
     .toString();
  try {
    inputStream = fs.open(path);
    BufferAllocator allocator = parentColumnReader.parentReader.getOperatorContext().getAllocator();
    useBufferedReader = parentColumnReader.parentReader.useBufferedReader;
    scanBufferSize = parentColumnReader.parentReader.bufferedReadSize;
    useFadvise = parentColumnReader.parentReader.useFadvise;
    enforceTotalSize = parentColumnReader.parentReader.enforceTotalSize;
    if (useBufferedReader) {
      this.dataReader = new BufferedDirectBufInputStream(inputStream, allocator, path.getName(),
          columnChunkMetaData.getStartingPos(), columnChunkMetaData.getTotalSize(), scanBufferSize,
          enforceTotalSize, useFadvise);
    } else {
      this.dataReader = new DirectBufInputStream(inputStream, allocator, path.getName(),
          columnChunkMetaData.getStartingPos(), columnChunkMetaData.getTotalSize(), enforceTotalSize,
          useFadvise);
    }
  } catch (IOException e) {
    throw new ExecutionSetupException("Error opening or reading metadata for parquet file at location: "
        + path.getName(), e);
  }

}
 
Example 3
Source File: ColumnChunkIncReadStore.java    From Bats with Apache License 2.0
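Here getTotalSize() becomes the page reader's size and getStartingPos() its file offset, telling the reader where the chunk starts and how many compressed bytes it spans.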
public ColumnChunkIncPageReader(ColumnChunkMetaData metaData, ColumnDescriptor columnDescriptor, FSDataInputStream in) throws IOException {
  this.metaData = metaData;
  this.columnDescriptor = columnDescriptor;
  this.size = metaData.getTotalSize();
  this.fileOffset = metaData.getStartingPos();
  this.in = in;
  this.decompressor = codecFactory.getDecompressor(metaData.getCodec());
}
 
Example 4
Source File: ColumnChunkIncReadStore.java    From dremio-oss with Apache License 2.0
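Dremio's variant of the previous example; it is identical except that it reads from a BulkInputStream rather than an FSDataInputStream.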
public ColumnChunkIncPageReader(ColumnChunkMetaData metaData, ColumnDescriptor columnDescriptor, BulkInputStream in) throws IOException {
  this.metaData = metaData;
  this.columnDescriptor = columnDescriptor;
  this.size = metaData.getTotalSize();
  this.fileOffset = metaData.getStartingPos();
  this.in = in;
  this.decompressor = codecFactory.getDecompressor(metaData.getCodec());
}
 
Example 5
Source File: PageReader.java    From dremio-oss with Apache License 2.0
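Dremio's PageReader bounds its ColumnDataReader to getTotalSize() bytes starting at the first data page offset, then loads the dictionary page if one exists.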
PageReader(ColumnReader<?> parentStatus, SeekableInputStream inputStream, Path path, ColumnChunkMetaData columnChunkMetaData) throws ExecutionSetupException {
  this.parentColumnReader = parentStatus;
  allocatedDictionaryBuffers = new ArrayList<>();
  codecFactory = parentColumnReader.parentReader.getCodecFactory();
  this.stats = parentColumnReader.parentReader.parquetReaderStats;
  long start = columnChunkMetaData.getFirstDataPageOffset();
  this.inputStream = inputStream;
  try {
    this.dataReader = new ColumnDataReader(inputStream, start, columnChunkMetaData.getTotalSize());
    loadDictionaryIfExists(parentStatus, columnChunkMetaData, inputStream);
  } catch (IOException e) {
    throw new ExecutionSetupException("Error opening or reading metadata for parquet file at location: "
      + path.getName(), e);
  }
}
 
Example 6
Source File: MetadataUtils.java    From parquet-mr with Apache License 2.0
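This dump utility prints each chunk's dictionary and first-data-page offsets, then SZ as compressed size / uncompressed size / compression ratio, followed by the value count, encodings, and statistics.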
private static void showDetails(PrettyPrintWriter out, ColumnChunkMetaData meta, boolean name) {
  long doff = meta.getDictionaryPageOffset();
  long foff = meta.getFirstDataPageOffset();
  long tsize = meta.getTotalSize();
  long usize = meta.getTotalUncompressedSize();
  long count = meta.getValueCount();
  double ratio = usize / (double)tsize;
  String encodings = Joiner.on(',').skipNulls().join(meta.getEncodings());

  if (name) {
    String path = Joiner.on('.').skipNulls().join(meta.getPath());
    out.format("%s: ", path);
  }

  out.format(" %s", meta.getType());
  out.format(" %s", meta.getCodec());
  out.format(" DO:%d", doff);
  out.format(" FPO:%d", foff);
  out.format(" SZ:%d/%d/%.2f", tsize, usize, ratio);
  out.format(" VC:%d", count);
  if (!encodings.isEmpty()) out.format(" ENC:%s", encodings);
  Statistics<?> stats = meta.getStatistics();
  if (stats != null) {
    out.format(" ST:[%s]", stats);
  } else {
    out.format(" ST:[none]");
  }
  out.println();
}
 
Example 7
Source File: ParquetInputFormat.java    From parquet-mr with Apache License 2.0
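This method computes the split length by summing getTotalSize() over every column of the requested schema in each row group. Note that hdfsBlock, getRowGroups(), and getRowGroupCount() are members of the enclosing class in the original source.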
public ParquetInputSplit getParquetInputSplit(FileStatus fileStatus, String requestedSchema, Map<String, String> readSupportMetadata) throws IOException {
  MessageType requested = MessageTypeParser.parseMessageType(requestedSchema);
  long length = 0;

  for (BlockMetaData block : this.getRowGroups()) {
    List<ColumnChunkMetaData> columns = block.getColumns();
    for (ColumnChunkMetaData column : columns) {
      if (requested.containsPath(column.getPath().toArray())) {
        length += column.getTotalSize();
      }
    }
  }

  BlockMetaData lastRowGroup = this.getRowGroups().get(this.getRowGroupCount() - 1);
  long end = lastRowGroup.getStartingPos() + lastRowGroup.getTotalByteSize();

  long[] rowGroupOffsets = new long[this.getRowGroupCount()];
  for (int i = 0; i < rowGroupOffsets.length; i++) {
    rowGroupOffsets[i] = this.getRowGroups().get(i).getStartingPos();
  }

  return new ParquetInputSplit(
          fileStatus.getPath(),
          hdfsBlock.getOffset(),
          end,
          length,
          hdfsBlock.getHosts(),
          rowGroupOffsets
  );
}
 
Example 8
Source File: ParquetInputSplit.java    From parquet-mr with Apache License 2.0
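This helper sums getTotalSize() across all blocks for the columns present in the requested schema, yielding the total compressed length the split will read.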
private static long end(List<BlockMetaData> blocks, String requestedSchema) {
  MessageType requested = MessageTypeParser.parseMessageType(requestedSchema);
  long length = 0;

  for (BlockMetaData block : blocks) {
    List<ColumnChunkMetaData> columns = block.getColumns();
    for (ColumnChunkMetaData column : columns) {
      if (requested.containsPath(column.getPath().toArray())) {
        length += column.getTotalSize();
      }
    }
  }
  return length;
}
 
Example 9
Source File: ParquetMetadataCommand.java    From parquet-mr with Apache License 2.0
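This command divides the chunk's getTotalSize() by its value count to report an average size per value, printed alongside the type, codec, encoding summary, and min/max statistics.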
private void printColumnChunk(Logger console, int width, ColumnChunkMetaData column, MessageType schema) {
  String[] path = column.getPath().toArray();
  PrimitiveType type = primitive(schema, path);
  Preconditions.checkNotNull(type);

  ColumnDescriptor desc = schema.getColumnDescription(path);
  long size = column.getTotalSize();
  long count = column.getValueCount();
  float perValue = ((float) size) / count;
  CompressionCodecName codec = column.getCodec();
  Set<Encoding> encodings = column.getEncodings();
  EncodingStats encodingStats = column.getEncodingStats();
  String encodingSummary = encodingStats == null ?
      encodingsAsString(encodings, desc) :
      encodingStatsAsString(encodingStats);
  Statistics<?> stats = column.getStatistics();

  String name = column.getPath().toDotString();

  PrimitiveType.PrimitiveTypeName typeName = type.getPrimitiveTypeName();
  if (typeName == PrimitiveType.PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY) {
    console.info(String.format("%-" + width + "s  FIXED[%d] %s %-7s %-9d %-8s %-7s %s",
        name, type.getTypeLength(), shortCodec(codec), encodingSummary, count,
        humanReadable(perValue), stats == null || !stats.isNumNullsSet() ? "" : String.valueOf(stats.getNumNulls()),
        minMaxAsString(stats)));
  } else {
    console.info(String.format("%-" + width + "s  %-9s %s %-7s %-9d %-10s %-7s %s",
        name, typeName, shortCodec(codec), encodingSummary, count, humanReadable(perValue),
        stats == null || !stats.isNumNullsSet() ? "" : String.valueOf(stats.getNumNulls()),
        minMaxAsString(stats)));
  }
}
 
Example 10
Source File: Metadata.java    From dremio-oss with Apache License 2.0
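While building file metadata, this method accumulates getTotalSize() over a row group's columns to obtain the row group's total length, which it then uses for the host-affinity calculation.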
private ParquetFileMetadata getParquetFileMetadata(FileAttributes file, AtomicInteger currentNumSplits, long maxSplits) throws IOException {
  final ParquetMetadata metadata =
    SingletonParquetFooterCache.readFooter(fs, file, ParquetMetadataConverter.NO_FILTER, maxFooterLength);
  final int numSplits = currentNumSplits.addAndGet(metadata.getBlocks().size());
  if (numSplits > maxSplits) {
    throw new TooManySplitsException(
      String.format("Too many splits encountered when processing parquet metadata at file %s, maximum is %d but encountered %d splits thus far.",
        file.getPath(), maxSplits, numSplits));
  }

  final MessageType schema = metadata.getFileMetaData().getSchema();

  Map<SchemaPath, OriginalType> originalTypeMap = Maps.newHashMap();
  for (String[] path : schema.getPaths()) {
    originalTypeMap.put(SchemaPath.getCompoundPath(path), getOriginalType(schema, path, 0));
  }

  List<RowGroupMetadata> rowGroupMetadataList = Lists.newArrayList();

  ArrayList<SchemaPath> ALL_COLS = new ArrayList<>();
  ALL_COLS.add(AbstractRecordReader.STAR_COLUMN);
  boolean autoCorrectCorruptDates = formatConfig.autoCorrectCorruptDates;
  ParquetReaderUtility.DateCorruptionStatus containsCorruptDates = ParquetReaderUtility.detectCorruptDates(metadata, ALL_COLS, autoCorrectCorruptDates);
  if (logger.isDebugEnabled()) {
    logger.debug(containsCorruptDates.toString());
  }
  final Map<ColumnTypeMetadata.Key, ColumnTypeMetadata> columnTypeInfo = Maps.newHashMap();
  int rowGroupIdx = 0;
  for (BlockMetaData rowGroup : metadata.getBlocks()) {
    List<ColumnMetadata> columnMetadataList = Lists.newArrayList();
    long length = 0;
    for (ColumnChunkMetaData col : rowGroup.getColumns()) {
      ColumnMetadata columnMetadata;

      // Statistics might carry only the null count, with min/max initialized
      // to zero instead of null, so check that the statistics actually hold a
      // non-null value, or that the column is entirely null.
      boolean statsAvailable = (col.getStatistics() != null && !col.getStatistics().isEmpty()
        && col.getStatistics().hasNonNullValue()) || col.getStatistics().getNumNulls() ==
        rowGroup.getRowCount();

      Statistics<?> stats = col.getStatistics();
      String[] columnName = col.getPath().toArray();
      SchemaPath columnSchemaName = SchemaPath.getCompoundPath(columnName);
      ColumnTypeMetadata columnTypeMetadata =
          new ColumnTypeMetadata(columnName, col.getType(), originalTypeMap.get(columnSchemaName));

      columnTypeInfo.put(new ColumnTypeMetadata.Key(columnTypeMetadata.name), columnTypeMetadata);
      if (statsAvailable) {
        // Write stats only if minVal==maxVal. Also, we then store only maxVal
        Object mxValue = null;
        if (stats.genericGetMax() != null && stats.genericGetMin() != null &&
            stats.genericGetMax().equals(stats.genericGetMin())) {
          mxValue = stats.genericGetMax();
          if (containsCorruptDates == ParquetReaderUtility.DateCorruptionStatus.META_SHOWS_CORRUPTION
              && columnTypeMetadata.originalType == OriginalType.DATE) {
            mxValue = ParquetReaderUtility.autoCorrectCorruptedDate((Integer) mxValue);
          }
        }
        columnMetadata =
            new ColumnMetadata(columnTypeMetadata.name, mxValue, stats.getNumNulls());
      } else {
        // Log at trace level to avoid a flood of log entries.
        logger.trace("Stats are not available for column {}, rowGroupIdx {}, file {}",
            columnSchemaName, rowGroupIdx, file.getPath());
        columnMetadata = new ColumnMetadata(columnTypeMetadata.name, null, null);
      }
      columnMetadataList.add(columnMetadata);
      length += col.getTotalSize();
    }

    RowGroupMetadata rowGroupMeta =
        new RowGroupMetadata(rowGroup.getStartingPos(), length, rowGroup.getRowCount(),
            getHostAffinity(fs, file, rowGroup.getStartingPos(), length), columnMetadataList);

    rowGroupMetadataList.add(rowGroupMeta);
    rowGroupIdx++;
  }

  return new ParquetFileMetadata(file, file.size(), rowGroupMetadataList, columnTypeInfo);
}
 
Example 11
Source File: ParquetMetadataConverter.java    From parquet-mr with Apache License 2.0
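When converting the footer to its Thrift representation, getTotalSize() supplies the total_compressed_size field of the ColumnMetaData struct, and getTotalUncompressedSize() the total_uncompressed_size field.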
private void addRowGroup(ParquetMetadata parquetMetadata, List<RowGroup> rowGroups, BlockMetaData block) {
    //rowGroup.total_byte_size = ;
    List<ColumnChunkMetaData> columns = block.getColumns();
    List<ColumnChunk> parquetColumns = new ArrayList<ColumnChunk>();
    for (ColumnChunkMetaData columnMetaData : columns) {
      ColumnChunk columnChunk = new ColumnChunk(columnMetaData.getFirstDataPageOffset()); // verify this is the right offset
      columnChunk.file_path = block.getPath(); // they are in the same file for now
      columnChunk.meta_data = new ColumnMetaData(
          getType(columnMetaData.getType()),
          toFormatEncodings(columnMetaData.getEncodings()),
          Arrays.asList(columnMetaData.getPath().toArray()),
          toFormatCodec(columnMetaData.getCodec()),
          columnMetaData.getValueCount(),
          columnMetaData.getTotalUncompressedSize(),
          columnMetaData.getTotalSize(),
          columnMetaData.getFirstDataPageOffset());
      if (columnMetaData.getEncodingStats() != null && columnMetaData.getEncodingStats().hasDictionaryPages()) {
        columnChunk.meta_data.setDictionary_page_offset(columnMetaData.getDictionaryPageOffset());
      }
      columnChunk.meta_data.setBloom_filter_offset(columnMetaData.getBloomFilterOffset());
      if (!columnMetaData.getStatistics().isEmpty()) {
        columnChunk.meta_data.setStatistics(toParquetStatistics(columnMetaData.getStatistics(), this.statisticsTruncateLength));
      }
      if (columnMetaData.getEncodingStats() != null) {
        columnChunk.meta_data.setEncoding_stats(convertEncodingStats(columnMetaData.getEncodingStats()));
      }
//      columnChunk.meta_data.index_page_offset = ;
//      columnChunk.meta_data.key_value_metadata = ; // nothing yet

      IndexReference columnIndexRef = columnMetaData.getColumnIndexReference();
      if (columnIndexRef != null) {
        columnChunk.setColumn_index_offset(columnIndexRef.getOffset());
        columnChunk.setColumn_index_length(columnIndexRef.getLength());
      }
      IndexReference offsetIndexRef = columnMetaData.getOffsetIndexReference();
      if (offsetIndexRef != null) {
        columnChunk.setOffset_index_offset(offsetIndexRef.getOffset());
        columnChunk.setOffset_index_length(offsetIndexRef.getLength());
      }

      parquetColumns.add(columnChunk);
    }
    RowGroup rowGroup = new RowGroup(parquetColumns, block.getTotalByteSize(), block.getRowCount());
    rowGroups.add(rowGroup);
}