org.apache.parquet.hadoop.metadata.ColumnChunkMetaData Java Examples

The following examples show how to use org.apache.parquet.hadoop.metadata.ColumnChunkMetaData. Each example is taken from an open-source project; the source file and project are noted above each snippet.
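Before working through the examples, here is a minimal sketch of how ColumnChunkMetaData instances are usually obtained: the file footer is read into a ParquetMetadata, each row group is exposed as a BlockMetaData, and its column chunks are iterated as ColumnChunkMetaData. The class name and the "data.parquet" path below are placeholders; readFooter is deprecated in newer parquet-mr releases but is the same call used by Example #15 below.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.format.converter.ParquetMetadataConverter;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.metadata.BlockMetaData;
import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;

public class ColumnChunkMetaDataTour {
  public static void main(String[] args) throws Exception {
    // "data.parquet" is a placeholder path
    ParquetMetadata footer = ParquetFileReader.readFooter(
        new Configuration(), new Path("data.parquet"), ParquetMetadataConverter.NO_FILTER);
    for (BlockMetaData block : footer.getBlocks()) {
      for (ColumnChunkMetaData column : block.getColumns()) {
        // Print the most commonly used pieces of column chunk metadata
        System.out.printf("%s: start=%d values=%d compressed=%d bytes codec=%s encodings=%s%n",
            column.getPath().toDotString(),
            column.getStartingPos(),
            column.getValueCount(),
            column.getTotalSize(),
            column.getCodec(),
            column.getEncodings());
      }
    }
  }
}

Many of the examples below are variations on this loop: summing getTotalSize() per column (Example #15), collecting getStatistics() per column (Example #23), or inspecting getEncodings() for dictionary encoding (Examples #3 and #18).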
Example #1
Source File: TestParquetWriterAppendBlocks.java    From parquet-mr with Apache License 2.0
public void assertColumnsEquivalent(List<ColumnChunkMetaData> expected,
                                    List<ColumnChunkMetaData> actual) {
  Assert.assertEquals("Should have the expected columns",
      expected.size(), actual.size());
  for (int i = 0; i < actual.size(); i += 1) {
    ColumnChunkMetaData current = actual.get(i);
    if (i != 0) {
      ColumnChunkMetaData previous = actual.get(i - 1);
      long expectedStart = previous.getStartingPos() + previous.getTotalSize();
      Assert.assertEquals("Should start after the previous column",
          expectedStart, current.getStartingPos());
    }

    assertColumnMetadataEquivalent(expected.get(i), current);
  }
}
 
Example #2
Source File: ColumnIndexFilterUtils.java    From parquet-mr with Apache License 2.0
static List<OffsetRange> calculateOffsetRanges(OffsetIndex offsetIndex, ColumnChunkMetaData cm,
    long firstPageOffset) {
  List<OffsetRange> ranges = new ArrayList<>();
  int n = offsetIndex.getPageCount();
  if (n > 0) {
    OffsetRange currentRange = null;

    // Add a range for the dictionary page if required
    long rowGroupOffset = cm.getStartingPos();
    if (rowGroupOffset < firstPageOffset) {
      currentRange = new OffsetRange(rowGroupOffset, (int) (firstPageOffset - rowGroupOffset));
      ranges.add(currentRange);
    }

    for (int i = 0; i < n; ++i) {
      long offset = offsetIndex.getOffset(i);
      int length = offsetIndex.getCompressedPageSize(i);
      if (currentRange == null || !currentRange.extend(offset, length)) {
        currentRange = new OffsetRange(offset, length);
        ranges.add(currentRange);
      }
    }
  }
  return ranges;
}
 
Example #3
Source File: DictionaryFilterTest.java    From parquet-mr with Apache License 2.0
@SuppressWarnings("deprecation")
private void testDictionaryEncodedColumnsV1() throws Exception {
  Set<String> dictionaryEncodedColumns = new HashSet<String>(Arrays.asList(
      "binary_field", "single_value_field", "optional_single_value_field", "int32_field", "int64_field",
      "double_field", "float_field", "int96_field"));
  for (ColumnChunkMetaData column : ccmd) {
    String name = column.getPath().toDotString();
    if (dictionaryEncodedColumns.contains(name)) {
      assertTrue("Column should be dictionary encoded: " + name,
          column.getEncodings().contains(Encoding.PLAIN_DICTIONARY));
      assertFalse("Column should not have plain data pages" + name,
          column.getEncodings().contains(Encoding.PLAIN));
    } else {
      assertTrue("Column should have plain encoding: " + name,
          column.getEncodings().contains(Encoding.PLAIN));
      if (name.startsWith("fallback")) {
        assertTrue("Column should have some dictionary encoding: " + name,
            column.getEncodings().contains(Encoding.PLAIN_DICTIONARY));
      } else {
        assertFalse("Column should have no dictionary encoding: " + name,
            column.getEncodings().contains(Encoding.PLAIN_DICTIONARY));
      }
    }
  }
}
 
Example #4
Source File: ParquetMetricsRowGroupFilter.java    From iceberg with Apache License 2.0
private boolean eval(MessageType fileSchema, BlockMetaData rowGroup) {
  if (rowGroup.getRowCount() <= 0) {
    return ROWS_CANNOT_MATCH;
  }

  this.stats = Maps.newHashMap();
  this.valueCounts = Maps.newHashMap();
  this.conversions = Maps.newHashMap();
  for (ColumnChunkMetaData col : rowGroup.getColumns()) {
    PrimitiveType colType = fileSchema.getType(col.getPath().toArray()).asPrimitiveType();
    if (colType.getId() != null) {
      int id = colType.getId().intValue();
      stats.put(id, col.getStatistics());
      valueCounts.put(id, col.getValueCount());
      conversions.put(id, converterFromParquet(colType));
    }
  }

  return ExpressionVisitors.visit(expr, this);
}
 
Example #5
Source File: PrintFooter.java    From parquet-mr with Apache License 2.0
private static void add(ParquetMetadata footer) {
  for (BlockMetaData blockMetaData : footer.getBlocks()) {
    ++ blockCount;
    MessageType schema = footer.getFileMetaData().getSchema();
    recordCount += blockMetaData.getRowCount();
    List<ColumnChunkMetaData> columns = blockMetaData.getColumns();
    for (ColumnChunkMetaData columnMetaData : columns) {
      ColumnDescriptor desc = schema.getColumnDescription(columnMetaData.getPath().toArray());
      add(
          desc,
          columnMetaData.getValueCount(),
          columnMetaData.getTotalSize(),
          columnMetaData.getTotalUncompressedSize(),
          columnMetaData.getEncodings(),
          columnMetaData.getStatistics());
    }
  }
}
 
Example #6
Source File: VarLengthValuesColumn.java    From Bats with Apache License 2.0
VarLengthValuesColumn(ParquetRecordReader parentReader, ColumnDescriptor descriptor,
                      ColumnChunkMetaData columnChunkMetaData, boolean fixedLength, V v,
                      SchemaElement schemaElement) throws ExecutionSetupException {

  super(parentReader, descriptor, columnChunkMetaData, fixedLength, v, schemaElement);
  variableWidthVector = (VariableWidthVector) valueVec;

  if (columnChunkMetaData.getEncodings().contains(Encoding.PLAIN_DICTIONARY)) {
    usingDictionary = true;
    // The fixed-length optimization is not implemented when a Parquet dictionary is used, since there is
    // no data on this use case. Bulk processing is also enabled by default, because early data
    // profiling (for detecting the best processing strategy to use) is disabled when the column precision
    // is already set.
    bulkReaderState.columnPrecInfo.columnPrecisionType = ColumnPrecisionType.DT_PRECISION_IS_VARIABLE;
    bulkReaderState.columnPrecInfo.bulkProcess         = true;
  }
  else {
    usingDictionary = false;
  }
}
 
Example #7
Source File: ParquetFileWriter.java    From parquet-mr with Apache License 2.0
private static void serializeBloomFilters(
  List<Map<String, BloomFilter>> bloomFilters,
  List<BlockMetaData> blocks,
  PositionOutputStream out) throws IOException {
  LOG.debug("{}: bloom filters", out.getPos());
  for (int bIndex = 0, bSize = blocks.size(); bIndex < bSize; ++bIndex) {
    List<ColumnChunkMetaData> columns = blocks.get(bIndex).getColumns();
    Map<String, BloomFilter> blockBloomFilters = bloomFilters.get(bIndex);
    if (blockBloomFilters.isEmpty()) continue;
    for (int cIndex = 0, cSize = columns.size(); cIndex < cSize; ++cIndex) {
      ColumnChunkMetaData column = columns.get(cIndex);
      BloomFilter bloomFilter = blockBloomFilters.get(column.getPath().toDotString());
      if (bloomFilter == null) {
        continue;
      }

      long offset = out.getPos();
      column.setBloomFilterOffset(offset);
      Util.writeBloomFilterHeader(ParquetMetadataConverter.toBloomFilterHeader(bloomFilter), out);
      bloomFilter.writeTo(out);
    }
  }
}
 
Example #8
Source File: DictionaryFilterTest.java    From parquet-mr with Apache License 2.0
private void testDictionaryEncodedColumnsV2() throws Exception {
  Set<String> dictionaryEncodedColumns = new HashSet<String>(Arrays.asList(
      "binary_field", "single_value_field", "optional_single_value_field", "fixed_field", "int32_field",
      "int64_field", "double_field", "float_field", "int96_field"));
  for (ColumnChunkMetaData column : ccmd) {
    EncodingStats encStats = column.getEncodingStats();
    String name = column.getPath().toDotString();
    if (dictionaryEncodedColumns.contains(name)) {
      assertTrue("Column should have dictionary pages: " + name, encStats.hasDictionaryPages());
      assertTrue("Column should have dictionary encoded pages: " + name, encStats.hasDictionaryEncodedPages());
      assertFalse("Column should not have non-dictionary encoded pages: " + name,
          encStats.hasNonDictionaryEncodedPages());
    } else {
      assertTrue("Column should have non-dictionary encoded pages: " + name,
          encStats.hasNonDictionaryEncodedPages());
      if (name.startsWith("fallback")) {
        assertTrue("Column should have dictionary pages: " + name, encStats.hasDictionaryPages());
        assertTrue("Column should have dictionary encoded pages: " + name, encStats.hasDictionaryEncodedPages());
      } else {
        assertFalse("Column should not have dictionary pages: " + name, encStats.hasDictionaryPages());
        assertFalse("Column should not have dictionary encoded pages: " + name, encStats.hasDictionaryEncodedPages());
      }
    }
  }
}
 
Example #9
Source File: ColumnIndexStoreImpl.java    From parquet-mr with Apache License 2.0
private ColumnIndexStoreImpl(ParquetFileReader reader, BlockMetaData block, Set<ColumnPath> paths) {
  // TODO[GS]: Offset index for every path will be required; pre-read the consecutive ones at once?
  // TODO[GS]: Pre-read column index based on filter?
  this.reader = reader;
  Map<ColumnPath, IndexStore> store = new HashMap<>();
  for (ColumnChunkMetaData column : block.getColumns()) {
    ColumnPath path = column.getPath();
    if (paths.contains(path)) {
      store.put(path, new IndexStoreImpl(column));
    }
  }
  this.store = store;
}
 
Example #10
Source File: DictionaryFilter.java    From parquet-mr with Apache License 2.0
@Override
public <T extends Comparable<T>> Boolean visit(Gt<T> gt) {
  Column<T> filterColumn = gt.getColumn();
  ColumnChunkMetaData meta = getColumnChunk(filterColumn.getColumnPath());

  if (meta == null) {
    // the column is missing and always null, which is never greater than a
    // value. for all x, null is never > x.
    return BLOCK_CANNOT_MATCH;
  }

  // if the chunk has non-dictionary pages, don't bother decoding the
  // dictionary because the row group can't be eliminated.
  if (hasNonDictionaryPages(meta)) {
    return BLOCK_MIGHT_MATCH;
  }

  T value = gt.getValue();

  try {
    Set<T> dictSet = expandDictionary(meta);
    if (dictSet == null) {
      return BLOCK_MIGHT_MATCH;
    }

    Comparator<T> comparator = meta.getPrimitiveType().comparator();
    for (T entry : dictSet) {
      if (comparator.compare(value, entry) < 0) {
        return BLOCK_MIGHT_MATCH;
      }
    }

    return BLOCK_CANNOT_MATCH;
  } catch (IOException e) {
    LOG.warn("Failed to process dictionary for filter evaluation.", e);
  }

  return BLOCK_MIGHT_MATCH;
}
 
Example #11
Source File: VarLengthColumnReaders.java    From Bats with Apache License 2.0
VarBinaryColumn(ParquetRecordReader parentReader, ColumnDescriptor descriptor,
                ColumnChunkMetaData columnChunkMetaData, boolean fixedLength, VarBinaryVector v,
                SchemaElement schemaElement) throws ExecutionSetupException {
  super(parentReader, descriptor, columnChunkMetaData, fixedLength, v, schemaElement);

  this.varBinaryVector = v;
  this.mutator         = v.getMutator();
}
 
Example #12
Source File: BloomFilterImpl.java    From parquet-mr with Apache License 2.0
@Override
public <T extends Comparable<T>> Boolean visit(Operators.Eq<T> eq) {
  T value = eq.getValue();

  if (value == null) {
    // the bloom filter bitset contains only non-null values so isn't helpful. this
    // could check the column stats, but the StatisticsFilter is responsible
    return BLOCK_MIGHT_MATCH;
  }

  Operators.Column<T> filterColumn = eq.getColumn();
  ColumnChunkMetaData meta = getColumnChunk(filterColumn.getColumnPath());
  if (meta == null) {
    // the column isn't in this file so all values are null, but the value
    // must be non-null because of the above check.
    return BLOCK_CANNOT_MATCH;
  }

  try {
    BloomFilter bloomFilter = bloomFilterReader.readBloomFilter(meta);
    if (bloomFilter != null && !bloomFilter.findHash(bloomFilter.hash(value))) {
      return BLOCK_CANNOT_MATCH;
    }
  } catch (RuntimeException e) {
    LOG.warn(e.getMessage());
    return BLOCK_MIGHT_MATCH;
  }

  return BLOCK_MIGHT_MATCH;
}
 
Example #13
Source File: ParquetSchema.java    From Bats with Apache License 2.0
Map<String, Integer> buildChunkMap(BlockMetaData rowGroupMetadata) {
  // The column chunk metadata is not guaranteed to be in the same order as the columns in the schema,
  // so a map is constructed for fast lookup of the columnChunkMetaData that corresponds
  // to an element in the schema.
  Map<String, Integer> columnChunkMetadataPositionsInList = new HashMap<>();

  int colChunkIndex = 0;
  for (ColumnChunkMetaData colChunk : rowGroupMetadata.getColumns()) {
    columnChunkMetadataPositionsInList.put(Arrays.toString(colChunk.getPath().toArray()), colChunkIndex);
    colChunkIndex++;
  }
  return columnChunkMetadataPositionsInList;
}
 
Example #14
Source File: NullableFixedByteAlignedReaders.java    From dremio-oss with Apache License 2.0
CorruptionDetectingNullableDateReader(DeprecatedParquetVectorizedReader parentReader, int allocateSize,
    ColumnDescriptor descriptor, ColumnChunkMetaData columnChunkMetaData,
    boolean fixedLength, DateMilliVector v, SchemaElement schemaElement)
        throws ExecutionSetupException {
  super(parentReader, allocateSize, descriptor, columnChunkMetaData, fixedLength, v, schemaElement);
  dateVector = v;
}
 
Example #15
Source File: ColumnSizeCommand.java    From parquet-mr with Apache License 2.0
public Map<String, Long> getColumnSizeInBytes(Path inputFile) throws IOException {
  Map<String, Long> colSizes = new HashMap<>();
  ParquetMetadata pmd = ParquetFileReader.readFooter(new Configuration(), inputFile, ParquetMetadataConverter.NO_FILTER);

  for (BlockMetaData block : pmd.getBlocks()) {
    for (ColumnChunkMetaData column : block.getColumns()) {
      String colName = column.getPath().toDotString();
      colSizes.put(colName, column.getTotalSize() + colSizes.getOrDefault(colName, 0L));
    }
  }

  return colSizes;
}
 
Example #16
Source File: TestInputFormat.java    From parquet-mr with Apache License 2.0
private BlockMetaData newBlock(long start, long compressedBlockSize) {
  BlockMetaData blockMetaData = new BlockMetaData();
  long uncompressedSize = compressedBlockSize * 2; // assuming a compression ratio of 2
  ColumnChunkMetaData column = ColumnChunkMetaData.get(ColumnPath.get("foo"),
                                                       PrimitiveTypeName.BINARY,
                                                       CompressionCodecName.GZIP,
                                                       new HashSet<Encoding>(Arrays.asList(Encoding.PLAIN)),
                                                       new BinaryStatistics(),
                                                       start, 0L, 0L, compressedBlockSize, uncompressedSize);
  blockMetaData.addColumn(column);
  blockMetaData.setTotalByteSize(uncompressedSize);
  return blockMetaData;
}
 
Example #17
Source File: CompressionConveterTest.java    From parquet-mr with Apache License 2.0
private List<Long> getOffsets(TransParquetFileReader reader, ColumnChunkMetaData chunk) throws IOException {
  List<Long> offsets = new ArrayList<>();
  reader.setStreamPosition(chunk.getStartingPos());
  long readValues = 0;
  long totalChunkValues = chunk.getValueCount();
  while (readValues < totalChunkValues) {
    long curOffset = reader.getPos();
    PageHeader pageHeader = reader.readPageHeader();
    switch (pageHeader.type) {
      case DICTIONARY_PAGE:
        compressionConverter.readBlock(pageHeader.getCompressed_page_size(), reader);
        break;
      case DATA_PAGE:
        DataPageHeader headerV1 = pageHeader.data_page_header;
        offsets.add(curOffset);
        compressionConverter.readBlock(pageHeader.getCompressed_page_size(), reader);
        readValues += headerV1.getNum_values();
        break;
      case DATA_PAGE_V2:
        DataPageHeaderV2 headerV2 = pageHeader.data_page_header_v2;
        offsets.add(curOffset);
        int rlLength = headerV2.getRepetition_levels_byte_length();
        compressionConverter.readBlock(rlLength, reader);
        int dlLength = headerV2.getDefinition_levels_byte_length();
        compressionConverter.readBlock(dlLength, reader);
        int payLoadLength = pageHeader.getCompressed_page_size() - rlLength - dlLength;
        compressionConverter.readBlock(payLoadLength, reader);
        readValues += headerV2.getNum_values();
        break;
      default:
        throw new IOException("Not recognized page type");
    }
  }
  return offsets;
}
 
Example #18
Source File: ParquetDictionaryRowGroupFilter.java    From iceberg with Apache License 2.0
@SuppressWarnings("deprecation")
private static boolean hasNonDictionaryPages(ColumnChunkMetaData meta) {
  EncodingStats stats = meta.getEncodingStats();
  if (stats != null) {
    return stats.hasNonDictionaryEncodedPages();
  }

  // without EncodingStats, fall back to testing the encoding list
  Set<Encoding> encodings = new HashSet<Encoding>(meta.getEncodings());
  if (encodings.remove(Encoding.PLAIN_DICTIONARY)) {
    // if remove returned true, PLAIN_DICTIONARY was present, which means at
    // least one page was dictionary encoded and 1.0 encodings are used

    // RLE and BIT_PACKED are only used for repetition or definition levels
    encodings.remove(Encoding.RLE);
    encodings.remove(Encoding.BIT_PACKED);

    if (encodings.isEmpty()) {
      return false; // no encodings other than dictionary or rep/def levels
    }

    return true;

  } else {
    // if PLAIN_DICTIONARY wasn't present, then either the column is not
    // dictionary-encoded, or the 2.0 encoding, RLE_DICTIONARY, was used.
    // for 2.0, this cannot determine whether a page fell back without
    // page encoding stats
    return true;
  }
}
 
Example #19
Source File: MetadataUtils.java    From parquet-mr with Apache License 2.0
private static void showColumnChunkDetails(PrettyPrintWriter out, Map<String,Object> current, int depth) {
  for (Map.Entry<String,Object> entry : current.entrySet()) {
    String name = Strings.repeat(".", depth) + entry.getKey();
    Object value = entry.getValue();

    if (value instanceof Map) {
      out.println(name + ": ");
      showColumnChunkDetails(out, (Map<String,Object>)value, depth + 1);
    } else {
      out.print(name + ": ");
      showDetails(out, (ColumnChunkMetaData)value, false);
    }
  }
}
 
Example #20
Source File: ColumnReaderFactory.java    From Bats with Apache License 2.0
static VarLengthValuesColumn<?> getReader(ParquetRecordReader parentReader, ColumnDescriptor descriptor,
                                        ColumnChunkMetaData columnChunkMetaData, boolean fixedLength, ValueVector v,
                                        SchemaElement schemaElement
) throws ExecutionSetupException {
  ConvertedType convertedType = schemaElement.getConverted_type();
  switch (descriptor.getMaxDefinitionLevel()) {
    case 0:
      if (convertedType == null) {
        return new VarLengthColumnReaders.VarBinaryColumn(parentReader, descriptor, columnChunkMetaData, fixedLength, (VarBinaryVector) v, schemaElement);
      }
      switch (convertedType) {
        case UTF8:
        case ENUM:
          return new VarLengthColumnReaders.VarCharColumn(parentReader, descriptor, columnChunkMetaData, fixedLength, (VarCharVector) v, schemaElement);
        case DECIMAL:
          if (v instanceof VarDecimalVector) {
            return new VarLengthColumnReaders.VarDecimalColumn(parentReader, descriptor, columnChunkMetaData, fixedLength, (VarDecimalVector) v, schemaElement);
          }
        default:
          return new VarLengthColumnReaders.VarBinaryColumn(parentReader, descriptor, columnChunkMetaData, fixedLength, (VarBinaryVector) v, schemaElement);
      }
    default:
      if (convertedType == null) {
        return new VarLengthColumnReaders.NullableVarBinaryColumn(parentReader, descriptor, columnChunkMetaData, fixedLength, (NullableVarBinaryVector) v, schemaElement);
      }

      switch (convertedType) {
        case UTF8:
        case ENUM:
          return new VarLengthColumnReaders.NullableVarCharColumn(parentReader, descriptor, columnChunkMetaData, fixedLength, (NullableVarCharVector) v, schemaElement);
        case DECIMAL:
          if (v instanceof NullableVarDecimalVector) {
            return new VarLengthColumnReaders.NullableVarDecimalColumn(parentReader, descriptor, columnChunkMetaData, fixedLength, (NullableVarDecimalVector) v, schemaElement);
          }
        default:
          return new VarLengthColumnReaders.NullableVarBinaryColumn(parentReader, descriptor, columnChunkMetaData, fixedLength, (NullableVarBinaryVector) v, schemaElement);
      }
  }
}
 
Example #21
Source File: ColumnChunkIncReadStore.java    From dremio-oss with Apache License 2.0
public ColumnChunkIncPageReader(ColumnChunkMetaData metaData, ColumnDescriptor columnDescriptor, BulkInputStream in) throws IOException {
  this.metaData = metaData;
  this.columnDescriptor = columnDescriptor;
  this.size = metaData.getTotalSize();
  this.fileOffset = metaData.getStartingPos();
  this.in = in;
  this.decompressor = codecFactory.getDecompressor(metaData.getCodec());
}
 
Example #22
Source File: ColumnChunkIncReadStore.java    From Bats with Apache License 2.0
public void addColumn(ColumnDescriptor descriptor, ColumnChunkMetaData metaData) throws IOException {
  FSDataInputStream in = fs.open(path);
  streams.add(in);
  in.seek(metaData.getStartingPos());
  ColumnChunkIncPageReader reader = new ColumnChunkIncPageReader(metaData, descriptor, in);

  columns.put(descriptor, reader);
}
 
Example #23
Source File: PredicateUtils.java    From presto with Apache License 2.0
private static Map<ColumnDescriptor, Statistics<?>> getStatistics(BlockMetaData blockMetadata, Map<List<String>, RichColumnDescriptor> descriptorsByPath)
{
    ImmutableMap.Builder<ColumnDescriptor, Statistics<?>> statistics = ImmutableMap.builder();
    for (ColumnChunkMetaData columnMetaData : blockMetadata.getColumns()) {
        Statistics<?> columnStatistics = columnMetaData.getStatistics();
        if (columnStatistics != null) {
            RichColumnDescriptor descriptor = descriptorsByPath.get(Arrays.asList(columnMetaData.getPath().toArray()));
            if (descriptor != null) {
                statistics.put(descriptor, columnStatistics);
            }
        }
    }
    return statistics.build();
}
 
Example #24
Source File: DictionaryPageReader.java    From parquet-mr with Apache License 2.0
@Override
public DictionaryPage readDictionaryPage(ColumnDescriptor descriptor) {
  if (rowGroup != null) {
    // if the row group has already been read, use that dictionary
    return rowGroup.readDictionaryPage(descriptor);
  }

  String dotPath = String.join(".", descriptor.getPath());
  ColumnChunkMetaData column = columns.get(dotPath);
  if (column == null) {
    throw new ParquetDecodingException(
        "Failed to load dictionary, unknown column: " + dotPath);
  }

  return dictionaryPageCache.computeIfAbsent(dotPath, key -> {
    try {
      final DictionaryPage dict =
          hasDictionaryPage(column) ? reader.readDictionary(column) : null;

      // Copy the dictionary to ensure it can be reused if it is returned
      // more than once. This can happen when a DictionaryFilter has two or
      // more predicates for the same column. Misses are cached as well, as an empty Optional.
      return (dict != null) ? Optional.of(reusableCopy(dict)) : Optional.empty();
    } catch (IOException e) {
      throw new ParquetDecodingException("Failed to read dictionary", e);
    }
  }).orElse(null);
}
 
Example #25
Source File: StatisticsFilter.java    From parquet-mr with Apache License 2.0
@Override
@SuppressWarnings("unchecked")
public <T extends Comparable<T>> Boolean visit(LtEq<T> ltEq) {
  Column<T> filterColumn = ltEq.getColumn();
  ColumnChunkMetaData meta = getColumnChunk(filterColumn.getColumnPath());

  if (meta == null) {
    // the column is missing and always null, which is never less than or
    // equal to a value. for all x, null is never <= x.
    return BLOCK_CANNOT_MATCH;
  }

  Statistics<T> stats = meta.getStatistics();

  if (stats.isEmpty()) {
    // we have no statistics available, we cannot drop any chunks
    return BLOCK_MIGHT_MATCH;
  }

  if (isAllNulls(meta)) {
    // we are looking for records where v <= someValue
    // this chunk is all nulls, so we can drop it
    return BLOCK_CANNOT_MATCH;
  }

  if (!stats.hasNonNullValue()) {
    // stats does not contain min/max values, we cannot drop any chunks
    return BLOCK_MIGHT_MATCH;
  }

  T value = ltEq.getValue();

  // drop if value < min
  return stats.compareMinToValue(value) > 0;
}
 
Example #26
Source File: PageReader.java    From dremio-oss with Apache License 2.0
PageReader(ColumnReader<?> parentStatus, SeekableInputStream inputStream, Path path, ColumnChunkMetaData columnChunkMetaData) throws ExecutionSetupException {
  this.parentColumnReader = parentStatus;
  allocatedDictionaryBuffers = new ArrayList<>();
  codecFactory = parentColumnReader.parentReader.getCodecFactory();
  this.stats = parentColumnReader.parentReader.parquetReaderStats;
  long start = columnChunkMetaData.getFirstDataPageOffset();
  this.inputStream = inputStream;
  try {
    this.dataReader = new ColumnDataReader(inputStream, start, columnChunkMetaData.getTotalSize());
    loadDictionaryIfExists(parentStatus, columnChunkMetaData, inputStream);
  } catch (IOException e) {
    throw new ExecutionSetupException("Error opening or reading metadata for parquet file at location: "
      + path.getName(), e);
  }
}
 
Example #27
Source File: VectorizedArrowReader.java    From iceberg with Apache License 2.0
@Override
public void setRowGroupInfo(PageReadStore source, Map<ColumnPath, ColumnChunkMetaData> metadata) {
  ColumnChunkMetaData chunkMetaData = metadata.get(ColumnPath.get(columnDescriptor.getPath()));
  this.dictionary = vectorizedColumnIterator.setRowGroupInfo(
      source.getPageReader(columnDescriptor),
      !ParquetUtil.hasNonDictionaryPages(chunkMetaData));
}
 
Example #28
Source File: ParquetMetadataCommand.java    From parquet-mr with Apache License 2.0
private void printRowGroup(Logger console, int index, BlockMetaData rowGroup, MessageType schema) {
  long start = rowGroup.getStartingPos();
  long rowCount = rowGroup.getRowCount();
  long compressedSize = rowGroup.getCompressedSize();
  long uncompressedSize = rowGroup.getTotalByteSize();
  String filePath = rowGroup.getPath();

  console.info(String.format("\nRow group %d:  count: %d  %s records  start: %d  total: %s%s\n%s",
      index, rowCount,
      humanReadable(((float) compressedSize) / rowCount),
      start, humanReadable(compressedSize),
      filePath != null ? " path: " + filePath : "",
      new TextStringBuilder(80).appendPadding(80, '-')));

  int size = maxSize(Iterables.transform(rowGroup.getColumns(),
      new Function<ColumnChunkMetaData, String>() {
        @Override
        public String apply(@Nullable ColumnChunkMetaData input) {
          return input == null ? "" : input.getPath().toDotString();
        }
      }));

  console.info(String.format("%-" + size + "s  %-9s %-9s %-9s %-10s %-7s %s",
      "", "type", "encodings", "count", "avg size", "nulls", "min / max"));
  for (ColumnChunkMetaData column : rowGroup.getColumns()) {
    printColumnChunk(console, size, column, schema);
  }
}
 
Example #29
Source File: DictionaryFilter.java    From parquet-mr with Apache License 2.0
@Override
public <T extends Comparable<T>> Boolean visit(Lt<T> lt) {
  Column<T> filterColumn = lt.getColumn();
  ColumnChunkMetaData meta = getColumnChunk(filterColumn.getColumnPath());

  if (meta == null) {
    // the column is missing and always null, which is never less than a
    // value. for all x, null is never < x.
    return BLOCK_CANNOT_MATCH;
  }

  // if the chunk has non-dictionary pages, don't bother decoding the
  // dictionary because the row group can't be eliminated.
  if (hasNonDictionaryPages(meta)) {
    return BLOCK_MIGHT_MATCH;
  }

  T value = lt.getValue();

  try {
    Set<T> dictSet = expandDictionary(meta);
    if (dictSet == null) {
      return BLOCK_MIGHT_MATCH;
    }

    Comparator<T> comparator = meta.getPrimitiveType().comparator();
    for (T entry : dictSet) {
      if (comparator.compare(value, entry) > 0) {
        return BLOCK_MIGHT_MATCH;
      }
    }

    return BLOCK_CANNOT_MATCH;
  } catch (IOException e) {
    LOG.warn("Failed to process dictionary for filter evaluation.", e);
  }

  return BLOCK_MIGHT_MATCH;
}