Java Code Examples for org.apache.parquet.hadoop.metadata.ColumnChunkMetaData#getStatistics()

The following examples show how to use org.apache.parquet.hadoop.metadata.ColumnChunkMetaData#getStatistics(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: PredicateUtils.java    From presto with Apache License 2.0 5 votes vote down vote up
/**
 * Gathers per-column statistics for one block.
 *
 * <p>A column chunk contributes an entry only when it reports non-null statistics and
 * {@code descriptorsByPath} has a descriptor registered under the chunk's path.
 *
 * @param blockMetadata block whose column chunks are inspected
 * @param descriptorsByPath descriptors keyed by the column path as a list of strings
 * @return an immutable map from descriptor to that column chunk's statistics
 */
private static Map<ColumnDescriptor, Statistics<?>> getStatistics(BlockMetaData blockMetadata, Map<List<String>, RichColumnDescriptor> descriptorsByPath)
{
    ImmutableMap.Builder<ColumnDescriptor, Statistics<?>> result = ImmutableMap.builder();
    for (ColumnChunkMetaData chunk : blockMetadata.getColumns()) {
        Statistics<?> chunkStatistics = chunk.getStatistics();
        if (chunkStatistics == null) {
            // nothing recorded for this chunk
            continue;
        }
        RichColumnDescriptor descriptor = descriptorsByPath.get(Arrays.asList(chunk.getPath().toArray()));
        if (descriptor == null) {
            // no descriptor registered for this path
            continue;
        }
        result.put(descriptor, chunkStatistics);
    }
    return result.build();
}
 
Example 2
Source File: MetadataUtils.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Writes a one-line summary of a column chunk to {@code out}.
 *
 * <p>The line lists, in order: type, codec, dictionary page offset (DO), first data page
 * offset (FPO), total/uncompressed size with their ratio (SZ), value count (VC), the
 * encodings (ENC, only when non-empty) and the statistics (ST).
 *
 * @param out writer receiving the formatted line
 * @param meta column chunk to describe
 * @param name when true, the dotted column path is printed first
 */
private static void showDetails(PrettyPrintWriter out, ColumnChunkMetaData meta, boolean name) {
  long dictionaryOffset = meta.getDictionaryPageOffset();
  long firstDataOffset = meta.getFirstDataPageOffset();
  long totalSize = meta.getTotalSize();
  long uncompressedSize = meta.getTotalUncompressedSize();
  long valueCount = meta.getValueCount();
  // uncompressed-to-stored size ratio, computed in floating point
  double sizeRatio = (double) uncompressedSize / totalSize;
  String encodingList = Joiner.on(',').skipNulls().join(meta.getEncodings());

  if (name) {
    out.format("%s: ", Joiner.on('.').skipNulls().join(meta.getPath()));
  }

  out.format(" %s", meta.getType());
  out.format(" %s", meta.getCodec());
  out.format(" DO:%d", dictionaryOffset);
  out.format(" FPO:%d", firstDataOffset);
  out.format(" SZ:%d/%d/%.2f", totalSize, uncompressedSize, sizeRatio);
  out.format(" VC:%d", valueCount);
  if (!encodingList.isEmpty()) {
    out.format(" ENC:%s", encodingList);
  }

  Statistics<?> statistics = meta.getStatistics();
  if (statistics == null) {
    out.format(" ST:[none]");
  } else {
    out.format(" ST:[%s]", statistics);
  }
  out.println();
}
 
Example 3
Source File: MetadataUtils.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Prints a single-line description of {@code meta} to {@code out}.
 *
 * <p>Fields are emitted as: type, codec, {@code DO} (dictionary page offset), {@code FPO}
 * (first data page offset), {@code SZ} (total size / uncompressed size / ratio),
 * {@code VC} (value count), {@code ENC} (encodings, omitted when empty) and {@code ST}
 * (statistics, or {@code none} when absent).
 *
 * @param out destination writer
 * @param meta column chunk being described
 * @param name whether to prefix the line with the dotted column path
 */
private static void showDetails(PrettyPrintWriter out, ColumnChunkMetaData meta, boolean name) {
  long dictOffset = meta.getDictionaryPageOffset();
  long dataOffset = meta.getFirstDataPageOffset();
  long storedBytes = meta.getTotalSize();
  long rawBytes = meta.getTotalUncompressedSize();
  long values = meta.getValueCount();
  double rawToStored = (double) rawBytes / storedBytes;
  String joinedEncodings = Joiner.on(',').skipNulls().join(meta.getEncodings());

  if (name) {
    String dottedPath = Joiner.on('.').skipNulls().join(meta.getPath());
    out.format("%s: ", dottedPath);
  }

  out.format(" %s", meta.getType());
  out.format(" %s", meta.getCodec());
  out.format(" DO:%d", dictOffset);
  out.format(" FPO:%d", dataOffset);
  out.format(" SZ:%d/%d/%.2f", storedBytes, rawBytes, rawToStored);
  out.format(" VC:%d", values);
  if (!joinedEncodings.isEmpty()) {
    out.format(" ENC:%s", joinedEncodings);
  }

  Statistics<?> chunkStats = meta.getStatistics();
  if (chunkStats == null) {
    out.format(" ST:[none]");
  } else {
    out.format(" ST:[%s]", chunkStats);
  }
  out.println();
}
 
Example 4
Source File: StatisticsFilter.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Decides from the column chunk statistics whether a {@code column < value} predicate can
 * be ruled out for the whole block.
 *
 * @param lt the less-than predicate
 * @return {@code BLOCK_CANNOT_MATCH} when the column is missing or the chunk is all nulls,
 *         {@code BLOCK_MIGHT_MATCH} when statistics or min/max are unavailable, otherwise
 *         whether the chunk minimum is greater than or equal to the predicate value
 */
@Override
@SuppressWarnings("unchecked")
public <T extends Comparable<T>> Boolean visit(Lt<T> lt) {
  ColumnChunkMetaData chunk = getColumnChunk(lt.getColumn().getColumnPath());
  if (chunk == null) {
    // a missing column is always null, and null < x never holds
    return BLOCK_CANNOT_MATCH;
  }

  Statistics<T> chunkStats = chunk.getStatistics();
  if (chunkStats.isEmpty()) {
    // no statistics recorded: nothing can be dropped
    return BLOCK_MIGHT_MATCH;
  } else if (isAllNulls(chunk)) {
    // v < someValue cannot hold for a chunk made only of nulls
    return BLOCK_CANNOT_MATCH;
  } else if (!chunkStats.hasNonNullValue()) {
    // min/max are not present: nothing can be dropped
    return BLOCK_MIGHT_MATCH;
  }

  T bound = lt.getValue();
  int minVersusBound = chunkStats.compareMinToValue(bound);
  // drop if value <= min
  return minVersusBound >= 0;
}
 
Example 5
Source File: StatisticsFilter.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Decides from the column chunk statistics whether a {@code column <= value} predicate can
 * be ruled out for the whole block.
 *
 * @param ltEq the less-than-or-equal predicate
 * @return {@code BLOCK_CANNOT_MATCH} when the column is missing or the chunk is all nulls,
 *         {@code BLOCK_MIGHT_MATCH} when statistics or min/max are unavailable, otherwise
 *         whether the chunk minimum is strictly greater than the predicate value
 */
@Override
@SuppressWarnings("unchecked")
public <T extends Comparable<T>> Boolean visit(LtEq<T> ltEq) {
  ColumnChunkMetaData chunk = getColumnChunk(ltEq.getColumn().getColumnPath());
  if (chunk == null) {
    // a missing column is always null, and null <= x never holds
    return BLOCK_CANNOT_MATCH;
  }

  Statistics<T> chunkStats = chunk.getStatistics();
  if (chunkStats.isEmpty()) {
    // no statistics recorded: nothing can be dropped
    return BLOCK_MIGHT_MATCH;
  } else if (isAllNulls(chunk)) {
    // v <= someValue cannot hold for a chunk made only of nulls
    return BLOCK_CANNOT_MATCH;
  } else if (!chunkStats.hasNonNullValue()) {
    // min/max are not present: nothing can be dropped
    return BLOCK_MIGHT_MATCH;
  }

  T bound = ltEq.getValue();
  int minVersusBound = chunkStats.compareMinToValue(bound);
  // drop if value < min
  return minVersusBound > 0;
}
 
Example 6
Source File: StatisticsFilter.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Decides from the column chunk statistics whether a {@code column > value} predicate can
 * be ruled out for the whole block.
 *
 * @param gt the greater-than predicate
 * @return {@code BLOCK_CANNOT_MATCH} when the column is missing or the chunk is all nulls,
 *         {@code BLOCK_MIGHT_MATCH} when statistics or min/max are unavailable, otherwise
 *         whether the chunk maximum is less than or equal to the predicate value
 */
@Override
@SuppressWarnings("unchecked")
public <T extends Comparable<T>> Boolean visit(Gt<T> gt) {
  ColumnChunkMetaData chunk = getColumnChunk(gt.getColumn().getColumnPath());
  if (chunk == null) {
    // a missing column is always null, and null > x never holds
    return BLOCK_CANNOT_MATCH;
  }

  Statistics<T> chunkStats = chunk.getStatistics();
  if (chunkStats.isEmpty()) {
    // no statistics recorded: nothing can be dropped
    return BLOCK_MIGHT_MATCH;
  } else if (isAllNulls(chunk)) {
    // v > someValue cannot hold for a chunk made only of nulls
    return BLOCK_CANNOT_MATCH;
  } else if (!chunkStats.hasNonNullValue()) {
    // min/max are not present: nothing can be dropped
    return BLOCK_MIGHT_MATCH;
  }

  T bound = gt.getValue();
  int maxVersusBound = chunkStats.compareMaxToValue(bound);
  // drop if value >= max
  return maxVersusBound <= 0;
}
 
Example 7
Source File: StatisticsFilter.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Decides from the column chunk statistics whether a {@code column >= value} predicate can
 * be ruled out for the whole block.
 *
 * @param gtEq the greater-than-or-equal predicate
 * @return {@code BLOCK_CANNOT_MATCH} when the column is missing or the chunk is all nulls,
 *         {@code BLOCK_MIGHT_MATCH} when statistics or min/max are unavailable, otherwise
 *         whether the chunk maximum is strictly less than the predicate value
 */
@Override
@SuppressWarnings("unchecked")
public <T extends Comparable<T>> Boolean visit(GtEq<T> gtEq) {
  ColumnChunkMetaData chunk = getColumnChunk(gtEq.getColumn().getColumnPath());
  if (chunk == null) {
    // a missing column is always null, and null >= x never holds
    return BLOCK_CANNOT_MATCH;
  }

  Statistics<T> chunkStats = chunk.getStatistics();
  if (chunkStats.isEmpty()) {
    // no statistics recorded: nothing can be dropped
    return BLOCK_MIGHT_MATCH;
  } else if (isAllNulls(chunk)) {
    // v >= someValue cannot hold for a chunk made only of nulls
    return BLOCK_CANNOT_MATCH;
  } else if (!chunkStats.hasNonNullValue()) {
    // min/max are not present: nothing can be dropped
    return BLOCK_MIGHT_MATCH;
  }

  T bound = gtEq.getValue();
  int maxVersusBound = chunkStats.compareMaxToValue(bound);
  // drop if value > max
  return maxVersusBound < 0;
}
 
Example 8
Source File: ParquetMetadataCommand.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Logs one formatted line describing a column chunk.
 *
 * <p>The line contains the dotted column name (left-aligned to {@code width}), the type
 * (with the type length for {@code FIXED_LEN_BYTE_ARRAY}), the codec, an encoding summary,
 * the value count, the size per value, the null count (blank when unknown) and the
 * min/max rendering of the statistics.
 *
 * @param console logger receiving the line at info level
 * @param width column width used to left-align the name
 * @param column column chunk to describe
 * @param schema schema used to resolve the chunk's primitive type and descriptor
 */
private void printColumnChunk(Logger console, int width, ColumnChunkMetaData column, MessageType schema) {
  String[] columnPath = column.getPath().toArray();
  PrimitiveType primitiveType = primitive(schema, columnPath);
  Preconditions.checkNotNull(primitiveType);

  ColumnDescriptor descriptor = schema.getColumnDescription(columnPath);
  long totalSize = column.getTotalSize();
  long valueCount = column.getValueCount();
  float bytesPerValue = ((float) totalSize) / valueCount;
  CompressionCodecName codec = column.getCodec();
  Set<Encoding> encodings = column.getEncodings();
  EncodingStats encodingStats = column.getEncodingStats();

  // prefer the detailed encoding stats; fall back to the plain encoding set
  String encodingSummary;
  if (encodingStats != null) {
    encodingSummary = encodingStatsAsString(encodingStats);
  } else {
    encodingSummary = encodingsAsString(encodings, descriptor);
  }

  Statistics stats = column.getStatistics();
  String dottedName = column.getPath().toDotString();

  // blank when there are no statistics or the null count was never set
  String nullCount = (stats != null && stats.isNumNullsSet())
      ? String.valueOf(stats.getNumNulls())
      : "";

  PrimitiveType.PrimitiveTypeName typeName = primitiveType.getPrimitiveTypeName();
  String line;
  if (typeName == PrimitiveType.PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY) {
    line = String.format("%-" + width + "s  FIXED[%d] %s %-7s %-9d %-8s %-7s %s",
        dottedName, primitiveType.getTypeLength(), shortCodec(codec), encodingSummary, valueCount,
        humanReadable(bytesPerValue), nullCount, minMaxAsString(stats));
  } else {
    line = String.format("%-" + width + "s  %-9s %s %-7s %-9d %-10s %-7s %s",
        dottedName, typeName, shortCodec(codec), encodingSummary, valueCount,
        humanReadable(bytesPerValue), nullCount, minMaxAsString(stats));
  }
  console.info(line);
}
 
Example 9
Source File: ParquetDictionaryRowGroupFilter.java    From iceberg with Apache License 2.0 4 votes vote down vote up
/**
 * Reports whether the column chunk could hold null values.
 *
 * @param meta column chunk to inspect
 * @return {@code true} when the chunk has no statistics, or when its reported null count is
 *         anything other than zero
 */
private static boolean mayContainNull(ColumnChunkMetaData meta) {
  if (meta.getStatistics() == null) {
    // without statistics nulls cannot be ruled out
    return true;
  }
  return meta.getStatistics().getNumNulls() != 0;
}
 
Example 10
Source File: Metadata.java    From dremio-oss with Apache License 2.0 4 votes vote down vote up
/**
 * Reads the Parquet footer of {@code file} and converts it into a {@link ParquetFileMetadata}.
 *
 * <p>For every row group (block) in the footer a {@code RowGroupMetadata} is built that
 * carries the starting position, the summed size of its column chunks, the row count, the
 * host affinity and one {@code ColumnMetadata} per column chunk. A column's value is
 * recorded only when its statistics are usable and min equals max; otherwise the value
 * (and, when statistics are unusable, the null count as well) is left {@code null}.
 *
 * @param file the file whose footer is read
 * @param currentNumSplits running split counter; incremented by the number of blocks in this file
 * @param maxSplits upper bound for the running split counter
 * @return the metadata collected for the file
 * @throws TooManySplitsException if the running split count exceeds {@code maxSplits}
 * @throws IOException declared by this method; presumably raised while reading the footer
 */
private ParquetFileMetadata getParquetFileMetadata(FileAttributes file, AtomicInteger currentNumSplits, long maxSplits) throws IOException {
  final ParquetMetadata metadata =
    SingletonParquetFooterCache.readFooter(fs, file, ParquetMetadataConverter.NO_FILTER, maxFooterLength);
  final int numSplits = currentNumSplits.addAndGet(metadata.getBlocks().size());
  if (numSplits > maxSplits) {
    throw new TooManySplitsException(
      String.format("Too many splits encountered when processing parquet metadata at file %s, maximum is %d but encountered %d splits thus far.",
        file.getPath(), maxSplits, numSplits));
  }

  final MessageType schema = metadata.getFileMetaData().getSchema();

  // Original type of every leaf column, keyed by its compound schema path.
  Map<SchemaPath, OriginalType> originalTypeMap = Maps.newHashMap();
  for (String[] path : schema.getPaths()) {
    originalTypeMap.put(SchemaPath.getCompoundPath(path), getOriginalType(schema, path, 0));
  }

  List<RowGroupMetadata> rowGroupMetadataList = Lists.newArrayList();

  ArrayList<SchemaPath> allColumns = new ArrayList<>();
  allColumns.add(AbstractRecordReader.STAR_COLUMN);
  boolean autoCorrectCorruptDates = formatConfig.autoCorrectCorruptDates;
  ParquetReaderUtility.DateCorruptionStatus containsCorruptDates = ParquetReaderUtility.detectCorruptDates(metadata, allColumns, autoCorrectCorruptDates);
  if(logger.isDebugEnabled()){
    logger.debug(containsCorruptDates.toString());
  }
  final Map<ColumnTypeMetadata.Key, ColumnTypeMetadata> columnTypeInfo = Maps.newHashMap();
  int rowGroupIdx = 0;
  for (BlockMetaData rowGroup : metadata.getBlocks()) {
    List<ColumnMetadata> columnMetadataList = Lists.newArrayList();
    long length = 0;
    for (ColumnChunkMetaData col : rowGroup.getColumns()) {
      ColumnMetadata columnMetadata;

      Statistics<?> stats = col.getStatistics();

      // statistics might just have the non-null counts with no min/max they might be
      // initialized to zero instead of null.
      // check statistics actually have non null values (or) column has all nulls.
      // The null guard must cover BOTH alternatives: '&&' binds tighter than '||', so
      // without the explicit grouping the all-nulls check would dereference a null
      // statistics object.
      boolean statsAvailable = stats != null
        && ((!stats.isEmpty() && stats.hasNonNullValue())
          || stats.getNumNulls() == rowGroup.getRowCount());

      String[] columnName = col.getPath().toArray();
      SchemaPath columnSchemaName = SchemaPath.getCompoundPath(columnName);
      ColumnTypeMetadata columnTypeMetadata =
          new ColumnTypeMetadata(columnName, col.getType(), originalTypeMap.get(columnSchemaName));

      columnTypeInfo.put(new ColumnTypeMetadata.Key(columnTypeMetadata.name), columnTypeMetadata);
      if (statsAvailable) {
        // Write stats only if minVal==maxVal. Also, we then store only maxVal
        Object mxValue = null;
        if (stats.genericGetMax() != null && stats.genericGetMin() != null &&
            stats.genericGetMax().equals(stats.genericGetMin())) {
          mxValue = stats.genericGetMax();
          if (containsCorruptDates == ParquetReaderUtility.DateCorruptionStatus.META_SHOWS_CORRUPTION
              && columnTypeMetadata.originalType == OriginalType.DATE) {
            mxValue = ParquetReaderUtility.autoCorrectCorruptedDate((Integer) mxValue);
          }
        }
        columnMetadata =
            new ColumnMetadata(columnTypeMetadata.name, mxValue, stats.getNumNulls());
      } else {
        // log it under trace to avoid lot of log entries.
        logger.trace("Stats are not available for column {}, rowGroupIdx {}, file {}",
            columnSchemaName, rowGroupIdx, file.getPath());
        columnMetadata = new ColumnMetadata(columnTypeMetadata.name,null, null);
      }
      columnMetadataList.add(columnMetadata);
      length += col.getTotalSize();
    }

    RowGroupMetadata rowGroupMeta =
        new RowGroupMetadata(rowGroup.getStartingPos(), length, rowGroup.getRowCount(),
            getHostAffinity(fs, file, rowGroup.getStartingPos(), length), columnMetadataList);

    rowGroupMetadataList.add(rowGroupMeta);
    rowGroupIdx++;
  }

  return new ParquetFileMetadata(file, file.size(), rowGroupMetadataList, columnTypeInfo);
}
 
Example 11
Source File: StatisticsFilter.java    From parquet-mr with Apache License 2.0 4 votes vote down vote up
/**
 * Decides from the column chunk statistics whether a {@code column == value} predicate can
 * be ruled out for the whole block. The predicate value may be {@code null}, in which case
 * the question becomes whether the chunk holds any nulls.
 *
 * @param eq the equality predicate
 * @return {@code BLOCK_CANNOT_MATCH} or {@code BLOCK_MIGHT_MATCH} for the early exits,
 *         otherwise whether the value lies outside the chunk's [min, max] range (or, for a
 *         null value, whether the chunk has no nulls)
 */
@Override
@SuppressWarnings("unchecked")
public <T extends Comparable<T>> Boolean visit(Eq<T> eq) {
  ColumnChunkMetaData chunk = getColumnChunk(eq.getColumn().getColumnPath());
  T target = eq.getValue();

  if (chunk == null) {
    // the column isn't in this file so every value is null;
    // a non-null target can never equal null
    return target != null ? BLOCK_CANNOT_MATCH : BLOCK_MIGHT_MATCH;
  }

  Statistics<T> chunkStats = chunk.getStatistics();
  if (chunkStats.isEmpty()) {
    // no statistics recorded: nothing can be dropped
    return BLOCK_MIGHT_MATCH;
  }

  if (target == null) {
    if (!chunkStats.isNumNullsSet()) {
      // the null count of this chunk is unknown
      return BLOCK_MIGHT_MATCH;
    }
    // looking for v eq(null): drop when the chunk holds no nulls
    return !hasNulls(chunk);
  }

  if (isAllNulls(chunk)) {
    // looking for v eq(someNonNull) in a chunk made only of nulls
    return BLOCK_CANNOT_MATCH;
  } else if (!chunkStats.hasNonNullValue()) {
    // min/max are not present: nothing can be dropped
    return BLOCK_MIGHT_MATCH;
  }

  // drop if value < min || value > max
  boolean belowMin = chunkStats.compareMinToValue(target) > 0;
  return belowMin || chunkStats.compareMaxToValue(target) < 0;
}
 
Example 12
Source File: StatisticsFilter.java    From parquet-mr with Apache License 2.0 4 votes vote down vote up
/**
 * Decides from the column chunk statistics whether a {@code column != value} predicate can
 * be ruled out for the whole block. The predicate value may be {@code null}.
 *
 * @param notEq the inequality predicate
 * @return {@code BLOCK_CANNOT_MATCH} or {@code BLOCK_MIGHT_MATCH} for the early exits,
 *         otherwise whether both the chunk minimum and maximum equal the predicate value
 */
@Override
@SuppressWarnings("unchecked")
public <T extends Comparable<T>> Boolean visit(NotEq<T> notEq) {
  ColumnChunkMetaData chunk = getColumnChunk(notEq.getColumn().getColumnPath());
  T target = notEq.getValue();

  if (chunk == null) {
    // missing column: a null target is always equal to null, so nothing can match
    return target == null ? BLOCK_CANNOT_MATCH : BLOCK_MIGHT_MATCH;
  }

  Statistics<T> chunkStats = chunk.getStatistics();
  if (chunkStats.isEmpty()) {
    // no statistics recorded: nothing can be dropped
    return BLOCK_MIGHT_MATCH;
  }

  if (target == null) {
    // looking for v notEq(null): drop when the chunk is made only of nulls
    return isAllNulls(chunk);
  }

  if (chunkStats.isNumNullsSet() && hasNulls(chunk)) {
    // looking for v notEq(someNonNull) and the chunk holds nulls: keep it
    return BLOCK_MIGHT_MATCH;
  } else if (!chunkStats.hasNonNullValue()) {
    // min/max are not present: nothing can be dropped
    return BLOCK_MIGHT_MATCH;
  }

  // drop if this is a column where min = max = value
  boolean minEqualsTarget = chunkStats.compareMinToValue(target) == 0;
  return minEqualsTarget && chunkStats.compareMaxToValue(target) == 0;
}
 
Example 13
Source File: StatisticsFilter.java    From parquet-mr with Apache License 2.0 4 votes vote down vote up
/**
 * Evaluates a user-defined predicate, or its logical inverse, against the column chunk
 * statistics.
 *
 * <p>For a missing column or an all-null chunk the answer comes from
 * {@code acceptsNullValue()}; otherwise the chunk's min/max are handed to
 * {@code canDrop} / {@code inverseCanDrop}.
 *
 * @param ud the user-defined predicate wrapper
 * @param inverted whether the predicate is being evaluated in its inverted form
 * @return the drop decision for the block
 */
private <T extends Comparable<T>, U extends UserDefinedPredicate<T>> Boolean visit(UserDefined<T, U> ud, boolean inverted) {
  ColumnChunkMetaData chunk = getColumnChunk(ud.getColumn().getColumnPath());
  U predicate = ud.getUserDefinedPredicate();

  if (chunk == null) {
    // the column isn't in this file so every value is null;
    // ask the udp whether it keeps null or not
    return inverted ? predicate.acceptsNullValue() : !predicate.acceptsNullValue();
  }

  Statistics<T> chunkStats = chunk.getStatistics();
  if (chunkStats.isEmpty()) {
    // no statistics recorded: nothing can be dropped
    return BLOCK_MIGHT_MATCH;
  }

  if (isAllNulls(chunk)) {
    // only nulls here: ask the udp whether it keeps null or not
    return inverted ? predicate.acceptsNullValue() : !predicate.acceptsNullValue();
  }

  if (!chunkStats.hasNonNullValue()) {
    // min/max are not present: nothing can be dropped
    return BLOCK_MIGHT_MATCH;
  }

  // repackage min/max (and the comparator) in the form the predicate API takes
  org.apache.parquet.filter2.predicate.Statistics<T> minMax =
    new org.apache.parquet.filter2.predicate.Statistics<T>(
      chunkStats.genericGetMin(), chunkStats.genericGetMax(), chunkStats.comparator());

  return inverted ? predicate.inverseCanDrop(minMax) : predicate.canDrop(minMax);
}
 
Example 14
Source File: DictionaryFilter.java    From parquet-mr with Apache License 2.0 4 votes vote down vote up
/**
 * Decides from the column chunk's dictionary whether a {@code column != value} predicate
 * can be ruled out for the whole block.
 *
 * <p>The block is reported as unable to match only when the column is missing and the
 * value is null, or when the chunk is fully dictionary encoded, its dictionary consists
 * of exactly the predicate value, and the statistics show that the chunk has no nulls.
 *
 * @param notEq the inequality predicate
 * @return {@code BLOCK_CANNOT_MATCH} in the cases above, {@code BLOCK_MIGHT_MATCH} otherwise
 *         (including when reading the dictionary fails with an {@code IOException})
 */
@Override
public <T extends Comparable<T>> Boolean visit(NotEq<T> notEq) {
  ColumnChunkMetaData meta = getColumnChunk(notEq.getColumn().getColumnPath());
  T target = notEq.getValue();

  if (target == null) {
    // With the column missing, every row is null and null != null is always false.
    // Otherwise the dictionary contains only non-null values so isn't helpful; the
    // column stats could be checked, but the StatisticsFilter is responsible for that.
    return meta == null ? BLOCK_CANNOT_MATCH : BLOCK_MIGHT_MATCH;
  }

  if (meta == null) {
    // column is missing from this file, so it is always null and never equal to the
    // non-null test value: the predicate is true for all rows
    return BLOCK_MIGHT_MATCH;
  }

  if (hasNonDictionaryPages(meta)) {
    // some pages aren't dictionary encoded, so decoding the dictionary
    // cannot eliminate the row group
    return BLOCK_MIGHT_MATCH;
  }

  try {
    Set<T> dictionaryValues = expandDictionary(meta);
    // nulls are ruled out only when statistics exist, the null count is set and it is not positive
    boolean provablyNoNulls = meta.getStatistics() != null
        && meta.getStatistics().isNumNullsSet()
        && meta.getStatistics().getNumNulls() <= 0;
    boolean onlyTargetInDictionary = dictionaryValues != null
        && dictionaryValues.size() == 1
        && dictionaryValues.contains(target);
    if (onlyTargetInDictionary && provablyNoNulls) {
      return BLOCK_CANNOT_MATCH;
    }
  } catch (IOException e) {
    LOG.warn("Failed to process dictionary for filter evaluation.", e);
  }

  return BLOCK_MIGHT_MATCH;
}