Java Code Examples for org.apache.parquet.hadoop.metadata.ColumnChunkMetaData#getEncodings()

The following examples show how to use org.apache.parquet.hadoop.metadata.ColumnChunkMetaData#getEncodings(). You can vote up the examples you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also check out the related API usage on the sidebar.
Example 1
Source File: PredicateUtils.java    From presto with Apache License 2.0 6 votes vote down vote up
@VisibleForTesting
@SuppressWarnings("deprecation")
static boolean isOnlyDictionaryEncodingPages(ColumnChunkMetaData columnMetaData)
{
    // Prefer EncodingStats, written by newer Parquet libraries (e.g. parquet-mr 1.9.0+):
    // it records per-page encodings and answers the question directly.
    EncodingStats stats = columnMetaData.getEncodingStats();
    if (stats != null) {
        return stats.hasDictionaryPages() && !stats.hasNonDictionaryEncodedPages();
    }

    // Fallback for older files: inspect the chunk-level encoding set (v1 logic).
    Set<Encoding> encodings = columnMetaData.getEncodings();
    if (!encodings.contains(PLAIN_DICTIONARY)) {
        // Without PLAIN_DICTIONARY we cannot prove all data pages were dictionary-encoded
        return false;
    }

    // PLAIN_DICTIONARY present: at least one page was dictionary-encoded under 1.0
    // encodings. The only other acceptable encodings are RLE and BIT_PACKED, which
    // encode repetition/definition levels rather than data values.
    return Sets.difference(encodings, ImmutableSet.of(PLAIN_DICTIONARY, RLE, BIT_PACKED)).isEmpty();
}
 
Example 2
Source File: ParquetUtil.java    From iceberg with Apache License 2.0 5 votes vote down vote up
@SuppressWarnings("deprecation")
public static boolean hasNonDictionaryPages(ColumnChunkMetaData meta) {
  // When page-level EncodingStats are present, they answer directly.
  EncodingStats stats = meta.getEncodingStats();
  if (stats != null) {
    return stats.hasNonDictionaryEncodedPages();
  }

  // No EncodingStats: fall back to inspecting the chunk-level encoding set.
  Set<Encoding> remaining = new HashSet<Encoding>(meta.getEncodings());
  if (!remaining.remove(Encoding.PLAIN_DICTIONARY)) {
    // PLAIN_DICTIONARY absent: the column is either not dictionary-encoded at
    // all, or uses the 2.0 RLE_DICTIONARY encoding. Without page stats we
    // cannot tell whether any 2.0 page fell back, so report conservatively.
    return true;
  }

  // PLAIN_DICTIONARY was present, so at least one page was dictionary-encoded
  // and 1.0 encodings are in use. RLE and BIT_PACKED only encode repetition or
  // definition levels, so strip them before checking for data-page encodings.
  remaining.remove(Encoding.RLE);
  remaining.remove(Encoding.BIT_PACKED);

  // Anything left over is a non-dictionary data-page encoding.
  return !remaining.isEmpty();
}
 
Example 3
Source File: ParquetDictionaryRowGroupFilter.java    From iceberg with Apache License 2.0 5 votes vote down vote up
@SuppressWarnings("deprecation")
private static boolean hasNonDictionaryPages(ColumnChunkMetaData meta) {
  // EncodingStats (written by newer Parquet libraries) answer this directly.
  EncodingStats stats = meta.getEncodingStats();
  if (stats != null) {
    return stats.hasNonDictionaryEncodedPages();
  }

  // Older files: approximate the answer from the chunk-level encoding list.
  Set<Encoding> dataEncodings = new HashSet<Encoding>(meta.getEncodings());
  boolean usedPlainDictionary = dataEncodings.remove(Encoding.PLAIN_DICTIONARY);
  if (!usedPlainDictionary) {
    // Either the column is not dictionary-encoded, or it uses the 2.0 encoding
    // RLE_DICTIONARY; without page encoding stats a fallback page cannot be
    // ruled out, so assume non-dictionary pages exist.
    return true;
  }

  // PLAIN_DICTIONARY present means at least one page was dictionary-encoded
  // and 1.0 encodings are in use. RLE and BIT_PACKED are only used for
  // repetition or definition levels, so discard them.
  dataEncodings.remove(Encoding.RLE);
  dataEncodings.remove(Encoding.BIT_PACKED);

  // Any remaining encoding belongs to a non-dictionary data page.
  return !dataEncodings.isEmpty();
}
 
Example 4
Source File: DictionaryPageReader.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
private boolean hasDictionaryPage(ColumnChunkMetaData column) {
  EncodingStats stats = column.getEncodingStats();
  if (stats == null) {
    // No page-level stats: infer from the chunk-level encoding set.
    // Either dictionary encoding (1.0 PLAIN_DICTIONARY or 2.0 RLE_DICTIONARY)
    // implies a dictionary page exists.
    Set<Encoding> encodings = column.getEncodings();
    return encodings.contains(PLAIN_DICTIONARY) || encodings.contains(RLE_DICTIONARY);
  }
  // Require both that a dictionary page exists and that data pages actually use it.
  return stats.hasDictionaryPages() && stats.hasDictionaryEncodedPages();
}
 
Example 5
Source File: DictionaryFilter.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
@SuppressWarnings("deprecation")
private static boolean hasNonDictionaryPages(ColumnChunkMetaData meta) {
  EncodingStats encodingStats = meta.getEncodingStats();
  if (encodingStats == null) {
    // No per-page stats: use the (deprecated) chunk-level encoding heuristic.
    Set<Encoding> leftover = new HashSet<Encoding>(meta.getEncodings());
    if (leftover.remove(Encoding.PLAIN_DICTIONARY)) {
      // PLAIN_DICTIONARY was present: at least one page was dictionary-encoded
      // and 1.0 encodings are in use. RLE and BIT_PACKED only encode repetition
      // or definition levels, so remove them; anything left is a data-page
      // encoding other than dictionary.
      leftover.remove(Encoding.RLE);
      leftover.remove(Encoding.BIT_PACKED);
      return !leftover.isEmpty();
    }
    // PLAIN_DICTIONARY absent: the column is either not dictionary-encoded,
    // or uses the 2.0 RLE_DICTIONARY encoding. Without page encoding stats we
    // cannot determine whether a page fell back, so answer conservatively.
    return true;
  }
  return encodingStats.hasNonDictionaryEncodedPages();
}
 
Example 6
Source File: ParquetMetadataCommand.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
// Prints a one-line summary of a column chunk (type, codec, encodings, counts,
// sizes, null count, and min/max) to the console, left-padded to `width`.
private void printColumnChunk(Logger console, int width, ColumnChunkMetaData column, MessageType schema) {
  String[] path = column.getPath().toArray();
  PrimitiveType type = primitive(schema, path);
  Preconditions.checkNotNull(type);

  ColumnDescriptor desc = schema.getColumnDescription(path);
  long size = column.getTotalSize();
  long count = column.getValueCount();
  float perValue = ((float) size) / count;
  CompressionCodecName codec = column.getCodec();
  Set<Encoding> encodings = column.getEncodings();
  EncodingStats encodingStats = column.getEncodingStats();
  // Prefer the page-level stats summary when available; otherwise summarize
  // the chunk-level encoding set.
  String encodingSummary;
  if (encodingStats != null) {
    encodingSummary = encodingStatsAsString(encodingStats);
  } else {
    encodingSummary = encodingsAsString(encodings, desc);
  }
  Statistics stats = column.getStatistics();
  // Blank when statistics are missing or the null count was never recorded.
  String nullCount = (stats == null || !stats.isNumNullsSet()) ? "" : String.valueOf(stats.getNumNulls());

  String name = column.getPath().toDotString();

  PrimitiveType.PrimitiveTypeName typeName = type.getPrimitiveTypeName();
  if (typeName == PrimitiveType.PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY) {
    // Fixed-width binary shows its declared length instead of a type name.
    console.info(String.format("%-" + width + "s  FIXED[%d] %s %-7s %-9d %-8s %-7s %s",
        name, type.getTypeLength(), shortCodec(codec), encodingSummary, count,
        humanReadable(perValue), nullCount, minMaxAsString(stats)));
  } else {
    console.info(String.format("%-" + width + "s  %-9s %s %-7s %-9d %-10s %-7s %s",
        name, typeName, shortCodec(codec), encodingSummary, count, humanReadable(perValue),
        nullCount, minMaxAsString(stats)));
  }
}