org.apache.parquet.column.EncodingStats Java Examples

The following examples show how to use org.apache.parquet.column.EncodingStats. Each example is taken from an open source project; the source file and project are noted above it.
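Before diving in, a minimal sketch of the API surface these examples exercise, using only the builder and query methods that appear below (the encoding choices are illustrative, not taken from any particular file):

import org.apache.parquet.column.Encoding;
import org.apache.parquet.column.EncodingStats;

EncodingStats stats = new EncodingStats.Builder()
    .addDictEncoding(Encoding.PLAIN)            // the dictionary page itself is PLAIN-encoded
    .addDataEncoding(Encoding.PLAIN_DICTIONARY) // data pages reference the dictionary
    .build();

stats.hasDictionaryPages();           // true: a dictionary page was written
stats.hasDictionaryEncodedPages();    // true: some data pages use the dictionary
stats.hasNonDictionaryEncodedPages(); // false: no page fell back to plain encoding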
Example #1
Source File: PredicateUtils.java    From presto with Apache License 2.0
@VisibleForTesting
@SuppressWarnings("deprecation")
static boolean isOnlyDictionaryEncodingPages(ColumnChunkMetaData columnMetaData)
{
    // Files written with newer versions of Parquet libraries (e.g. parquet-mr 1.9.0) will have EncodingStats available
    // Otherwise, fall back to v1 logic
    EncodingStats stats = columnMetaData.getEncodingStats();
    if (stats != null) {
        return stats.hasDictionaryPages() && !stats.hasNonDictionaryEncodedPages();
    }

    Set<Encoding> encodings = columnMetaData.getEncodings();
    if (encodings.contains(PLAIN_DICTIONARY)) {
        // PLAIN_DICTIONARY was present, which means at least one page was
        // dictionary-encoded and 1.0 encodings are used
        // The only other allowed encodings are RLE and BIT_PACKED which are used for repetition or definition levels
        return Sets.difference(encodings, ImmutableSet.of(PLAIN_DICTIONARY, RLE, BIT_PACKED)).isEmpty();
    }

    return false;
}
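Note the two-level strategy here, which recurs throughout these examples: trust EncodingStats when the file carries it, and otherwise fall back to inspecting the coarser encoding set, which can only answer the question reliably for the 1.0 dictionary encoding (PLAIN_DICTIONARY).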
 
Example #2
Source File: DictionaryFilterTest.java    From parquet-mr with Apache License 2.0
private void testDictionaryEncodedColumnsV2() throws Exception {
  Set<String> dictionaryEncodedColumns = new HashSet<String>(Arrays.asList(
      "binary_field", "single_value_field", "optional_single_value_field", "fixed_field", "int32_field",
      "int64_field", "double_field", "float_field", "int96_field"));
  for (ColumnChunkMetaData column : ccmd) {
    EncodingStats encStats = column.getEncodingStats();
    String name = column.getPath().toDotString();
    if (dictionaryEncodedColumns.contains(name)) {
      assertTrue("Column should have dictionary pages: " + name, encStats.hasDictionaryPages());
      assertTrue("Column should have dictionary encoded pages: " + name, encStats.hasDictionaryEncodedPages());
      assertFalse("Column should not have non-dictionary encoded pages: " + name,
          encStats.hasNonDictionaryEncodedPages());
    } else {
      assertTrue("Column should have non-dictionary encoded pages: " + name,
          encStats.hasNonDictionaryEncodedPages());
      if (name.startsWith("fallback")) {
        assertTrue("Column should have dictionary pages: " + name, encStats.hasDictionaryPages());
        assertTrue("Column should have dictionary encoded pages: " + name, encStats.hasDictionaryEncodedPages());
      } else {
        assertFalse("Column should not have dictionary pages: " + name, encStats.hasDictionaryPages());
        assertFalse("Column should not have dictionary encoded pages: " + name, encStats.hasDictionaryEncodedPages());
      }
    }
  }
}
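The columns named with a "fallback" prefix are expected to show both dictionary pages and non-dictionary encoded pages: the writer evidently started out dictionary-encoding them and then fell back to plain encoding (presumably when the dictionary outgrew its size limit), which is exactly the mixed state EncodingStats exists to expose.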
 
Example #3
Source File: ParquetMetadataConverter.java    From parquet-mr with Apache License 2.0
public EncodingStats convertEncodingStats(List<PageEncodingStats> stats) {
  if (stats == null) {
    return null;
  }

  EncodingStats.Builder builder = new EncodingStats.Builder();
  for (PageEncodingStats stat : stats) {
    switch (stat.getPage_type()) {
      case DATA_PAGE_V2:
        builder.withV2Pages();
        // falls through
      case DATA_PAGE:
        builder.addDataEncoding(
            getEncoding(stat.getEncoding()), stat.getCount());
        break;
      case DICTIONARY_PAGE:
        builder.addDictEncoding(
            getEncoding(stat.getEncoding()), stat.getCount());
        break;
    }
  }
  return builder.build();
}
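A hedged usage sketch for this converter. It assumes the thrift-generated org.apache.parquet.format classes and their all-required-fields constructor (page_type, encoding, count); treat those signatures as assumptions, not verified API:

import java.util.Arrays;
import java.util.List;
import org.apache.parquet.format.PageEncodingStats;
import org.apache.parquet.format.PageType;

// Assumed thrift constructor: (page_type, encoding, count)
List<PageEncodingStats> pageStats = Arrays.asList(
    new PageEncodingStats(PageType.DICTIONARY_PAGE, org.apache.parquet.format.Encoding.PLAIN, 1),
    new PageEncodingStats(PageType.DATA_PAGE, org.apache.parquet.format.Encoding.PLAIN_DICTIONARY, 4));

EncodingStats stats = new ParquetMetadataConverter().convertEncodingStats(pageStats);
// stats.hasDictionaryPages() and stats.hasDictionaryEncodedPages() should both report true.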
 
Example #4
Source File: ParquetFileWriter.java    From parquet-mr with Apache License 2.0
/**
 * FOR TESTING ONLY. This supports testing block padding behavior on the local FS.
 *
 * @param configuration Hadoop configuration
 * @param schema the schema of the data
 * @param file the file to write to
 * @param rowAndBlockSize the row group size, also used as the block size for padding alignment
 * @param maxPaddingSize the maximum padding
 * @throws IOException if the file can not be created
 */
ParquetFileWriter(Configuration configuration, MessageType schema,
                  Path file, long rowAndBlockSize, int maxPaddingSize)
    throws IOException {
  FileSystem fs = file.getFileSystem(configuration);
  this.schema = schema;
  this.alignment = PaddingAlignment.get(
      rowAndBlockSize, rowAndBlockSize, maxPaddingSize);
  this.out = HadoopStreams.wrap(
      fs.create(file, true, 8192, fs.getDefaultReplication(file), rowAndBlockSize));
  this.encodingStatsBuilder = new EncodingStats.Builder();
  // no truncation is needed for testing
  this.columnIndexTruncateLength = Integer.MAX_VALUE;
  this.pageWriteChecksumEnabled = ParquetOutputFormat.getPageWriteChecksumEnabled(configuration);
  this.crc = pageWriteChecksumEnabled ? new CRC32() : null;
  this.metadataConverter = new ParquetMetadataConverter(ParquetProperties.DEFAULT_STATISTICS_TRUNCATE_LENGTH);
}
 
Example #5
Source File: ColumnChunkMetaData.java    From parquet-mr with Apache License 2.0
/**
 * @param path column identifier
 * @param type type of the column
 * @param codec the compression codec used to compress
 * @param encodingStats EncodingStats for the encodings used in this column
 * @param encodings a set of encodings used in this column
 * @param statistics statistics for the data in this column
 * @param firstDataPage offset of the first non-dictionary page
 * @param dictionaryPageOffset offset of the dictionary page
 * @param valueCount number of values
 * @param totalSize total compressed size
 * @param totalUncompressedSize uncompressed data size
 */
IntColumnChunkMetaData(
    ColumnPath path,
    PrimitiveType type,
    CompressionCodecName codec,
    EncodingStats encodingStats,
    Set<Encoding> encodings,
    Statistics statistics,
    long firstDataPage,
    long dictionaryPageOffset,
    long valueCount,
    long totalSize,
    long totalUncompressedSize) {
  super(encodingStats, ColumnChunkProperties.get(path, type, codec, encodings));
  this.firstDataPage = positiveLongToInt(firstDataPage);
  this.dictionaryPageOffset = positiveLongToInt(dictionaryPageOffset);
  this.valueCount = positiveLongToInt(valueCount);
  this.totalSize = positiveLongToInt(totalSize);
  this.totalUncompressedSize = positiveLongToInt(totalUncompressedSize);
  this.statistics = statistics;
}
 
Example #6
Source File: ColumnChunkMetaData.java    From parquet-mr with Apache License 2.0
/**
 * @param path column identifier
 * @param type type of the column
 * @param codec the compression codec used to compress
 * @param encodingStats EncodingStats for the encodings used in this column
 * @param encodings a set of encodings used in this column
 * @param statistics statistics for the data in this column
 * @param firstDataPageOffset offset of the first non-dictionary page
 * @param dictionaryPageOffset offset of the dictionary page
 * @param valueCount number of values
 * @param totalSize total compressed size
 * @param totalUncompressedSize uncompressed data size
 */
LongColumnChunkMetaData(
    ColumnPath path,
    PrimitiveType type,
    CompressionCodecName codec,
    EncodingStats encodingStats,
    Set<Encoding> encodings,
    Statistics statistics,
    long firstDataPageOffset,
    long dictionaryPageOffset,
    long valueCount,
    long totalSize,
    long totalUncompressedSize) {
  super(encodingStats, ColumnChunkProperties.get(path, type, codec, encodings));
  this.firstDataPageOffset = firstDataPageOffset;
  this.dictionaryPageOffset = dictionaryPageOffset;
  this.valueCount = valueCount;
  this.totalSize = totalSize;
  this.totalUncompressedSize = totalUncompressedSize;
  this.statistics = statistics;
}
 
Example #7
Source File: Util.java    From parquet-mr with Apache License 2.0
public static String encodingStatsAsString(EncodingStats encodingStats) {
  StringBuilder sb = new StringBuilder();
  if (encodingStats.hasDictionaryPages()) {
    for (Encoding encoding: encodingStats.getDictionaryEncodings()) {
      sb.append(encodingAsString(encoding, true));
    }
    sb.append(" ");
  } else {
    sb.append("  ");
  }

  Set<Encoding> encodings = encodingStats.getDataEncodings();
  if (encodings.contains(RLE_DICTIONARY) || encodings.contains(PLAIN_DICTIONARY)) {
    sb.append("R");
  }
  if (encodings.contains(PLAIN)) {
    sb.append("_");
  }
  if (encodings.contains(DELTA_BYTE_ARRAY) ||
      encodings.contains(DELTA_BINARY_PACKED) ||
      encodings.contains(DELTA_LENGTH_BYTE_ARRAY)) {
    sb.append("D");
  }

  // Check for fallback and add a flag
  if (encodingStats.hasDictionaryEncodedPages() && encodingStats.hasNonDictionaryEncodedPages()) {
    sb.append(" F");
  }

  return sb.toString();
}
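A plausible way to drive this helper over a file footer. Here footer is a hypothetical ParquetMetadata (e.g. from ParquetFileReader.getFooter()), and encodingStatsAsString is assumed statically imported from Util:

for (BlockMetaData block : footer.getBlocks()) {
  for (ColumnChunkMetaData column : block.getColumns()) {
    EncodingStats stats = column.getEncodingStats();
    // Older files carry no page-level stats; guard for null as Example #8 does.
    String summary = (stats == null) ? "?" : encodingStatsAsString(stats);
    System.out.println(column.getPath().toDotString() + "  " + summary);
  }
}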
 
Example #8
Source File: ParquetMetadataCommand.java    From parquet-mr with Apache License 2.0
private void printColumnChunk(Logger console, int width, ColumnChunkMetaData column, MessageType schema) {
  String[] path = column.getPath().toArray();
  PrimitiveType type = primitive(schema, path);
  Preconditions.checkNotNull(type);

  ColumnDescriptor desc = schema.getColumnDescription(path);
  long size = column.getTotalSize();
  long count = column.getValueCount();
  float perValue = ((float) size) / count;
  CompressionCodecName codec = column.getCodec();
  Set<Encoding> encodings = column.getEncodings();
  EncodingStats encodingStats = column.getEncodingStats();
  String encodingSummary = encodingStats == null ?
      encodingsAsString(encodings, desc) :
      encodingStatsAsString(encodingStats);
  Statistics stats = column.getStatistics();

  String name = column.getPath().toDotString();

  PrimitiveType.PrimitiveTypeName typeName = type.getPrimitiveTypeName();
  if (typeName == PrimitiveType.PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY) {
    console.info(String.format("%-" + width + "s  FIXED[%d] %s %-7s %-9d %-8s %-7s %s",
        name, type.getTypeLength(), shortCodec(codec), encodingSummary, count,
        humanReadable(perValue), stats == null || !stats.isNumNullsSet() ? "" : String.valueOf(stats.getNumNulls()),
        minMaxAsString(stats)));
  } else {
    console.info(String.format("%-" + width + "s  %-9s %s %-7s %-9d %-10s %-7s %s",
        name, typeName, shortCodec(codec), encodingSummary, count, humanReadable(perValue),
        stats == null || !stats.isNumNullsSet() ? "" : String.valueOf(stats.getNumNulls()),
        minMaxAsString(stats)));
  }
}
 
Example #9
Source File: TestParquetMetadataConverter.java    From parquet-mr with Apache License 2.0
private static ParquetMetadata createParquetMetaData(Encoding dicEncoding,
  Encoding dataEncoding) {
  MessageType schema =
    parseMessageType("message schema { optional int32 col (INT_32); }");
  org.apache.parquet.hadoop.metadata.FileMetaData fileMetaData =
    new org.apache.parquet.hadoop.metadata.FileMetaData(schema,
      new HashMap<String, String>(), null);
  List<BlockMetaData> blockMetaDataList = new ArrayList<BlockMetaData>();
  BlockMetaData blockMetaData = new BlockMetaData();
  EncodingStats.Builder builder = new EncodingStats.Builder();
  if (dicEncoding != null) {
    builder.addDictEncoding(dicEncoding); // build once at the end; no need to build() here
  }
  builder.addDataEncoding(dataEncoding);
  EncodingStats es = builder.build();
  Set<org.apache.parquet.column.Encoding> e =
    new HashSet<org.apache.parquet.column.Encoding>();
  PrimitiveTypeName t = PrimitiveTypeName.INT32;
  ColumnPath p = ColumnPath.get("col");
  CompressionCodecName c = CompressionCodecName.UNCOMPRESSED;
  BinaryStatistics s = new BinaryStatistics();
  ColumnChunkMetaData md =
    ColumnChunkMetaData.get(p, t, c, es, e, s, 20, 30, 0, 0, 0);
  blockMetaData.addColumn(md);
  blockMetaDataList.add(blockMetaData);
  return new ParquetMetadata(fileMetaData, blockMetaDataList);
}
 
Example #10
Source File: DictionaryFilter.java    From parquet-mr with Apache License 2.0
@SuppressWarnings("deprecation")
private static boolean hasNonDictionaryPages(ColumnChunkMetaData meta) {
  EncodingStats stats = meta.getEncodingStats();
  if (stats != null) {
    return stats.hasNonDictionaryEncodedPages();
  }

  // without EncodingStats, fall back to testing the encoding list
  Set<Encoding> encodings = new HashSet<Encoding>(meta.getEncodings());
  if (encodings.remove(Encoding.PLAIN_DICTIONARY)) {
    // if remove returned true, PLAIN_DICTIONARY was present, which means at
    // least one page was dictionary encoded and 1.0 encodings are used

    // RLE and BIT_PACKED are only used for repetition or definition levels
    encodings.remove(Encoding.RLE);
    encodings.remove(Encoding.BIT_PACKED);

    if (encodings.isEmpty()) {
      return false; // no encodings other than dictionary or rep/def levels
    }

    return true;

  } else {
    // if PLAIN_DICTIONARY wasn't present, then either the column is not
    // dictionary-encoded, or the 2.0 encoding, RLE_DICTIONARY, was used.
    // for 2.0, this cannot determine whether a page fell back without
    // page encoding stats
    return true;
  }
}
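Note that both inconclusive branches return true: when the metadata cannot prove that every page is dictionary-encoded, the method errs on the side of reporting non-dictionary pages, so callers will simply decline to filter on the dictionary rather than skip data incorrectly.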
 
Example #11
Source File: ParquetFileWriter.java    From parquet-mr with Apache License 2.0
/**
 * @param file OutputFile to create or overwrite
 * @param schema the schema of the data
 * @param mode file creation mode
 * @param rowGroupSize the row group size
 * @param maxPaddingSize the maximum padding
 * @param columnIndexTruncateLength the length to which min/max values in column indexes are truncated
 * @param statisticsTruncateLength the length to which min/max values in row-group statistics are truncated
 * @param pageWriteChecksumEnabled whether to write out page level checksums
 * @throws IOException if the file can not be created
 */
public ParquetFileWriter(OutputFile file, MessageType schema, Mode mode,
                         long rowGroupSize, int maxPaddingSize, int columnIndexTruncateLength,
                         int statisticsTruncateLength, boolean pageWriteChecksumEnabled)
  throws IOException {
  TypeUtil.checkValidWriteSchema(schema);

  this.schema = schema;

  long blockSize = rowGroupSize;
  if (file.supportsBlockSize()) {
    blockSize = Math.max(file.defaultBlockSize(), rowGroupSize);
    this.alignment = PaddingAlignment.get(blockSize, rowGroupSize, maxPaddingSize);
  } else {
    this.alignment = NoAlignment.get(rowGroupSize);
  }

  if (mode == Mode.OVERWRITE) {
    this.out = file.createOrOverwrite(blockSize);
  } else {
    this.out = file.create(blockSize);
  }

  this.encodingStatsBuilder = new EncodingStats.Builder();
  this.columnIndexTruncateLength = columnIndexTruncateLength;
  this.pageWriteChecksumEnabled = pageWriteChecksumEnabled;
  this.crc = pageWriteChecksumEnabled ? new CRC32() : null;

  this.metadataConverter = new ParquetMetadataConverter(statisticsTruncateLength);
}
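The alignment choice is worth noting: when the output file reports a native block size (as HDFS does), the writer rounds the block size up to at least that value and pads row groups toward block boundaries (up to maxPaddingSize); on filesystems with no meaningful block size it writes row groups back to back.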
 
Example #12
Source File: ColumnChunkMetaData.java    From parquet-mr with Apache License 2.0
public static ColumnChunkMetaData get(
    ColumnPath path,
    PrimitiveType type,
    CompressionCodecName codec,
    EncodingStats encodingStats,
    Set<Encoding> encodings,
    Statistics statistics,
    long firstDataPage,
    long dictionaryPageOffset,
    long valueCount,
    long totalSize,
    long totalUncompressedSize) {
  // to save space we store those always positive longs in ints when they fit.
  if (positiveLongFitsInAnInt(firstDataPage)
      && positiveLongFitsInAnInt(dictionaryPageOffset)
      && positiveLongFitsInAnInt(valueCount)
      && positiveLongFitsInAnInt(totalSize)
      && positiveLongFitsInAnInt(totalUncompressedSize)) {
    return new IntColumnChunkMetaData(
        path, type, codec,
        encodingStats, encodings,
        statistics,
        firstDataPage,
        dictionaryPageOffset,
        valueCount,
        totalSize,
        totalUncompressedSize);
  } else {
    return new LongColumnChunkMetaData(
        path, type, codec,
        encodingStats, encodings,
        statistics,
        firstDataPage,
        dictionaryPageOffset,
        valueCount,
        totalSize,
        totalUncompressedSize);
  }
}
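Example #5 above funnels these values through positiveLongToInt. A sketch of the offset trick that conversion presumably relies on; the helper bodies here are reconstructions for illustration, not verbatim parquet-mr source:

// Store a non-negative long in an int by shifting the range [0, 2^32 - 1]
// onto [Integer.MIN_VALUE, Integer.MAX_VALUE], doubling the usable range.
protected static boolean positiveLongFitsInAnInt(long value) {
  return (value >= 0) && (value + Integer.MIN_VALUE <= Integer.MAX_VALUE);
}

private static int positiveLongToInt(long value) {
  if (!positiveLongFitsInAnInt(value)) {
    throw new IllegalArgumentException("value should be positive and fit in an int: " + value);
  }
  return (int) (value + Integer.MIN_VALUE);
}

private static long intToPositiveLong(int value) {
  return (long) value - Integer.MIN_VALUE;
}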
 
Example #13
Source File: DictionaryPageReader.java    From parquet-mr with Apache License 2.0
private boolean hasDictionaryPage(ColumnChunkMetaData column) {
  EncodingStats stats = column.getEncodingStats();
  if (stats != null) {
    // ensure there is a dictionary page and that it is used to encode data pages
    return stats.hasDictionaryPages() && stats.hasDictionaryEncodedPages();
  }

  Set<Encoding> encodings = column.getEncodings();
  return (encodings.contains(PLAIN_DICTIONARY) || encodings.contains(RLE_DICTIONARY));
}
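The encodings-based fallback accepts either dictionary encoding name because 1.0 writers use PLAIN_DICTIONARY while 2.0 writers use RLE_DICTIONARY; unlike the stricter check in Example #1, this guard only needs to know whether a dictionary page exists at all, not whether every data page uses it.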
 
Example #14
Source File: ParquetDictionaryRowGroupFilter.java    From iceberg with Apache License 2.0
@SuppressWarnings("deprecation")
private static boolean hasNonDictionaryPages(ColumnChunkMetaData meta) {
  EncodingStats stats = meta.getEncodingStats();
  if (stats != null) {
    return stats.hasNonDictionaryEncodedPages();
  }

  // without EncodingStats, fall back to testing the encoding list
  Set<Encoding> encodings = new HashSet<Encoding>(meta.getEncodings());
  if (encodings.remove(Encoding.PLAIN_DICTIONARY)) {
    // if remove returned true, PLAIN_DICTIONARY was present, which means at
    // least one page was dictionary encoded and 1.0 encodings are used

    // RLE and BIT_PACKED are only used for repetition or definition levels
    encodings.remove(Encoding.RLE);
    encodings.remove(Encoding.BIT_PACKED);

    if (encodings.isEmpty()) {
      return false; // no encodings other than dictionary or rep/def levels
    }

    return true;

  } else {
    // if PLAIN_DICTIONARY wasn't present, then either the column is not
    // dictionary-encoded, or the 2.0 encoding, RLE_DICTIONARY, was used.
    // for 2.0, this cannot determine whether a page fell back without
    // page encoding stats
    return true;
  }
}
 
Example #15
Source File: ParquetUtil.java    From iceberg with Apache License 2.0
@SuppressWarnings("deprecation")
public static boolean hasNonDictionaryPages(ColumnChunkMetaData meta) {
  EncodingStats stats = meta.getEncodingStats();
  if (stats != null) {
    return stats.hasNonDictionaryEncodedPages();
  }

  // without EncodingStats, fall back to testing the encoding list
  Set<Encoding> encodings = new HashSet<Encoding>(meta.getEncodings());
  if (encodings.remove(Encoding.PLAIN_DICTIONARY)) {
    // if remove returned true, PLAIN_DICTIONARY was present, which means at
    // least one page was dictionary encoded and 1.0 encodings are used

    // RLE and BIT_PACKED are only used for repetition or definition levels
    encodings.remove(Encoding.RLE);
    encodings.remove(Encoding.BIT_PACKED);

    // when empty, no encodings other than dictionary or rep/def levels
    return !encodings.isEmpty();
  } else {
    // if PLAIN_DICTIONARY wasn't present, then either the column is not
    // dictionary-encoded, or the 2.0 encoding, RLE_DICTIONARY, was used.
    // for 2.0, this cannot determine whether a page fell back without
    // page encoding stats
    return true;
  }
}
 
Example #16
Source File: TestPredicateUtils.java    From presto with Apache License 2.0
private ColumnChunkMetaData createColumnMetaDataV2(Encoding... dataEncodings)
{
    EncodingStats encodingStats = new EncodingStats.Builder()
            .withV2Pages()
            .addDictEncoding(PLAIN)
            .addDataEncodings(ImmutableSet.copyOf(dataEncodings)).build();

    return ColumnChunkMetaData.get(fromDotString("column"), BINARY, UNCOMPRESSED, encodingStats, encodingStats.getDataEncodings(), new BinaryStatistics(), 0, 0, 1, 1, 1);
}
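For contrast, a hypothetical v1 counterpart (not part of the quoted test): passing null EncodingStats forces isOnlyDictionaryEncodingPages from Example #1 onto its encoding-set fallback path:

private ColumnChunkMetaData createColumnMetaDataV1(Encoding... encodings)
{
    // null EncodingStats simulates a file written before page-level stats existed
    return ColumnChunkMetaData.get(fromDotString("column"), BINARY, UNCOMPRESSED, null,
            ImmutableSet.copyOf(encodings), new BinaryStatistics(), 0, 0, 1, 1, 1);
}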
 
Example #17
Source File: ColumnChunkMetaData.java    From parquet-mr with Apache License 2.0
public EncodingStats getEncodingStats() {
  return encodingStats;
}
 
Example #18
Source File: ColumnChunkMetaData.java    From parquet-mr with Apache License 2.0
protected ColumnChunkMetaData(EncodingStats encodingStats, ColumnChunkProperties columnChunkProperties) {
  this.encodingStats = encodingStats;
  this.properties = columnChunkProperties;
}
 
Example #19
Source File: ColumnChunkMetaData.java    From parquet-mr with Apache License 2.0
/**
 * @param path the path of this column in the write schema
 * @param type primitive type for this column
 * @param codec the compression codec used to compress
 * @param encodingStats EncodingStats for the encodings used in this column
 * @param encodings a set of encodings used in this column
 * @param statistics statistics for the data in this column
 * @param firstDataPage offset of the first non-dictionary page
 * @param dictionaryPageOffset offset of the dictionary page
 * @param valueCount number of values
 * @param totalSize total compressed size
 * @param totalUncompressedSize uncompressed data size
 * @return a column chunk metadata instance
 * @deprecated will be removed in 2.0.0. Use
 *             {@link #get(ColumnPath, PrimitiveType, CompressionCodecName, EncodingStats, Set, Statistics, long, long, long, long, long)}
 *             instead.
 */
@Deprecated
public static ColumnChunkMetaData get(
    ColumnPath path,
    PrimitiveTypeName type,
    CompressionCodecName codec,
    EncodingStats encodingStats,
    Set<Encoding> encodings,
    Statistics statistics,
    long firstDataPage,
    long dictionaryPageOffset,
    long valueCount,
    long totalSize,
    long totalUncompressedSize) {
  return get(path, Types.optional(type).named("fake_type"), codec, encodingStats, encodings, statistics,
      firstDataPage, dictionaryPageOffset, valueCount, totalSize, totalUncompressedSize);
}
 
Example #20
Source File: TestReadWriteEncodingStats.java    From parquet-mr with Apache License 2.0
@Test
public void testReadWrite() throws Exception {
  File file = temp.newFile("encoding-stats.parquet");
  assertTrue(file.delete());
  Path path = new Path(file.toString());

  ParquetWriter<Group> writer = ExampleParquetWriter.builder(path)
      .withWriterVersion(PARQUET_1_0)
      .withPageSize(1024) // ensure multiple pages are written
      .enableDictionaryEncoding()
      .withDictionaryPageSize(2 * 1024) // small dictionary so the fallback column can overflow it
      .withConf(CONF)
      .withType(SCHEMA)
      .build();
  writeData(writer);
  writer.close();

  ParquetFileReader reader = ParquetFileReader.open(CONF, path);
  assertEquals("Should have one row group", 1, reader.getRowGroups().size());
  BlockMetaData rowGroup = reader.getRowGroups().get(0);

  ColumnChunkMetaData dictColumn = rowGroup.getColumns().get(0);
  EncodingStats dictStats = dictColumn.getEncodingStats();
  assertNotNull("Dict column should have non-null encoding stats", dictStats);
  assertTrue("Dict column should have a dict page", dictStats.hasDictionaryPages());
  assertTrue("Dict column should have dict-encoded pages", dictStats.hasDictionaryEncodedPages());
  assertFalse("Dict column should not have non-dict pages", dictStats.hasNonDictionaryEncodedPages());

  ColumnChunkMetaData plainColumn = rowGroup.getColumns().get(1);
  EncodingStats plainStats = plainColumn.getEncodingStats();
  assertNotNull("Plain column should have non-null encoding stats", plainStats);
  assertFalse("Plain column should not have a dict page", plainStats.hasDictionaryPages());
  assertFalse("Plain column should not have dict-encoded pages", plainStats.hasDictionaryEncodedPages());
  assertTrue("Plain column should have non-dict pages", plainStats.hasNonDictionaryEncodedPages());

  ColumnChunkMetaData fallbackColumn = rowGroup.getColumns().get(2);
  EncodingStats fallbackStats = fallbackColumn.getEncodingStats();
  assertNotNull("Fallback column should have non-null encoding stats", fallbackStats);
  assertTrue("Fallback column should have a dict page", fallbackStats.hasDictionaryPages());
  assertTrue("Fallback column should have dict-encoded pages", fallbackStats.hasDictionaryEncodedPages());
  assertTrue("Fallback column should have non-dict pages", fallbackStats.hasNonDictionaryEncodedPages());
}