Java Code Examples for org.apache.parquet.column.page.DictionaryPage#getDictionarySize()

The following examples show how to use org.apache.parquet.column.page.DictionaryPage#getDictionarySize(). You can vote up the ones you like or vote down the ones you don't like, and you can go to the original project or source file by following the links above each example. You may also check out the related API usage on the sidebar.
Example 1
Source File: ParquetFileReader.java    From parquet-mr with Apache License 2.0 6 votes vote down vote up
/**
 * Loads the dictionary page of a column chunk and returns it decompressed.
 *
 * Yields null when the chunk's encodings contain neither PLAIN_DICTIONARY nor
 * RLE_DICTIONARY, or when the page header read at the chunk's starting
 * position does not carry a dictionary page header.
 *
 * @param meta a column's ColumnChunkMetaData to read the dictionary from
 * @return the decompressed DictionaryPage, or null if the chunk has none
 * @throws IOException if there is an error while reading the dictionary
 */
DictionaryPage readDictionary(ColumnChunkMetaData meta) throws IOException {
  boolean dictionaryEncoded = meta.getEncodings().contains(Encoding.PLAIN_DICTIONARY)
      || meta.getEncodings().contains(Encoding.RLE_DICTIONARY);
  if (!dictionaryEncoded) {
    return null;
  }

  // TODO: this should use getDictionaryPageOffset() but it isn't reliable.
  long chunkStart = meta.getStartingPos();
  if (f.getPos() != chunkStart) {
    // only reposition the stream when it is not already at the chunk start
    f.seek(chunkStart);
  }

  PageHeader header = Util.readPageHeader(f);
  if (!header.isSetDictionary_page_header()) {
    return null; // TODO: should this complain?
  }

  // the page is read in its compressed form first, then inflated with the chunk's codec
  DictionaryPage compressed = readCompressedDictionary(header, f);
  BytesInputDecompressor codecDecompressor =
      options.getCodecFactory().getDecompressor(meta.getCodec());

  return new DictionaryPage(
      codecDecompressor.decompress(compressed.getBytes(), compressed.getUncompressedSize()),
      compressed.getDictionarySize(),
      compressed.getEncoding());
}
 
Example 2
Source File: ShowPagesCommand.java    From parquet-mr with Apache License 2.0 6 votes vote down vote up
/**
 * Formats a one-line summary of a dictionary page: row group number, page
 * number (omitted when {@code pageNum} is 0), codec, encoding, number of
 * dictionary entries, bytes per entry and total size.
 *
 * Reads {@code rowGroupNum}, {@code pageNum} and {@code shortCodec} from the
 * enclosing scope.
 *
 * @param dict the dictionary page to describe
 * @return the formatted summary line
 */
private String printDictionaryPage(DictionaryPage dict) {
  // TODO: the compressed size of a dictionary page is lost in Parquet
  long totalSize = dict.getCompressedSize();
  int count = dict.getDictionarySize();
  // float division: a zero count produces NaN/Infinity instead of throwing
  float perValue = ((float) totalSize) / count;
  String enc = encodingAsString(dict.getEncoding(), true);
  if (pageNum == 0) {
    return String.format("%3d-D    %-5s %s %-2s %-7d %-10s %-10s",
        rowGroupNum, "dict", shortCodec, enc, count, humanReadable(perValue),
        humanReadable(totalSize));
  } else {
    return String.format("%3d-%-3d  %-5s %s %-2s %-7d %-10s %-10s",
        rowGroupNum, pageNum, "dict", shortCodec, enc, count, humanReadable(perValue),
        humanReadable(totalSize));
  }
}
 
Example 3
Source File: ParquetColumnChunkPageWriteStore.java    From Bats with Apache License 2.0 5 votes vote down vote up
/**
 * Compresses the given dictionary page and keeps a copy of the compressed
 * bytes as this writer's dictionary page.
 *
 * @param dictionaryPage the uncompressed dictionary page to store
 * @throws IOException declared by the interface; propagated from compression or copying
 * @throws ParquetEncodingException if a dictionary page was already written, or if
 *         the uncompressed page is larger than {@code Integer.MAX_VALUE} bytes
 */
@Override
public void writeDictionaryPage(DictionaryPage dictionaryPage) throws IOException {
  if (this.dictionaryPage != null) {
    throw new ParquetEncodingException("Only one dictionary page is allowed");
  }
  BytesInput dictionaryBytes = dictionaryPage.getBytes();
  // The size is narrowed to int below; reject sizes that would silently
  // truncate and record a wrong uncompressed size.
  long uncompressedLength = dictionaryBytes.size();
  if (uncompressedLength > Integer.MAX_VALUE) {
    throw new ParquetEncodingException(
        "Cannot write page larger than " + Integer.MAX_VALUE + " bytes: " + uncompressedLength);
  }
  int uncompressedSize = (int) uncompressedLength;
  BytesInput compressedBytes = compressor.compress(dictionaryBytes);
  // copy the compressed bytes before storing them rather than holding the compressor's result
  this.dictionaryPage = new DictionaryPage(BytesInput.copy(compressedBytes), uncompressedSize,
      dictionaryPage.getDictionarySize(), dictionaryPage.getEncoding());
}
 
Example 4
Source File: PlainValuesDictionary.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Builds a dictionary of {@link Binary} values from a {@link DictionaryPage}.
 *
 * When {@code length} is null, each value is read as a 4-byte little-endian
 * length followed by that many bytes. Otherwise every value is taken to be
 * exactly {@code length} bytes long.
 *
 * @param dictionaryPage a {@code DictionaryPage} of encoded binary values
 * @param length a fixed length of binary arrays, or null if not fixed
 * @throws IOException if there is an exception while decoding the dictionary page
 */
public PlainBinaryDictionary(DictionaryPage dictionaryPage, Integer length) throws IOException {
  super(dictionaryPage);
  final ByteBuffer pageBuffer = dictionaryPage.getBytes().toByteBuffer();
  final int entryCount = dictionaryPage.getDictionarySize();
  binaryDictionaryContent = new Binary[entryCount];
  // decoding starts at the buffer's current position
  int cursor = pageBuffer.position();
  if (length != null) {
    // fixed-length layout: values sit back to back, each `length` bytes long
    Preconditions.checkArgument(length > 0,
        "Invalid byte array length: " + length);
    final int fixedLength = length;
    for (int index = 0; index < entryCount; index++) {
      binaryDictionaryContent[index] =
          Binary.fromConstantByteBuffer(pageBuffer, cursor, fixedLength);
      cursor += fixedLength;
    }
  } else {
    // length-prefixed layout: size (4 bytes LE) followed by {size} bytes
    for (int index = 0; index < entryCount; index++) {
      final int valueLength = readIntLittleEndian(pageBuffer, cursor);
      // skip over the 4-byte length prefix to reach the value bytes
      final int valueStart = cursor + 4;
      binaryDictionaryContent[index] =
          Binary.fromConstantByteBuffer(pageBuffer, valueStart, valueLength);
      // the next entry begins right after this value
      cursor = valueStart + valueLength;
    }
  }
}
 
Example 5
Source File: PlainValuesDictionary.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Creates a dictionary of long values by decoding every entry of the page
 * with a {@code LongPlainValuesReader}.
 *
 * @param dictionaryPage a dictionary page of encoded long values
 * @throws IOException if there is an exception while decoding the dictionary page
 */
public PlainLongDictionary(DictionaryPage dictionaryPage) throws IOException {
  super(dictionaryPage);
  final ByteBufferInputStream pageStream = dictionaryPage.getBytes().toInputStream();
  final int entryCount = dictionaryPage.getDictionarySize();
  longDictionaryContent = new long[entryCount];
  final LongPlainValuesReader valuesReader = new LongPlainValuesReader();
  valuesReader.initFromPage(entryCount, pageStream);
  // entries are read in page order, one per dictionary slot
  for (int index = 0; index < entryCount; index++) {
    longDictionaryContent[index] = valuesReader.readLong();
  }
}
 
Example 6
Source File: PlainValuesDictionary.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Creates a dictionary of double values by decoding every entry of the page
 * with a {@code DoublePlainValuesReader}.
 *
 * @param dictionaryPage a dictionary page of encoded double values
 * @throws IOException if there is an exception while decoding the dictionary page
 */
public PlainDoubleDictionary(DictionaryPage dictionaryPage) throws IOException {
  super(dictionaryPage);
  final ByteBufferInputStream pageStream = dictionaryPage.getBytes().toInputStream();
  final int entryCount = dictionaryPage.getDictionarySize();
  doubleDictionaryContent = new double[entryCount];
  final DoublePlainValuesReader valuesReader = new DoublePlainValuesReader();
  valuesReader.initFromPage(entryCount, pageStream);
  // entries are read in page order, one per dictionary slot
  for (int index = 0; index < entryCount; index++) {
    doubleDictionaryContent[index] = valuesReader.readDouble();
  }
}
 
Example 7
Source File: PlainValuesDictionary.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Creates a dictionary of int values by decoding every entry of the page
 * with an {@code IntegerPlainValuesReader}.
 *
 * @param dictionaryPage a dictionary page of encoded integer values
 * @throws IOException if there is an exception while decoding the dictionary page
 */
public PlainIntegerDictionary(DictionaryPage dictionaryPage) throws IOException {
  super(dictionaryPage);
  final ByteBufferInputStream pageStream = dictionaryPage.getBytes().toInputStream();
  final int entryCount = dictionaryPage.getDictionarySize();
  intDictionaryContent = new int[entryCount];
  final IntegerPlainValuesReader valuesReader = new IntegerPlainValuesReader();
  valuesReader.initFromPage(entryCount, pageStream);
  // entries are read in page order, one per dictionary slot
  for (int index = 0; index < entryCount; index++) {
    intDictionaryContent[index] = valuesReader.readInteger();
  }
}
 
Example 8
Source File: PlainValuesDictionary.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Creates a dictionary of float values by decoding every entry of the page
 * with a {@code FloatPlainValuesReader}.
 *
 * @param dictionaryPage a dictionary page of encoded float values
 * @throws IOException if there is an exception while decoding the dictionary page
 */
public PlainFloatDictionary(DictionaryPage dictionaryPage) throws IOException {
  super(dictionaryPage);
  final ByteBufferInputStream pageStream = dictionaryPage.getBytes().toInputStream();
  final int entryCount = dictionaryPage.getDictionarySize();
  floatDictionaryContent = new float[entryCount];
  final FloatPlainValuesReader valuesReader = new FloatPlainValuesReader();
  valuesReader.initFromPage(entryCount, pageStream);
  // entries are read in page order, one per dictionary slot
  for (int index = 0; index < entryCount; index++) {
    floatDictionaryContent[index] = valuesReader.readFloat();
  }
}
 
Example 9
Source File: ColumnChunkPageWriteStore.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Compresses the given dictionary page and keeps a copy of the compressed
 * bytes as this writer's dictionary page.
 *
 * @param dictionaryPage the uncompressed dictionary page to store
 * @throws IOException declared by the interface; propagated from compression or copying
 * @throws ParquetEncodingException if a dictionary page was already written, or if
 *         the uncompressed page is larger than {@code Integer.MAX_VALUE} bytes
 */
@Override
public void writeDictionaryPage(DictionaryPage dictionaryPage) throws IOException {
  if (this.dictionaryPage != null) {
    throw new ParquetEncodingException("Only one dictionary page is allowed");
  }
  BytesInput dictionaryBytes = dictionaryPage.getBytes();
  // The size is narrowed to int below; reject sizes that would silently
  // truncate and record a wrong uncompressed size.
  long uncompressedLength = dictionaryBytes.size();
  if (uncompressedLength > Integer.MAX_VALUE) {
    throw new ParquetEncodingException(
        "Cannot write page larger than " + Integer.MAX_VALUE + " bytes: " + uncompressedLength);
  }
  int uncompressedSize = (int) uncompressedLength;
  BytesInput compressedBytes = compressor.compress(dictionaryBytes);
  // copy the compressed bytes before storing them rather than holding the compressor's result
  this.dictionaryPage = new DictionaryPage(BytesInput.copy(compressedBytes), uncompressedSize,
      dictionaryPage.getDictionarySize(), dictionaryPage.getEncoding());
}
 
Example 10
Source File: FileEncodingsIT.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Returns a new dictionary page backed by a byte-array copy of the given
 * page's bytes, with the same dictionary size and encoding.
 *
 * @param dict the page to copy, may be null
 * @return the copied page, or null if {@code dict} is null
 * @throws ParquetDecodingException if the page bytes cannot be read
 */
private static DictionaryPage reusableCopy(DictionaryPage dict) {
  if (dict != null) {
    try {
      // materialize the page contents as a byte array and wrap that array
      byte[] pageBytes = dict.getBytes().toByteArray();
      return new DictionaryPage(
          BytesInput.from(pageBytes), dict.getDictionarySize(), dict.getEncoding());
    } catch (IOException e) {
      throw new ParquetDecodingException("Cannot read dictionary", e);
    }
  }
  return null;
}
 
Example 11
Source File: DictionaryPageReader.java    From parquet-mr with Apache License 2.0 4 votes vote down vote up
/**
 * Returns a new dictionary page backed by a byte-array copy of the given
 * page's bytes, with the same dictionary size and encoding.
 *
 * @param dict the page to copy
 * @return the copied page
 * @throws IOException if the page bytes cannot be read
 */
private static DictionaryPage reusableCopy(DictionaryPage dict)
    throws IOException {
  // materialize the page contents as a byte array and wrap that array
  byte[] pageBytes = dict.getBytes().toByteArray();
  return new DictionaryPage(
      BytesInput.from(pageBytes), dict.getDictionarySize(), dict.getEncoding());
}