org.apache.parquet.format.DataPageHeaderV2 Java Examples

The following examples show how to use org.apache.parquet.format.DataPageHeaderV2, the Thrift-generated struct that describes the header of a Parquet v2 data page. Each example is taken from the source file and project noted above it.
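Before walking through the examples, here is a minimal sketch of how such a header is typically assembled. It mirrors Example #2 below; the counts and byte lengths are placeholder values, and the constructor argument order follows the Thrift-generated DataPageHeaderV2 class in parquet-format (values, nulls, rows, encoding, definition-levels length, repetition-levels length).

import org.apache.parquet.format.DataPageHeaderV2;
import org.apache.parquet.format.Encoding;
import org.apache.parquet.format.PageHeader;
import org.apache.parquet.format.PageType;

public class DataPageHeaderV2Sketch {
    // Builds a PageHeader that carries a DataPageHeaderV2, as Example #2 does.
    // All sizes and counts below are made-up placeholders, not read from a real file.
    static PageHeader buildHeader() {
        DataPageHeaderV2 dataHeader = new DataPageHeaderV2(
                1000,            // num_values
                10,              // num_nulls
                100,             // num_rows
                Encoding.PLAIN,  // encoding of the values section
                64,              // definition_levels_byte_length
                32);             // repetition_levels_byte_length
        dataHeader.setIs_compressed(true);   // optional field, defaults to true

        PageHeader pageHeader = new PageHeader(PageType.DATA_PAGE_V2,
                4096,   // uncompressed_page_size
                2048);  // compressed_page_size
        pageHeader.setData_page_header_v2(dataHeader);
        return pageHeader;
    }
}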
Example #1
Source File: ParquetColumnChunk.java    From presto with Apache License 2.0
private long readDataPageV2(
        PageHeader pageHeader,
        int uncompressedPageSize,
        int compressedPageSize,
        List<DataPage> pages)
{
    DataPageHeaderV2 dataHeaderV2 = pageHeader.getData_page_header_v2();
    int dataSize = compressedPageSize - dataHeaderV2.getRepetition_levels_byte_length() - dataHeaderV2.getDefinition_levels_byte_length();
    pages.add(new DataPageV2(
            dataHeaderV2.getNum_rows(),
            dataHeaderV2.getNum_nulls(),
            dataHeaderV2.getNum_values(),
            getSlice(dataHeaderV2.getRepetition_levels_byte_length()),
            getSlice(dataHeaderV2.getDefinition_levels_byte_length()),
            getParquetEncoding(Encoding.valueOf(dataHeaderV2.getEncoding().name())),
            getSlice(dataSize),
            uncompressedPageSize,
            MetadataReader.readStats(
                    fileCreatedBy,
                    Optional.ofNullable(dataHeaderV2.getStatistics()),
                    descriptor.getColumnDescriptor().getPrimitiveType()),
            dataHeaderV2.isIs_compressed()));
    return dataHeaderV2.getNum_values();
}
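In a v2 data page the repetition-level and definition-level bytes sit uncompressed at the front of the page body, so the reader slices them off first and treats only the remaining dataSize bytes as the values section; the is_compressed flag in the header records whether that remainder was actually compressed. The returned num_values lets the caller track how far it has advanced through the column chunk.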
 
Example #2
Source File: ParquetMetadataConverter.java    From parquet-mr with Apache License 2.0
private PageHeader newDataPageV2Header(
    int uncompressedSize, int compressedSize,
    int valueCount, int nullCount, int rowCount,
    org.apache.parquet.column.Encoding dataEncoding,
    int rlByteLength, int dlByteLength) {
  // TODO: pageHeader.crc = ...;
  DataPageHeaderV2 dataPageHeaderV2 = new DataPageHeaderV2(
      valueCount, nullCount, rowCount,
      getEncoding(dataEncoding),
      dlByteLength, rlByteLength);
  PageHeader pageHeader = new PageHeader(PageType.DATA_PAGE_V2, uncompressedSize, compressedSize);
  pageHeader.setData_page_header_v2(dataPageHeaderV2);
  return pageHeader;
}
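Note that the Thrift-generated constructor takes the definition-levels length before the repetition-levels length, which is why the call passes dlByteLength, rlByteLength even though the method parameters list rlByteLength first. The optional is_compressed and statistics fields are left unset here, and the CRC remains a TODO. Once built, the header can be serialized and parsed with the Util helpers from the same org.apache.parquet.format package; a minimal, hypothetical round trip (the stream variables are placeholders):

Util.writePageHeader(pageHeader, out);                        // write the Thrift struct to an OutputStream
PageHeader readBack = Util.readPageHeader(in);                // read it back from an InputStream
DataPageHeaderV2 header = readBack.getData_page_header_v2();  // null unless the page type is DATA_PAGE_V2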
 
Example #3
Source File: CompressionConveterTest.java    From parquet-mr with Apache License 2.0
private List<Long> getOffsets(TransParquetFileReader reader, ColumnChunkMetaData chunk) throws IOException {
  List<Long> offsets = new ArrayList<>();
  reader.setStreamPosition(chunk.getStartingPos());
  long readValues = 0;
  long totalChunkValues = chunk.getValueCount();
  while (readValues < totalChunkValues) {
    long curOffset = reader.getPos();
    PageHeader pageHeader = reader.readPageHeader();
    switch (pageHeader.type) {
      case DICTIONARY_PAGE:
        compressionConverter.readBlock(pageHeader.getCompressed_page_size(), reader);
        break;
      case DATA_PAGE:
        DataPageHeader headerV1 = pageHeader.data_page_header;
        offsets.add(curOffset);
        compressionConverter.readBlock(pageHeader.getCompressed_page_size(), reader);
        readValues += headerV1.getNum_values();
        break;
      case DATA_PAGE_V2:
        DataPageHeaderV2 headerV2 = pageHeader.data_page_header_v2;
        offsets.add(curOffset);
        int rlLength = headerV2.getRepetition_levels_byte_length();
        compressionConverter.readBlock(rlLength, reader);
        int dlLength = headerV2.getDefinition_levels_byte_length();
        compressionConverter.readBlock(dlLength, reader);
        int payLoadLength = pageHeader.getCompressed_page_size() - rlLength - dlLength;
        compressionConverter.readBlock(payLoadLength, reader);
        readValues += headerV2.getNum_values();
        break;
      default:
        throw new IOException("Not recognized page type");
    }
  }
  return offsets;
}
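The loop relies on num_values from each data page header to know when the whole column chunk has been consumed: dictionary pages are skipped without being recorded, while v1 and v2 data pages contribute their starting offset to the result. For v2 pages the repetition levels, definition levels, and payload are read as three separate blocks that together add up to compressed_page_size, matching how those sections are laid out on disk.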
 
Example #4
Source File: CompressionConverter.java    From parquet-mr with Apache License 2.0
private void processChunk(TransParquetFileReader reader, ParquetFileWriter writer, ColumnChunkMetaData chunk,
                          String createdBy, CompressionCodecName codecName) throws IOException {
  CompressionCodecFactory codecFactory = HadoopCodecs.newFactory(0);
  CompressionCodecFactory.BytesInputDecompressor decompressor = codecFactory.getDecompressor(chunk.getCodec());
  CompressionCodecFactory.BytesInputCompressor compressor = codecFactory.getCompressor(codecName);
  ColumnIndex columnIndex = reader.readColumnIndex(chunk);
  OffsetIndex offsetIndex = reader.readOffsetIndex(chunk);

  reader.setStreamPosition(chunk.getStartingPos());
  DictionaryPage dictionaryPage = null;
  long readValues = 0;
  Statistics statistics = null;
  ParquetMetadataConverter converter = new ParquetMetadataConverter();
  int pageIndex = 0;
  long totalChunkValues = chunk.getValueCount();
  while (readValues < totalChunkValues) {
    PageHeader pageHeader = reader.readPageHeader();
    int compressedPageSize = pageHeader.getCompressed_page_size();
    byte[] pageLoad;
    switch (pageHeader.type) {
      case DICTIONARY_PAGE:
        if (dictionaryPage != null) {
          throw new IOException("has more than one dictionary page in column chunk");
        }
        DictionaryPageHeader dictPageHeader = pageHeader.dictionary_page_header;
        pageLoad = translatePageLoad(reader, true, compressor, decompressor, pageHeader.getCompressed_page_size(), pageHeader.getUncompressed_page_size());
        writer.writeDictionaryPage(new DictionaryPage(BytesInput.from(pageLoad),
                                                 pageHeader.getUncompressed_page_size(),
                                                 dictPageHeader.getNum_values(),
                                                 converter.getEncoding(dictPageHeader.getEncoding())));
        break;
      case DATA_PAGE:
        DataPageHeader headerV1 = pageHeader.data_page_header;
        pageLoad = translatePageLoad(reader, true, compressor, decompressor, pageHeader.getCompressed_page_size(), pageHeader.getUncompressed_page_size());
        statistics = convertStatistics(createdBy, chunk.getPrimitiveType(), headerV1.getStatistics(), columnIndex, pageIndex, converter);
        readValues += headerV1.getNum_values();
        if (offsetIndex != null) {
          long rowCount = 1 + offsetIndex.getLastRowIndex(pageIndex, totalChunkValues) - offsetIndex.getFirstRowIndex(pageIndex);
          writer.writeDataPage(toIntWithCheck(headerV1.getNum_values()),
            pageHeader.getUncompressed_page_size(),
            BytesInput.from(pageLoad),
            statistics,
            toIntWithCheck(rowCount),
            converter.getEncoding(headerV1.getRepetition_level_encoding()),
            converter.getEncoding(headerV1.getDefinition_level_encoding()),
            converter.getEncoding(headerV1.getEncoding()));
        } else {
          writer.writeDataPage(toIntWithCheck(headerV1.getNum_values()),
            pageHeader.getUncompressed_page_size(),
            BytesInput.from(pageLoad),
            statistics,
            converter.getEncoding(headerV1.getRepetition_level_encoding()),
            converter.getEncoding(headerV1.getDefinition_level_encoding()),
            converter.getEncoding(headerV1.getEncoding()));
        }
        pageIndex++;
        break;
      case DATA_PAGE_V2:
        DataPageHeaderV2 headerV2 = pageHeader.data_page_header_v2;
        int rlLength = headerV2.getRepetition_levels_byte_length();
        BytesInput rlLevels = readBlockAllocate(rlLength, reader);
        int dlLength = headerV2.getDefinition_levels_byte_length();
        BytesInput dlLevels = readBlockAllocate(dlLength, reader);
        int payLoadLength = pageHeader.getCompressed_page_size() - rlLength - dlLength;
        int rawDataLength = pageHeader.getUncompressed_page_size() - rlLength - dlLength;
        pageLoad = translatePageLoad(reader, headerV2.is_compressed, compressor, decompressor, payLoadLength, rawDataLength);
        statistics = convertStatistics(createdBy, chunk.getPrimitiveType(), headerV2.getStatistics(), columnIndex, pageIndex, converter);
        readValues += headerV2.getNum_values();
        writer.writeDataPageV2(headerV2.getNum_rows(),
          headerV2.getNum_nulls(),
          headerV2.getNum_values(),
          rlLevels,
          dlLevels,
          converter.getEncoding(headerV2.getEncoding()),
          BytesInput.from(pageLoad),
          rawDataLength,
          statistics);
        pageIndex++;
        break;
      default:
        LOG.debug("skipping page of type {} of size {}", pageHeader.getType(), compressedPageSize);
        break;
    }
  }
}
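For v2 pages only the values section goes through translatePageLoad, because the level bytes are stored uncompressed and can be copied through readBlockAllocate unchanged; for v1 pages the entire page body is decompressed with the old codec and recompressed with the new one. When an offset index is available, the v1 branch also derives the per-page row count from it so the rewritten chunk retains accurate row-count information.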