org.apache.parquet.format.PageHeader Java Examples

The following examples show how to use org.apache.parquet.format.PageHeader. They are extracted from open source projects; the source file and originating project are noted above each example.
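PageHeader is the Thrift-generated class behind the page metadata in a Parquet file, which is why its accessors carry underscores (getCompressed_page_size(), setData_page_header(...)). As a quick orientation before the project examples, here is a minimal, self-contained sketch of the basic round trip, closely mirroring Examples #2 and #17 below; the class name PageHeaderRoundTrip and the concrete sizes, counts, and encodings are illustrative only:

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;

import org.apache.parquet.format.DataPageHeader;
import org.apache.parquet.format.Encoding;
import org.apache.parquet.format.PageHeader;
import org.apache.parquet.format.PageType;
import org.apache.parquet.format.Util;

public class PageHeaderRoundTrip {
  public static void main(String[] args) throws IOException {
    // Build a DATA_PAGE header: 20 bytes uncompressed, 10 bytes compressed.
    PageHeader pageHeader = new PageHeader(PageType.DATA_PAGE, 20, 10);
    pageHeader.setData_page_header(new DataPageHeader(
        100,             // num_values
        Encoding.PLAIN,  // values encoding
        Encoding.RLE,    // definition-level encoding
        Encoding.RLE));  // repetition-level encoding

    // Serialize the header, then read it back the way a file reader would.
    ByteArrayOutputStream out = new ByteArrayOutputStream();
    Util.writePageHeader(pageHeader, out);
    PageHeader read = Util.readPageHeader(new ByteArrayInputStream(out.toByteArray()));
    System.out.println(read.getData_page_header().getNum_values()); // prints 100
  }
}

Util.writePageHeader and Util.readPageHeader wrap the Thrift compact protocol that Parquet uses on disk, so the same calls work against a real column chunk stream, as several of the readers below demonstrate.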
Example #1
Source File: PageReader.java    From Bats with Apache License 2.0
private void readDictionaryPage(final PageHeader pageHeader,
                                final ColumnReader<?> parentStatus) throws IOException {
  int compressedSize = pageHeader.getCompressed_page_size();
  int uncompressedSize = pageHeader.getUncompressed_page_size();

  final DrillBuf dictionaryData = readPage(pageHeader, compressedSize, uncompressedSize);
  allocatedDictionaryBuffers.add(dictionaryData);

  DictionaryPage page = new DictionaryPage(
      asBytesInput(dictionaryData, 0, uncompressedSize),
      pageHeader.uncompressed_page_size,
      pageHeader.dictionary_page_header.num_values,
      valueOf(pageHeader.dictionary_page_header.encoding.name()));

  this.dictionary = page.getEncoding().initDictionary(parentStatus.columnDescriptor, page);
}
 
Example #2
Source File: ParquetMetadataConverter.java    From parquet-mr with Apache License 2.0
private PageHeader newDataPageHeader(
    int uncompressedSize, int compressedSize,
    int valueCount,
    org.apache.parquet.column.Encoding rlEncoding,
    org.apache.parquet.column.Encoding dlEncoding,
    org.apache.parquet.column.Encoding valuesEncoding,
    int crc) {
  PageHeader pageHeader = new PageHeader(PageType.DATA_PAGE, uncompressedSize, compressedSize);
  pageHeader.setCrc(crc);
  pageHeader.setData_page_header(new DataPageHeader(
      valueCount,
      getEncoding(valuesEncoding),
      getEncoding(dlEncoding),
      getEncoding(rlEncoding)));
  return pageHeader;
}
 
Example #3
Source File: ParquetFileReader.java    From parquet-mr with Apache License 2.0
/**
 * Reads and decompresses a dictionary page for the given column chunk.
 *
 * Returns null if the given column chunk has no dictionary page.
 *
 * @param meta a column's ColumnChunkMetaData to read the dictionary from
 * @return an uncompressed DictionaryPage or null
 * @throws IOException if there is an error while reading the dictionary
 */
DictionaryPage readDictionary(ColumnChunkMetaData meta) throws IOException {
  if (!meta.getEncodings().contains(Encoding.PLAIN_DICTIONARY) &&
      !meta.getEncodings().contains(Encoding.RLE_DICTIONARY)) {
    return null;
  }

  // TODO: this should use getDictionaryPageOffset() but it isn't reliable.
  if (f.getPos() != meta.getStartingPos()) {
    f.seek(meta.getStartingPos());
  }

  PageHeader pageHeader = Util.readPageHeader(f);
  if (!pageHeader.isSetDictionary_page_header()) {
    return null; // TODO: should this complain?
  }

  DictionaryPage compressedPage = readCompressedDictionary(pageHeader, f);
  BytesInputDecompressor decompressor = options.getCodecFactory().getDecompressor(meta.getCodec());

  return new DictionaryPage(
      decompressor.decompress(compressedPage.getBytes(), compressedPage.getUncompressedSize()),
      compressedPage.getDictionarySize(),
      compressedPage.getEncoding());
}
 
Example #4
Source File: ParquetFileReader.java    From parquet-mr with Apache License 2.0
private DictionaryPage readCompressedDictionary(
    PageHeader pageHeader, SeekableInputStream fin) throws IOException {
  DictionaryPageHeader dictHeader = pageHeader.getDictionary_page_header();

  int uncompressedPageSize = pageHeader.getUncompressed_page_size();
  int compressedPageSize = pageHeader.getCompressed_page_size();

  byte[] dictPageBytes = new byte[compressedPageSize];
  fin.readFully(dictPageBytes);

  BytesInput bin = BytesInput.from(dictPageBytes);

  return new DictionaryPage(
      bin, uncompressedPageSize, dictHeader.getNum_values(),
      converter.getEncoding(dictHeader.getEncoding()));
}
 
Example #5
Source File: ParquetColumnChunk.java    From presto with Apache License 2.0
private long readDataPageV2(
        PageHeader pageHeader,
        int uncompressedPageSize,
        int compressedPageSize,
        List<DataPage> pages)
{
    DataPageHeaderV2 dataHeaderV2 = pageHeader.getData_page_header_v2();
    int dataSize = compressedPageSize - dataHeaderV2.getRepetition_levels_byte_length() - dataHeaderV2.getDefinition_levels_byte_length();
    pages.add(new DataPageV2(
            dataHeaderV2.getNum_rows(),
            dataHeaderV2.getNum_nulls(),
            dataHeaderV2.getNum_values(),
            getSlice(dataHeaderV2.getRepetition_levels_byte_length()),
            getSlice(dataHeaderV2.getDefinition_levels_byte_length()),
            getParquetEncoding(Encoding.valueOf(dataHeaderV2.getEncoding().name())),
            getSlice(dataSize),
            uncompressedPageSize,
            MetadataReader.readStats(
                    fileCreatedBy,
                    Optional.ofNullable(dataHeaderV2.getStatistics()),
                    descriptor.getColumnDescriptor().getPrimitiveType()),
            dataHeaderV2.isIs_compressed()));
    return dataHeaderV2.getNum_values();
}
 
Example #6
Source File: ParquetColumnChunk.java    From presto with Apache License 2.0
private long readDataPageV1(
        PageHeader pageHeader,
        int uncompressedPageSize,
        int compressedPageSize,
        List<DataPage> pages)
{
    DataPageHeader dataHeaderV1 = pageHeader.getData_page_header();
    pages.add(new DataPageV1(
            getSlice(compressedPageSize),
            dataHeaderV1.getNum_values(),
            uncompressedPageSize,
            getParquetEncoding(Encoding.valueOf(dataHeaderV1.getRepetition_level_encoding().name())),
            getParquetEncoding(Encoding.valueOf(dataHeaderV1.getDefinition_level_encoding().name())),
            getParquetEncoding(Encoding.valueOf(dataHeaderV1.getEncoding().name()))));
    return dataHeaderV1.getNum_values();
}
 
Example #7
Source File: ParquetFileReader.java    From parquet-mr with Apache License 2.0
protected PageHeader readPageHeader() throws IOException {
  PageHeader pageHeader;
  stream.mark(8192); // headers should not be larger than 8k
  try {
    pageHeader = Util.readPageHeader(stream);
  } catch (IOException e) {
    // This works around a bug where the compressedLength of the chunk
    // omits the size of the dictionary page header; we need this to keep
    // reading older files written with dictionaries. Usually 13 to 19 bytes
    // are missing, and if the last page is smaller than that, the page
    // header itself is truncated in the buffer.
    stream.reset(); // resetting the buffer to the position before we got the error
    LOG.info("completing the column chunk to read the page header");
    pageHeader = Util.readPageHeader(new SequenceInputStream(stream, f)); // trying again from the buffer + remainder of the stream.
  }
  return pageHeader;
}
 
Example #8
Source File: PredicateUtils.java    From presto with Apache License 2.0
private static Optional<DictionaryPage> readDictionaryPage(byte[] data, CompressionCodecName codecName)
{
    try {
        ByteArrayInputStream inputStream = new ByteArrayInputStream(data);
        PageHeader pageHeader = Util.readPageHeader(inputStream);

        if (pageHeader.type != PageType.DICTIONARY_PAGE) {
            return Optional.empty();
        }

        Slice compressedData = wrappedBuffer(data, data.length - inputStream.available(), pageHeader.getCompressed_page_size());
        DictionaryPageHeader dicHeader = pageHeader.getDictionary_page_header();
        ParquetEncoding encoding = getParquetEncoding(Encoding.valueOf(dicHeader.getEncoding().name()));
        int dictionarySize = dicHeader.getNum_values();

        return Optional.of(new DictionaryPage(decompress(codecName, compressedData, pageHeader.getUncompressed_page_size()), dictionarySize, encoding));
    }
    catch (IOException ignored) {
        return Optional.empty();
    }
}
 
Example #9
Source File: AsyncPageReader.java    From Bats with Apache License 2.0
private DrillBuf decompress(PageHeader pageHeader, DrillBuf compressedData) {
  DrillBuf pageDataBuf = null;
  Stopwatch timer = Stopwatch.createUnstarted();
  long timeToRead;
  int compressedSize = pageHeader.getCompressed_page_size();
  int uncompressedSize = pageHeader.getUncompressed_page_size();
  pageDataBuf = allocateTemporaryBuffer(uncompressedSize);
  try {
    timer.start();
    CompressionCodecName codecName = parentColumnReader.columnChunkMetaData.getCodec();
    ByteBuffer input = compressedData.nioBuffer(0, compressedSize);
    ByteBuffer output = pageDataBuf.nioBuffer(0, uncompressedSize);
    DecompressionHelper decompressionHelper = new DecompressionHelper(codecName);
    decompressionHelper.decompress(input, compressedSize, output, uncompressedSize);
    pageDataBuf.writerIndex(uncompressedSize);
    timeToRead = timer.elapsed(TimeUnit.NANOSECONDS);
    this.updateStats(pageHeader, "Decompress", 0, timeToRead, compressedSize, uncompressedSize);
  } catch (IOException e) {
    handleAndThrowException(e, "Error decompressing data.");
  }
  return pageDataBuf;
}
 
Example #10
Source File: PageReader.java    From dremio-oss with Apache License 2.0
private void readDictionaryPage(final PageHeader pageHeader,
                                final ColumnReader<?> parentStatus) throws IOException {
  int compressedSize = pageHeader.getCompressed_page_size();
  int uncompressedSize = pageHeader.getUncompressed_page_size();

  final ArrowBuf dictionaryData = allocateDictionaryBuffer(uncompressedSize);
  readPage(pageHeader, compressedSize, uncompressedSize, dictionaryData);
  DictionaryPage page = new DictionaryPage(
      asBytesInput(dictionaryData, 0, uncompressedSize),
      pageHeader.uncompressed_page_size,
      pageHeader.dictionary_page_header.num_values,
      valueOf(pageHeader.dictionary_page_header.encoding.name()));

  this.dictionary = page.getEncoding().initDictionary(parentStatus.columnDescriptor, page);
}
 
Example #11
Source File: ParquetMetadataConverter.java    From parquet-mr with Apache License 2.0
private PageHeader newDataPageHeader(
  int uncompressedSize, int compressedSize,
  int valueCount,
  org.apache.parquet.column.Encoding rlEncoding,
  org.apache.parquet.column.Encoding dlEncoding,
  org.apache.parquet.column.Encoding valuesEncoding) {
  PageHeader pageHeader = new PageHeader(PageType.DATA_PAGE, uncompressedSize, compressedSize);
  pageHeader.setData_page_header(new DataPageHeader(
    valueCount,
    getEncoding(valuesEncoding),
    getEncoding(dlEncoding),
    getEncoding(rlEncoding)));
  return pageHeader;
}
 
Example #12
Source File: ParquetMetadataConverter.java    From parquet-mr with Apache License 2.0
private PageHeader newDataPageV2Header(
    int uncompressedSize, int compressedSize,
    int valueCount, int nullCount, int rowCount,
    org.apache.parquet.column.Encoding dataEncoding,
    int rlByteLength, int dlByteLength) {
  // TODO: pageHeader.crc = ...;
  DataPageHeaderV2 dataPageHeaderV2 = new DataPageHeaderV2(
      valueCount, nullCount, rowCount,
      getEncoding(dataEncoding),
      dlByteLength, rlByteLength);
  PageHeader pageHeader = new PageHeader(PageType.DATA_PAGE_V2, uncompressedSize, compressedSize);
  pageHeader.setData_page_header_v2(dataPageHeaderV2);
  return pageHeader;
}
 
Example #13
Source File: ParquetMetadataConverter.java    From parquet-mr with Apache License 2.0
public void writeDictionaryPageHeader(
  int uncompressedSize, int compressedSize, int valueCount,
  org.apache.parquet.column.Encoding valuesEncoding, OutputStream to) throws IOException {
  PageHeader pageHeader = new PageHeader(PageType.DICTIONARY_PAGE, uncompressedSize, compressedSize);
  pageHeader.setDictionary_page_header(new DictionaryPageHeader(valueCount, getEncoding(valuesEncoding)));
  writePageHeader(pageHeader, to);
}
 
Example #14
Source File: ParquetMetadataConverter.java    From parquet-mr with Apache License 2.0
public void writeDictionaryPageHeader(
    int uncompressedSize, int compressedSize, int valueCount,
    org.apache.parquet.column.Encoding valuesEncoding, int crc, OutputStream to) throws IOException {
  PageHeader pageHeader = new PageHeader(PageType.DICTIONARY_PAGE, uncompressedSize, compressedSize);
  pageHeader.setCrc(crc);
  pageHeader.setDictionary_page_header(new DictionaryPageHeader(valueCount, getEncoding(valuesEncoding)));
  writePageHeader(pageHeader, to);
}
 
Example #15
Source File: CompressionConveterTest.java    From parquet-mr with Apache License 2.0
private List<Long> getOffsets(TransParquetFileReader reader, ColumnChunkMetaData chunk) throws IOException {
  List<Long> offsets = new ArrayList<>();
  reader.setStreamPosition(chunk.getStartingPos());
  long readValues = 0;
  long totalChunkValues = chunk.getValueCount();
  while (readValues < totalChunkValues) {
    long curOffset = reader.getPos();
    PageHeader pageHeader = reader.readPageHeader();
    switch (pageHeader.type) {
      case DICTIONARY_PAGE:
        compressionConverter.readBlock(pageHeader.getCompressed_page_size(), reader);
        break;
      case DATA_PAGE:
        DataPageHeader headerV1 = pageHeader.data_page_header;
        offsets.add(curOffset);
        compressionConverter.readBlock(pageHeader.getCompressed_page_size(), reader);
        readValues += headerV1.getNum_values();
        break;
      case DATA_PAGE_V2:
        DataPageHeaderV2 headerV2 = pageHeader.data_page_header_v2;
        offsets.add(curOffset);
        int rlLength = headerV2.getRepetition_levels_byte_length();
        compressionConverter.readBlock(rlLength, reader);
        int dlLength = headerV2.getDefinition_levels_byte_length();
        compressionConverter.readBlock(dlLength, reader);
        int payLoadLength = pageHeader.getCompressed_page_size() - rlLength - dlLength;
        compressionConverter.readBlock(payLoadLength, reader);
        readValues += headerV2.getNum_values();
        break;
      default:
        throw new IOException("Not recognized page type");
    }
  }
  return offsets;
}
 
Example #16
Source File: PageReader.java    From dremio-oss with Apache License 2.0
private void updateStats(PageHeader pageHeader, String op, long start, long time, long bytesin, long bytesout) {
  String pageType = "Data Page";
  if (pageHeader.type == PageType.DICTIONARY_PAGE) {
    pageType = "Dictionary Page";
  }
  logger.trace("ParquetTrace,{},{},{},{},{},{},{},{}", op, pageType.toString(),
      this.parentColumnReader.parentReader.fsPath,
      this.parentColumnReader.columnDescriptor.toString(), start, bytesin, bytesout, time);
  if (pageHeader.type != PageType.DICTIONARY_PAGE) {
    if (bytesin == bytesout) {
      this.stats.timePageLoads += time;
      this.stats.numPageLoads++;
      this.stats.totalPageReadBytes += bytesin;
    } else {
      this.stats.timePagesDecompressed += time;
      this.stats.numPagesDecompressed++;
      this.stats.totalDecompressedBytes += bytesin;
    }
  } else {
    if (bytesin == bytesout) {
      this.stats.timeDictPageLoads += time;
      this.stats.numDictPageLoads++;
      this.stats.totalDictPageReadBytes += bytesin;
    } else {
      this.stats.timeDictPagesDecompressed += time;
      this.stats.numDictPagesDecompressed++;
      this.stats.totalDictDecompressedBytes += bytesin;
    }
  }
}
 
Example #17
Source File: TestParquetMetadataConverter.java    From parquet-mr with Apache License 2.0
@Test
public void testPageHeader() throws IOException {
  ByteArrayOutputStream out = new ByteArrayOutputStream();
  PageType type = PageType.DATA_PAGE;
  int compSize = 10;
  int uncSize = 20;
  PageHeader pageHeader = new PageHeader(type, uncSize, compSize);
  writePageHeader(pageHeader, out);
  PageHeader readPageHeader = readPageHeader(new ByteArrayInputStream(out.toByteArray()));
  assertEquals(pageHeader, readPageHeader);
}
 
Example #18
Source File: PageReader.java    From dremio-oss with Apache License 2.0
public void readPage(PageHeader pageHeader, int compressedSize, int uncompressedSize, ArrowBuf dest) throws IOException {
  Stopwatch timer = Stopwatch.createUnstarted();
  long timeToRead;
  long start = inputStream.getPos();
  if (parentColumnReader.columnChunkMetaData.getCodec() == CompressionCodecName.UNCOMPRESSED) {
    timer.start();
    dataReader.loadPage(dest, compressedSize);
    timeToRead = timer.elapsed(TimeUnit.MICROSECONDS);
    this.updateStats(pageHeader, "Page Read", start, timeToRead, compressedSize, uncompressedSize);
  } else {
    final ArrowBuf compressedData = allocateTemporaryBuffer(compressedSize);
    try {
      timer.start();
      dataReader.loadPage(compressedData, compressedSize);
      timeToRead = timer.elapsed(TimeUnit.MICROSECONDS);
      timer.reset();
      this.updateStats(pageHeader, "Page Read", start, timeToRead, compressedSize, compressedSize);
      start = inputStream.getPos();
      timer.start();
      codecFactory.getDecompressor(parentColumnReader.columnChunkMetaData
        .getCodec()).decompress(compressedData.nioBuffer(0, compressedSize), compressedSize,
        dest.nioBuffer(0, uncompressedSize), uncompressedSize);
      timeToRead = timer.elapsed(TimeUnit.MICROSECONDS);
      this.updateStats(pageHeader, "Decompress", start, timeToRead, compressedSize, uncompressedSize);
    } finally {
      compressedData.release();
    }
  }
}
 
Example #19
Source File: ColumnChunkIncReadStore.java    From dremio-oss with Apache License 2.0
private DictionaryPage readDictionaryPageHelper(PageHeader pageHeader) throws IOException {
  ByteBuffer data = uncompressPage(pageHeader, false);
  return new DictionaryPage(
      BytesInput.from(data, 0, pageHeader.uncompressed_page_size),
      pageHeader.getDictionary_page_header().getNum_values(),
      parquetMetadataConverter.getEncoding(pageHeader.dictionary_page_header.encoding)
  );
}
 
Example #20
Source File: ColumnChunkIncReadStore.java    From dremio-oss with Apache License 2.0
@Override
public DictionaryPage readDictionaryPage() {
  if (dictionaryPage == null) {
    PageHeader pageHeader = new PageHeader();
    long pos = 0;
    try {
      pos = in.getPos();
      pageHeader = Util.readPageHeader(in.asSeekableInputStream());
      if (pageHeader.getDictionary_page_header() == null) {
        in.seek(pos);
        return null;
      }
      dictionaryPage = readDictionaryPageHelper(pageHeader);
    } catch (Exception e) {
      throw new RuntimeException("Error reading dictionary page." +
        "\nFile path: " + path.toURI().getPath() +
        "\nRow count: " + rowCount +
        "\nColumn Chunk Metadata: " + metaData +
        "\nPage Header: " + pageHeader +
        "\nFile offset: " + fileOffset +
        "\nSize: " + size +
        "\nValue read so far: " + valueReadSoFar +
        "\nPosition: " + pos, e);
    }
  }
  return dictionaryPage;
}
 
Example #21
Source File: ParquetColumnChunk.java    From presto with Apache License 2.0
private DictionaryPage readDictionaryPage(PageHeader pageHeader, int uncompressedPageSize, int compressedPageSize)
{
    DictionaryPageHeader dicHeader = pageHeader.getDictionary_page_header();
    return new DictionaryPage(
            getSlice(compressedPageSize),
            uncompressedPageSize,
            dicHeader.getNum_values(),
            getParquetEncoding(Encoding.valueOf(dicHeader.getEncoding().name())));
}
 
Example #22
Source File: ParquetColumnChunk.java    From presto with Apache License 2.0
public PageReader readAllPages()
        throws IOException
{
    List<DataPage> pages = new ArrayList<>();
    DictionaryPage dictionaryPage = null;
    long valueCount = 0;
    while (valueCount < descriptor.getColumnChunkMetaData().getValueCount()) {
        PageHeader pageHeader = readPageHeader();
        int uncompressedPageSize = pageHeader.getUncompressed_page_size();
        int compressedPageSize = pageHeader.getCompressed_page_size();
        switch (pageHeader.type) {
            case DICTIONARY_PAGE:
                if (dictionaryPage != null) {
                    throw new ParquetCorruptionException("%s has more than one dictionary page in column chunk", descriptor.getColumnDescriptor());
                }
                dictionaryPage = readDictionaryPage(pageHeader, uncompressedPageSize, compressedPageSize);
                break;
            case DATA_PAGE:
                valueCount += readDataPageV1(pageHeader, uncompressedPageSize, compressedPageSize, pages);
                break;
            case DATA_PAGE_V2:
                valueCount += readDataPageV2(pageHeader, uncompressedPageSize, compressedPageSize, pages);
                break;
            default:
                input.skip(compressedPageSize);
                break;
        }
    }
    return new PageReader(descriptor.getColumnChunkMetaData().getCodec(), pages, dictionaryPage);
}
 
Example #23
Source File: PageReader.java    From Bats with Apache License 2.0
protected void updateStats(PageHeader pageHeader, String op, long start, long time, long bytesin, long bytesout) {
  String pageType = "Data Page";
  if (pageHeader.type == PageType.DICTIONARY_PAGE) {
    pageType = "Dictionary Page";
  }
  logger.trace("ParquetTrace,{},{},{},{},{},{},{},{}", op, pageType,
      this.parentColumnReader.parentReader.hadoopPath,
      this.parentColumnReader.columnDescriptor.toString(), start, bytesin, bytesout, time);

  if (pageHeader.type != PageType.DICTIONARY_PAGE) {
    if (bytesin == bytesout) {
      this.stats.timeDataPageLoads.addAndGet(time);
      this.stats.numDataPageLoads.incrementAndGet();
      this.stats.totalDataPageReadBytes.addAndGet(bytesin);
    } else {
      this.stats.timeDataPagesDecompressed.addAndGet(time);
      this.stats.numDataPagesDecompressed.incrementAndGet();
      this.stats.totalDataDecompressedBytes.addAndGet(bytesin);
    }
  } else {
    if (bytesin == bytesout) {
      this.stats.timeDictPageLoads.addAndGet(time);
      this.stats.numDictPageLoads.incrementAndGet();
      this.stats.totalDictPageReadBytes.addAndGet(bytesin);
    } else {
      this.stats.timeDictPagesDecompressed.addAndGet(time);
      this.stats.numDictPagesDecompressed.incrementAndGet();
      this.stats.totalDictDecompressedBytes.addAndGet(bytesin);
    }
  }
}
 
Example #24
Source File: ColumnChunkIncReadStore.java    From dremio-oss with Apache License 2.0
private ByteBuffer uncompressPage(PageHeader pageHeader, boolean isDataPage) throws IOException {
  final int compressedPageSize = pageHeader.compressed_page_size;
  final int uncompressedPageSize = pageHeader.uncompressed_page_size;
  final ByteBuf src = allocator.buffer(compressedPageSize).asNettyBuffer();
  ByteBuf dest = null;
  try {
    readFully(src, compressedPageSize);
    dest = allocator.buffer(uncompressedPageSize).asNettyBuffer();
    ByteBuffer destBuffer = dest.nioBuffer(0, uncompressedPageSize);

    switch (pageHeader.type) {
      /**
       * Page structure:
       * [RepetitionLevelBytes][DefinitionLevelBytes][DataBytes]
       * Only the data bytes are compressed.
       */
      case DATA_PAGE_V2:
        final int dataOffset = pageHeader.getData_page_header_v2().getRepetition_levels_byte_length() +
          pageHeader.getData_page_header_v2().getDefinition_levels_byte_length();
        final int compressedDataSize = compressedPageSize - dataOffset;
        // Copy the repetition levels and definition levels as-is.
        if (dataOffset > 0) {
          final ByteBuffer rlDlBuffer = src.nioBuffer(0, dataOffset);
          destBuffer.put(rlDlBuffer);
        }
        // decompress the data part
        if (compressedDataSize > 0) {
          final int uncompressedDataSize = uncompressedPageSize - dataOffset;
          final ByteBuffer srcDataBuf = src.nioBuffer(dataOffset, compressedDataSize);
          final ByteBuffer destDataBuf = dest.nioBuffer(dataOffset, uncompressedDataSize);
          // Important: add the starting position to the sizes so that
          // the decompressor sets its limits correctly.
          decompressor.decompress(srcDataBuf, compressedDataSize,
            destDataBuf, uncompressedDataSize);
        }
        break;
      default:
        ByteBuffer srcBuffer = src.nioBuffer(0, compressedPageSize);
        decompressor.decompress(srcBuffer, compressedPageSize, destBuffer, uncompressedPageSize);
    }
    if (isDataPage) {
      lastDataPageUncompressed = dest;
    } else {
      dictionaryPageUncompressed = dest;
    }
    return destBuffer;
  } catch (IOException e) {
    if (dest != null) {
      dest.release();
    }
    throw e;
  } finally {
    src.release(); // we don't need this anymore
  }
}
 
Example #25
Source File: ColumnDataReader.java    From dremio-oss with Apache License 2.0
public PageHeader readPageHeader() throws IOException{
  return Util.readPageHeader(input);
}
 
Example #26
Source File: ParquetFileReader.java    From parquet-mr with Apache License 2.0
protected PageHeader readPageHeader() throws IOException {
  return Util.readPageHeader(stream);
}
 
Example #27
Source File: CompressionConverter.java    From parquet-mr with Apache License 2.0
private void processChunk(TransParquetFileReader reader, ParquetFileWriter writer, ColumnChunkMetaData chunk,
                          String createdBy, CompressionCodecName codecName) throws IOException {
  CompressionCodecFactory codecFactory = HadoopCodecs.newFactory(0);
  CompressionCodecFactory.BytesInputDecompressor decompressor = codecFactory.getDecompressor(chunk.getCodec());
  CompressionCodecFactory.BytesInputCompressor compressor = codecFactory.getCompressor(codecName);
  ColumnIndex columnIndex = reader.readColumnIndex(chunk);
  OffsetIndex offsetIndex = reader.readOffsetIndex(chunk);

  reader.setStreamPosition(chunk.getStartingPos());
  DictionaryPage dictionaryPage = null;
  long readValues = 0;
  Statistics statistics = null;
  ParquetMetadataConverter converter = new ParquetMetadataConverter();
  int pageIndex = 0;
  long totalChunkValues = chunk.getValueCount();
  while (readValues < totalChunkValues) {
    PageHeader pageHeader = reader.readPageHeader();
    int compressedPageSize = pageHeader.getCompressed_page_size();
    byte[] pageLoad;
    switch (pageHeader.type) {
      case DICTIONARY_PAGE:
        if (dictionaryPage != null) {
          throw new IOException("has more than one dictionary page in column chunk");
        }
        DictionaryPageHeader dictPageHeader = pageHeader.dictionary_page_header;
        pageLoad = translatePageLoad(reader, true, compressor, decompressor, pageHeader.getCompressed_page_size(), pageHeader.getUncompressed_page_size());
        writer.writeDictionaryPage(new DictionaryPage(BytesInput.from(pageLoad),
                                                 pageHeader.getUncompressed_page_size(),
                                                 dictPageHeader.getNum_values(),
                                                 converter.getEncoding(dictPageHeader.getEncoding())));
        break;
      case DATA_PAGE:
        DataPageHeader headerV1 = pageHeader.data_page_header;
        pageLoad = translatePageLoad(reader, true, compressor, decompressor, pageHeader.getCompressed_page_size(), pageHeader.getUncompressed_page_size());
        statistics = convertStatistics(createdBy, chunk.getPrimitiveType(), headerV1.getStatistics(), columnIndex, pageIndex, converter);
        readValues += headerV1.getNum_values();
        if (offsetIndex != null) {
          long rowCount = 1 + offsetIndex.getLastRowIndex(pageIndex, totalChunkValues) - offsetIndex.getFirstRowIndex(pageIndex);
          writer.writeDataPage(toIntWithCheck(headerV1.getNum_values()),
            pageHeader.getUncompressed_page_size(),
            BytesInput.from(pageLoad),
            statistics,
            toIntWithCheck(rowCount),
            converter.getEncoding(headerV1.getRepetition_level_encoding()),
            converter.getEncoding(headerV1.getDefinition_level_encoding()),
            converter.getEncoding(headerV1.getEncoding()));
        } else {
          writer.writeDataPage(toIntWithCheck(headerV1.getNum_values()),
            pageHeader.getUncompressed_page_size(),
            BytesInput.from(pageLoad),
            statistics,
            converter.getEncoding(headerV1.getRepetition_level_encoding()),
            converter.getEncoding(headerV1.getDefinition_level_encoding()),
            converter.getEncoding(headerV1.getEncoding()));
        }
        pageIndex++;
        break;
      case DATA_PAGE_V2:
        DataPageHeaderV2 headerV2 = pageHeader.data_page_header_v2;
        int rlLength = headerV2.getRepetition_levels_byte_length();
        BytesInput rlLevels = readBlockAllocate(rlLength, reader);
        int dlLength = headerV2.getDefinition_levels_byte_length();
        BytesInput dlLevels = readBlockAllocate(dlLength, reader);
        int payLoadLength = pageHeader.getCompressed_page_size() - rlLength - dlLength;
        int rawDataLength = pageHeader.getUncompressed_page_size() - rlLength - dlLength;
        pageLoad = translatePageLoad(reader, headerV2.is_compressed, compressor, decompressor, payLoadLength, rawDataLength);
        statistics = convertStatistics(createdBy, chunk.getPrimitiveType(), headerV2.getStatistics(), columnIndex, pageIndex, converter);
        readValues += headerV2.getNum_values();
        writer.writeDataPageV2(headerV2.getNum_rows(),
          headerV2.getNum_nulls(),
          headerV2.getNum_values(),
          rlLevels,
          dlLevels,
          converter.getEncoding(headerV2.getEncoding()),
          BytesInput.from(pageLoad),
          rawDataLength,
          statistics);
        pageIndex++;
        break;
      default:
        LOG.debug("skipping page of type {} of size {}", pageHeader.getType(), compressedPageSize);
        break;
    }
  }
}
 
Example #28
Source File: CompressionConverter.java    From parquet-mr with Apache License 2.0
public PageHeader readPageHeader() throws IOException {
  return Util.readPageHeader(f);
}
 
Example #29
Source File: ParquetColumnChunk.java    From presto with Apache License 2.0
protected PageHeader readPageHeader()
        throws IOException
{
    return Util.readPageHeader(input);
}
 
Example #30
Source File: ColumnDataReader.java    From Bats with Apache License 2.0
public PageHeader readPageHeader() throws IOException{
  return Util.readPageHeader(input);
}