org.apache.parquet.column.ValuesType Java Examples

Source File: BasePageIterator.java From iceberg with Apache License 2.0

6 votes

/**
 * Prepares this iterator to read the given V1 data page.
 *
 * <p>The page bytes are consumed as a single stream in a fixed order: repetition levels first,
 * then definition levels, then the data values.
 *
 * @param initPage the data page to initialize from
 * @throws ParquetDecodingException if an {@code IOException} is raised while reading the page
 */
protected void initFromPage(DataPageV1 initPage) {
  this.triplesCount = initPage.getValueCount();

  ValuesReader repetitionReader =
      initPage.getRlEncoding().getValuesReader(desc, ValuesType.REPETITION_LEVEL);
  this.repetitionLevels = new ValuesReaderIntIterator(repetitionReader);

  try {
    BytesInput pageBytes = initPage.getBytes();
    LOG.debug("page size {} bytes and {} records", pageBytes.size(), triplesCount);

    // Repetition levels sit at the very start of the page.
    LOG.debug("reading repetition levels at 0");
    ByteBufferInputStream pageStream = pageBytes.toInputStream();
    repetitionReader.initFromPage(triplesCount, pageStream);

    // Definition levels are read from wherever the repetition-level reader left the stream.
    LOG.debug("reading definition levels at {}", pageStream.position());
    initDefinitionLevelsReader(initPage, desc, pageStream, triplesCount);

    // The data values are read from the stream position left by the definition-level reader.
    LOG.debug("reading data at {}", pageStream.position());
    initDataReader(initPage.getValueEncoding(), pageStream, initPage.getValueCount());
  } catch (IOException ioe) {
    throw new ParquetDecodingException("could not read page " + initPage + " in col " + desc, ioe);
  }
}

Source File: PageReader.java From Bats with Apache License 2.0

5 votes

/**
 * Re-creates the level readers over the current page data so that the definition level reader
 * starts again from the beginning of its section, then optionally skips a number of entries.
 *
 * <p>Requires that the column's maximum definition level is exactly 1 and that the current page
 * holds at least one value; both conditions are enforced with state checks.
 *
 * @param skipCount the number of definition level entries to skip after the reset; values of
 *                  zero or less skip nothing
 * @throws IOException An IO related condition
 */
void resetDefinitionLevelReader(int skipCount) throws IOException {
  Preconditions.checkState(parentColumnReader.columnDescriptor.getMaxDefinitionLevel() == 1);
  Preconditions.checkState(currentPageCount > 0);

  final Encoding repetitionEncoding =
      METADATA_CONVERTER.getEncoding(pageHeader.data_page_header.repetition_level_encoding);
  final Encoding definitionEncoding =
      METADATA_CONVERTER.getEncoding(pageHeader.data_page_header.definition_level_encoding);

  // Fresh stream over the whole page buffer, starting at offset 0.
  final ByteBufferInputStream pageStream =
      ByteBufferInputStream.wrap(pageData.nioBuffer(0, pageData.capacity()));

  // Repetition levels (when present) precede the definition levels, so they must be
  // re-initialized first to advance the stream to the definition level section.
  if (parentColumnReader.getColumnDescriptor().getMaxRepetitionLevel() > 0) {
    repetitionLevels =
        repetitionEncoding.getValuesReader(parentColumnReader.columnDescriptor, ValuesType.REPETITION_LEVEL);
    repetitionLevels.initFromPage(currentPageCount, pageStream);
    repetitionLevels.readInteger();
  }

  definitionLevels =
      definitionEncoding.getValuesReader(parentColumnReader.columnDescriptor, ValuesType.DEFINITION_LEVEL);
  parentColumnReader.currDefLevel = -1;

  // Re-initialize the underlying decoder from the current stream position.
  definitionLevels.initFromPage(currentPageCount, pageStream);

  // Skip as many entries as the caller asked for.
  int remaining = skipCount;
  while (remaining > 0) {
    definitionLevels.skip();
    remaining--;
  }
}

Source File: PageIterator.java From iceberg with Apache License 2.0

5 votes

/**
 * Creates and initializes the values reader for a page's data section.
 *
 * <p>Dictionary encodings get a dictionary-backed reader and require that a dictionary has
 * already been set; all other encodings get a plain values reader.
 *
 * @param dataEncoding the encoding of the page's values
 * @param in           the page stream the values reader is initialized from
 * @param valueCount   the number of values in the page
 * @throws ParquetDecodingException if a dictionary encoding is used without a dictionary, or if
 *                                  the values reader fails to initialize with an IOException
 */
@Override
protected void initDataReader(Encoding dataEncoding, ByteBufferInputStream in, int valueCount) {
  // Remember the reader of the previous page before it is replaced below.
  ValuesReader priorReader = values;
  this.valueEncoding = dataEncoding;

  // TODO: May want to change this so that this class is not dictionary-aware.
  // For dictionary columns, this class could rely on wrappers to correctly handle dictionaries
  // This isn't currently possible because RLE must be read by getDictionaryBasedValuesReader
  if (!dataEncoding.usesDictionary()) {
    this.values = dataEncoding.getValuesReader(desc, ValuesType.VALUES);
  } else if (dictionary != null) {
    this.values = dataEncoding.getDictionaryBasedValuesReader(desc, ValuesType.VALUES, dictionary);
  } else {
    throw new ParquetDecodingException(
        "could not read page in col " + desc + " as the dictionary was missing for encoding " + dataEncoding);
  }

  try {
    values.initFromPage(valueCount, in);
  } catch (IOException ioe) {
    throw new ParquetDecodingException("could not read page in col " + desc, ioe);
  }

  // previous reader can only be set if reading sequentially
  boolean sequentialReadsRequired =
      CorruptDeltaByteArrays.requiresSequentialReads(writerVersion, dataEncoding);
  if (sequentialReadsRequired && priorReader instanceof RequiresPreviousReader) {
    RequiresPreviousReader chained = (RequiresPreviousReader) values;
    chained.setPreviousReader(priorReader);
  }
}

Source File: PageIterator.java From iceberg with Apache License 2.0

5 votes

/**
 * Creates the definition-level reader for a V1 data page, exposes it through
 * {@code definitionLevels}, and initializes it from the supplied page stream.
 *
 * @param dataPageV1   the page whose definition-level encoding selects the reader
 * @param desc         the descriptor of the column being read
 * @param in           the page stream to initialize the reader from
 * @param triplesCount the number of values in the page
 * @throws IOException if the reader cannot be initialized from the stream
 */
@Override
protected void initDefinitionLevelsReader(
    DataPageV1 dataPageV1,
    ColumnDescriptor desc,
    ByteBufferInputStream in,
    int triplesCount) throws IOException {
  ValuesReader definitionReader =
      dataPageV1.getDlEncoding().getValuesReader(desc, ValuesType.DEFINITION_LEVEL);
  this.definitionLevels = new ValuesReaderIntIterator(definitionReader);
  definitionReader.initFromPage(triplesCount, in);
}

Source File: PageReader.java From Bats with Apache License 2.0

4 votes

/**
 * Grab the next page.
 *
 * <p>Resets the per-page counters, calls {@code nextInternal()} (presumably the step that populates
 * {@code pageData} and {@code pageHeader} - confirm against its definition), and then initializes the
 * repetition level, definition level and value/dictionary readers over the page data. Decode timing
 * and the decoded-page counter in {@code stats} are updated before returning {@code true}.
 *
 * @return {@code true} if a page was available and its readers were initialized; {@code false} if the
 *         column chunk's value count has already been reached, or if {@code pageData} or
 *         {@code pageHeader} is {@code null} after {@code nextInternal()}
 * @throws IOException declared by this method; presumably raised while loading the page or initializing
 *         a reader from it
 */
public boolean next() throws IOException {
  // The stopwatch is only started after the page has been loaded, so it measures reader setup only.
  Stopwatch timer = Stopwatch.createUnstarted();
  // Reset per-page state; currentPageCount stays at -1 if this method returns false.
  currentPageCount = -1;
  valuesRead = 0;
  valuesReadyToRead = 0;

  // TODO - the metadata for total size appears to be incorrect for impala generated files, need to find cause
  // and submit a bug report
  long totalValueCount = parentColumnReader.columnChunkMetaData.getValueCount();
  if(parentColumnReader.totalValuesRead >= totalValueCount) {
    // Every value recorded in the column chunk metadata has been read already.
    return false;
  }
  clearBuffers();

  nextInternal();
  if(pageData == null || pageHeader == null){
    //TODO: Is this an error condition or a normal condition??
    return false;
  }

  timer.start();
  // NOTE(review): data_page_header is dereferenced without a null check - assumes a V1 data page; confirm.
  currentPageCount = pageHeader.data_page_header.num_values;

  final Encoding rlEncoding = METADATA_CONVERTER.getEncoding(pageHeader.data_page_header.repetition_level_encoding);
  final Encoding dlEncoding = METADATA_CONVERTER.getEncoding(pageHeader.data_page_header.definition_level_encoding);
  final Encoding valueEncoding = METADATA_CONVERTER.getEncoding(pageHeader.data_page_header.encoding);

  byteLength = pageHeader.uncompressed_page_size;

  // Single stream over the whole page buffer; the level readers below consume it in sequence.
  final ByteBufferInputStream in = ByteBufferInputStream.wrap(pageData.nioBuffer(0, pageData.capacity()));

  readPosInBytes = 0;
  if (parentColumnReader.getColumnDescriptor().getMaxRepetitionLevel() > 0) {
    repetitionLevels = rlEncoding.getValuesReader(parentColumnReader.columnDescriptor, ValuesType.REPETITION_LEVEL);
    repetitionLevels.initFromPage(currentPageCount, in);
    // we know that the first value will be a 0, at the end of each list of repeated values we will hit another 0 indicating
    // a new record, although we don't know the length until we hit it (and this is a one way stream of integers) so we
    // read the first zero here to simplify the reading processes, and start reading the first value the same as all
    // of the rest. Effectively we are 'reading' the non-existent value in front of the first allowing direct access to
    // the first list of repetition levels
    readPosInBytes = in.position();
    repetitionLevels.readInteger();
  }
  if (parentColumnReader.columnDescriptor.getMaxDefinitionLevel() != 0) {
    parentColumnReader.currDefLevel = -1;
    definitionLevels = dlEncoding.getValuesReader(parentColumnReader.columnDescriptor, ValuesType.DEFINITION_LEVEL);
    definitionLevels.initFromPage(currentPageCount, in);
    // Record the stream position after the definition level reader has been initialized.
    readPosInBytes = in.position();
    if (!valueEncoding.usesDictionary()) {
      valueReader = valueEncoding.getValuesReader(parentColumnReader.columnDescriptor, ValuesType.VALUES);
      valueReader.initFromPage(currentPageCount, in);
    }
  }
  // BOOLEAN columns get a value reader here only when none has been set yet (valueReader == null).
  if (valueReader == null && parentColumnReader.columnDescriptor.getType() == PrimitiveType.PrimitiveTypeName.BOOLEAN) {
    valueReader = valueEncoding.getValuesReader(parentColumnReader.columnDescriptor, ValuesType.VALUES);
    valueReader.initFromPage(currentPageCount, in);
  }
  if (valueEncoding.usesDictionary()) {
    // initialize two of the dictionary readers, one is for determining the lengths of each value, the second is for
    // actually copying the values out into the vectors
    Preconditions.checkState(readPosInBytes < pageData.capacity());
    int index = (int)readPosInBytes;
    // Both dictionary readers are initialized from streams wrapping this same buffer, which starts at readPosInBytes.
    ByteBuffer byteBuffer = pageData.nioBuffer(index, pageData.capacity() - index);
    dictionaryLengthDeterminingReader = new DictionaryValuesReader(dictionary);
    dictionaryLengthDeterminingReader.initFromPage(currentPageCount, ByteBufferInputStream.wrap(byteBuffer));
    dictionaryValueReader = new DictionaryValuesReader(dictionary);
    dictionaryValueReader.initFromPage(currentPageCount, ByteBufferInputStream.wrap(byteBuffer));
    parentColumnReader.usingDictionary = true;
  } else {
    parentColumnReader.usingDictionary = false;
  }
  // readPosInBytes is used for actually reading the values after we determine how many will fit in the vector
  // readyToReadPosInBytes serves a similar purpose for the vector types where we must count up the values that will
  // fit one record at a time, such as for variable length data. Both operations must start in the same location after the
  // definition and repetition level data which is stored alongside the page data itself
  readyToReadPosInBytes = readPosInBytes;
  // Elapsed time since timer.start() above, i.e. the reader setup for this page, in nanoseconds.
  long timeDecode = timer.elapsed(TimeUnit.NANOSECONDS);
  stats.numDataPagesDecoded.incrementAndGet();
  stats.timeDataPageDecode.addAndGet(timeDecode);
  return true;
}

Source File: PageReader.java From dremio-oss with Apache License 2.0

4 votes

/**
 * Grab the next page.
 *
 * <p>Resets the per-page counters, reads page headers until a non-dictionary page is found (dictionary
 * pages are handed to {@code readDictionaryPage}), reads that page into {@code pageData}, and then
 * initializes the repetition level, definition level and value/dictionary readers over the page buffer.
 * The byte offset of each section is threaded through {@code readPosInBytes}.
 *
 * @return {@code true} if a page was read and its readers were initialized; {@code false} if
 *         {@code dataReader} has no remainder or the number of values read equals the column chunk's
 *         value count
 * @throws java.io.IOException declared by this method; presumably raised while reading the page header
 *         or page data, or while initializing a reader
 */
public boolean next() throws IOException {
  Stopwatch timer = Stopwatch.createUnstarted();
  // Reset per-page state; currentPageCount stays at -1 if this method returns false.
  currentPageCount = -1;
  valuesRead = 0;
  valuesReadyToRead = 0;

  // TODO - the metadata for total size appears to be incorrect for impala generated files, need to find cause
  // and submit a bug report
  if(!dataReader.hasRemainder() || parentColumnReader.totalValuesRead == parentColumnReader.columnChunkMetaData.getValueCount()) {
    return false;
  }
  clearBuffers();

  // next, we need to decompress the bytes
  // TODO - figure out if we need multiple dictionary pages, I believe it may be limited to one
  // I think we are clobbering parts of the dictionary if there can be multiple pages of dictionary
  // Keep reading page headers until the header is not a dictionary page.
  do {
    long start=inputStream.getPos();
    // The timer covers only the header read; it is reset at the end of each iteration.
    timer.start();
    pageHeader = dataReader.readPageHeader();
    long timeToRead = timer.elapsed(TimeUnit.MICROSECONDS);
    this.updateStats(pageHeader, "Page Header Read", start, timeToRead, 0,0);
    logger.trace("ParquetTrace,{},{},{},{},{},{},{},{}","Page Header Read","",
        this.parentColumnReader.parentReader.fsPath,
        this.parentColumnReader.columnDescriptor.toString(), start, 0, 0, timeToRead);
    timer.reset();
    if (pageHeader.getType() == PageType.DICTIONARY_PAGE) {
      readDictionaryPage(pageHeader, parentColumnReader);
    }
  } while (pageHeader.getType() == PageType.DICTIONARY_PAGE);

  //TODO: Handle buffer allocation exception

  allocatePageData(pageHeader.getUncompressed_page_size());
  int compressedSize = pageHeader.getCompressed_page_size();
  int uncompressedSize = pageHeader.getUncompressed_page_size();
  readPage(pageHeader, compressedSize, uncompressedSize, pageData);

  // NOTE(review): data_page_header is dereferenced without a null check - assumes a V1 data page; confirm.
  currentPageCount = pageHeader.data_page_header.num_values;

  final Encoding rlEncoding = METADATA_CONVERTER.getEncoding(pageHeader.data_page_header.repetition_level_encoding);

  final Encoding dlEncoding = METADATA_CONVERTER.getEncoding(pageHeader.data_page_header.definition_level_encoding);
  final Encoding valueEncoding = METADATA_CONVERTER.getEncoding(pageHeader.data_page_header.encoding);

  byteLength = pageHeader.uncompressed_page_size;

  // One buffer over the whole page; each reader below is initialized on it at an explicit offset.
  final ByteBuffer pageDataBuffer = pageData.nioBuffer(0, LargeMemoryUtil.checkedCastToInt(pageData.capacity()));

  readPosInBytes = 0;
  if (parentColumnReader.getColumnDescriptor().getMaxRepetitionLevel() > 0) {
    repetitionLevels = rlEncoding.getValuesReader(parentColumnReader.columnDescriptor, ValuesType.REPETITION_LEVEL);
    repetitionLevels.initFromPage(currentPageCount, pageDataBuffer, (int) readPosInBytes);
    // we know that the first value will be a 0, at the end of each list of repeated values we will hit another 0 indicating
    // a new record, although we don't know the length until we hit it (and this is a one way stream of integers) so we
    // read the first zero here to simplify the reading processes, and start reading the first value the same as all
    // of the rest. Effectively we are 'reading' the non-existent value in front of the first allowing direct access to
    // the first list of repetition levels
    readPosInBytes = repetitionLevels.getNextOffset();
    repetitionLevels.readInteger();
  }
  if (parentColumnReader.columnDescriptor.getMaxDefinitionLevel() != 0){
    parentColumnReader.currDefLevel = -1;
    definitionLevels = dlEncoding.getValuesReader(parentColumnReader.columnDescriptor, ValuesType.DEFINITION_LEVEL);
    definitionLevels.initFromPage(currentPageCount, pageDataBuffer, (int) readPosInBytes);
    // Advance the offset to the one reported by the definition level reader.
    readPosInBytes = definitionLevels.getNextOffset();
    if (!valueEncoding.usesDictionary()) {
      valueReader = valueEncoding.getValuesReader(parentColumnReader.columnDescriptor, ValuesType.VALUES);
      valueReader.initFromPage(currentPageCount, pageDataBuffer, (int) readPosInBytes);
    }
  }
  // For BOOLEAN columns the value reader is (re)created unconditionally here, even if it was already
  // initialized in the definition level branch above.
  if (parentColumnReader.columnDescriptor.getType() == PrimitiveType.PrimitiveTypeName.BOOLEAN) {
    valueReader = valueEncoding.getValuesReader(parentColumnReader.columnDescriptor, ValuesType.VALUES);
    valueReader.initFromPage(currentPageCount, pageDataBuffer, (int) readPosInBytes);
  }
  if (valueEncoding.usesDictionary()) {
    // initialize two of the dictionary readers, one is for determining the lengths of each value, the second is for
    // actually copying the values out into the vectors
    dictionaryLengthDeterminingReader = new DictionaryValuesReader(dictionary);
    dictionaryLengthDeterminingReader.initFromPage(currentPageCount, pageDataBuffer, (int) readPosInBytes);
    dictionaryValueReader = new DictionaryValuesReader(dictionary);
    dictionaryValueReader.initFromPage(currentPageCount, pageDataBuffer, (int) readPosInBytes);
    parentColumnReader.usingDictionary = true;
  } else {
    parentColumnReader.usingDictionary = false;
  }
  // readPosInBytes is used for actually reading the values after we determine how many will fit in the vector
  // readyToReadPosInBytes serves a similar purpose for the vector types where we must count up the values that will
  // fit one record at a time, such as for variable length data. Both operations must start in the same location after the
  // definition and repetition level data which is stored alongside the page data itself
  readyToReadPosInBytes = readPosInBytes;
  return true;
}

org.apache.parquet.column.ValuesType Java Examples