org.apache.parquet.column.ValuesType Java Examples
The following examples show how to use
org.apache.parquet.column.ValuesType.
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: BasePageIterator.java From iceberg with Apache License 2.0 | 6 votes |
/**
 * Prepares this iterator to read a V1 data page.
 *
 * <p>Records the page's value count, then initializes the repetition level reader, the
 * definition level reader and the data reader, in that order, from a single stream over the
 * page bytes.
 *
 * @param initPage the V1 data page to read from
 * @throws ParquetDecodingException if reading the page bytes fails with an {@link IOException}
 */
protected void initFromPage(DataPageV1 initPage) {
  this.triplesCount = initPage.getValueCount();

  ValuesReader repetitionReader =
      initPage.getRlEncoding().getValuesReader(desc, ValuesType.REPETITION_LEVEL);
  this.repetitionLevels = new ValuesReaderIntIterator(repetitionReader);

  try {
    BytesInput pageBytes = initPage.getBytes();
    LOG.debug("page size {} bytes and {} records", pageBytes.size(), triplesCount);

    LOG.debug("reading repetition levels at 0");
    ByteBufferInputStream stream = pageBytes.toInputStream();
    repetitionReader.initFromPage(triplesCount, stream);

    // The stream position advances as each section is consumed, so the log lines below
    // report where the next section starts.
    LOG.debug("reading definition levels at {}", stream.position());
    initDefinitionLevelsReader(initPage, desc, stream, triplesCount);

    LOG.debug("reading data at {}", stream.position());
    initDataReader(initPage.getValueEncoding(), stream, initPage.getValueCount());
  } catch (IOException e) {
    throw new ParquetDecodingException("could not read page " + initPage + " in col " + desc, e);
  }
}
Example #2
Source File: PageReader.java From Bats with Apache License 2.0 | 5 votes |
/** * Enables Parquet column readers to reset the definition level reader to a specific state. * @param skipCount the number of rows to skip (optional) * * @throws IOException An IO related condition */ void resetDefinitionLevelReader(int skipCount) throws IOException { Preconditions.checkState(parentColumnReader.columnDescriptor.getMaxDefinitionLevel() == 1); Preconditions.checkState(currentPageCount > 0); final Encoding rlEncoding = METADATA_CONVERTER.getEncoding(pageHeader.data_page_header.repetition_level_encoding); final Encoding dlEncoding = METADATA_CONVERTER.getEncoding(pageHeader.data_page_header.definition_level_encoding); final ByteBufferInputStream in = ByteBufferInputStream.wrap(pageData.nioBuffer(0, pageData.capacity())); if (parentColumnReader.getColumnDescriptor().getMaxRepetitionLevel() > 0) { repetitionLevels = rlEncoding.getValuesReader(parentColumnReader.columnDescriptor, ValuesType.REPETITION_LEVEL); repetitionLevels.initFromPage(currentPageCount, in); repetitionLevels.readInteger(); } definitionLevels = dlEncoding.getValuesReader(parentColumnReader.columnDescriptor, ValuesType.DEFINITION_LEVEL); parentColumnReader.currDefLevel = -1; // Now reinitialize the underlying decoder definitionLevels.initFromPage(currentPageCount, in); // Skip values if requested by caller for (int idx = 0; idx < skipCount; ++idx) { definitionLevels.skip(); } }
Example #3
Source File: PageIterator.java From iceberg with Apache License 2.0 | 5 votes |
@Override protected void initDataReader(Encoding dataEncoding, ByteBufferInputStream in, int valueCount) { ValuesReader previousReader = values; this.valueEncoding = dataEncoding; // TODO: May want to change this so that this class is not dictionary-aware. // For dictionary columns, this class could rely on wrappers to correctly handle dictionaries // This isn't currently possible because RLE must be read by getDictionaryBasedValuesReader if (dataEncoding.usesDictionary()) { if (dictionary == null) { throw new ParquetDecodingException( "could not read page in col " + desc + " as the dictionary was missing for encoding " + dataEncoding); } this.values = dataEncoding.getDictionaryBasedValuesReader(desc, ValuesType.VALUES, dictionary); } else { this.values = dataEncoding.getValuesReader(desc, ValuesType.VALUES); } // if (dataEncoding.usesDictionary() && converter.hasDictionarySupport()) { // bindToDictionary(dictionary); // } else { // bind(path.getType()); // } try { values.initFromPage(valueCount, in); } catch (IOException e) { throw new ParquetDecodingException("could not read page in col " + desc, e); } if (CorruptDeltaByteArrays.requiresSequentialReads(writerVersion, dataEncoding) && previousReader instanceof RequiresPreviousReader) { // previous reader can only be set if reading sequentially ((RequiresPreviousReader) values).setPreviousReader(previousReader); } }
Example #4
Source File: PageIterator.java From iceberg with Apache License 2.0 | 5 votes |
/**
 * Builds the definition level reader for a V1 data page and initializes it from the stream.
 *
 * @param dataPageV1 the page whose definition level encoding selects the reader
 * @param desc the descriptor of the column being read
 * @param in the stream to read the definition levels from
 * @param triplesCount the number of values in the page
 * @throws IOException if initializing the reader from the stream fails
 */
@Override
protected void initDefinitionLevelsReader(
    DataPageV1 dataPageV1,
    ColumnDescriptor desc,
    ByteBufferInputStream in,
    int triplesCount) throws IOException {
  ValuesReader definitionReader =
      dataPageV1.getDlEncoding().getValuesReader(desc, ValuesType.DEFINITION_LEVEL);

  // Expose the reader through the int-iterator wrapper before it is bound to the stream.
  this.definitionLevels = new ValuesReaderIntIterator(definitionReader);

  definitionReader.initFromPage(triplesCount, in);
}
Example #5
Source File: PageReader.java From Bats with Apache License 2.0 | 4 votes |
/** * Grab the next page. * * @return - if another page was present * @throws IOException */ public boolean next() throws IOException { Stopwatch timer = Stopwatch.createUnstarted(); currentPageCount = -1; valuesRead = 0; valuesReadyToRead = 0; // TODO - the metatdata for total size appears to be incorrect for impala generated files, need to find cause // and submit a bug report long totalValueCount = parentColumnReader.columnChunkMetaData.getValueCount(); if(parentColumnReader.totalValuesRead >= totalValueCount) { return false; } clearBuffers(); nextInternal(); if(pageData == null || pageHeader == null){ //TODO: Is this an error condition or a normal condition?? return false; } timer.start(); currentPageCount = pageHeader.data_page_header.num_values; final Encoding rlEncoding = METADATA_CONVERTER.getEncoding(pageHeader.data_page_header.repetition_level_encoding); final Encoding dlEncoding = METADATA_CONVERTER.getEncoding(pageHeader.data_page_header.definition_level_encoding); final Encoding valueEncoding = METADATA_CONVERTER.getEncoding(pageHeader.data_page_header.encoding); byteLength = pageHeader.uncompressed_page_size; final ByteBufferInputStream in = ByteBufferInputStream.wrap(pageData.nioBuffer(0, pageData.capacity())); readPosInBytes = 0; if (parentColumnReader.getColumnDescriptor().getMaxRepetitionLevel() > 0) { repetitionLevels = rlEncoding.getValuesReader(parentColumnReader.columnDescriptor, ValuesType.REPETITION_LEVEL); repetitionLevels.initFromPage(currentPageCount, in); // we know that the first value will be a 0, at the end of each list of repeated values we will hit another 0 indicating // a new record, although we don't know the length until we hit it (and this is a one way stream of integers) so we // read the first zero here to simplify the reading processes, and start reading the first value the same as all // of the rest. 
Effectively we are 'reading' the non-existent value in front of the first allowing direct access to // the first list of repetition levels readPosInBytes = in.position(); repetitionLevels.readInteger(); } if (parentColumnReader.columnDescriptor.getMaxDefinitionLevel() != 0) { parentColumnReader.currDefLevel = -1; definitionLevels = dlEncoding.getValuesReader(parentColumnReader.columnDescriptor, ValuesType.DEFINITION_LEVEL); definitionLevels.initFromPage(currentPageCount, in); readPosInBytes = in.position(); if (!valueEncoding.usesDictionary()) { valueReader = valueEncoding.getValuesReader(parentColumnReader.columnDescriptor, ValuesType.VALUES); valueReader.initFromPage(currentPageCount, in); } } if (valueReader == null && parentColumnReader.columnDescriptor.getType() == PrimitiveType.PrimitiveTypeName.BOOLEAN) { valueReader = valueEncoding.getValuesReader(parentColumnReader.columnDescriptor, ValuesType.VALUES); valueReader.initFromPage(currentPageCount, in); } if (valueEncoding.usesDictionary()) { // initialize two of the dictionary readers, one is for determining the lengths of each value, the second is for // actually copying the values out into the vectors Preconditions.checkState(readPosInBytes < pageData.capacity()); int index = (int)readPosInBytes; ByteBuffer byteBuffer = pageData.nioBuffer(index, pageData.capacity() - index); dictionaryLengthDeterminingReader = new DictionaryValuesReader(dictionary); dictionaryLengthDeterminingReader.initFromPage(currentPageCount, ByteBufferInputStream.wrap(byteBuffer)); dictionaryValueReader = new DictionaryValuesReader(dictionary); dictionaryValueReader.initFromPage(currentPageCount, ByteBufferInputStream.wrap(byteBuffer)); parentColumnReader.usingDictionary = true; } else { parentColumnReader.usingDictionary = false; } // readPosInBytes is used for actually reading the values after we determine how many will fit in the vector // readyToReadPosInBytes serves a similar purpose for the vector types where we must count up the 
values that will // fit one record at a time, such as for variable length data. Both operations must start in the same location after the // definition and repetition level data which is stored alongside the page data itself readyToReadPosInBytes = readPosInBytes; long timeDecode = timer.elapsed(TimeUnit.NANOSECONDS); stats.numDataPagesDecoded.incrementAndGet(); stats.timeDataPageDecode.addAndGet(timeDecode); return true; }
Example #6
Source File: PageReader.java From dremio-oss with Apache License 2.0 | 4 votes |
/** * Grab the next page. * * @return - if another page was present * @throws java.io.IOException */ public boolean next() throws IOException { Stopwatch timer = Stopwatch.createUnstarted(); currentPageCount = -1; valuesRead = 0; valuesReadyToRead = 0; // TODO - the metatdata for total size appears to be incorrect for impala generated files, need to find cause // and submit a bug report if(!dataReader.hasRemainder() || parentColumnReader.totalValuesRead == parentColumnReader.columnChunkMetaData.getValueCount()) { return false; } clearBuffers(); // next, we need to decompress the bytes // TODO - figure out if we need multiple dictionary pages, I believe it may be limited to one // I think we are clobbering parts of the dictionary if there can be multiple pages of dictionary do { long start=inputStream.getPos(); timer.start(); pageHeader = dataReader.readPageHeader(); long timeToRead = timer.elapsed(TimeUnit.MICROSECONDS); this.updateStats(pageHeader, "Page Header Read", start, timeToRead, 0,0); logger.trace("ParquetTrace,{},{},{},{},{},{},{},{}","Page Header Read","", this.parentColumnReader.parentReader.fsPath, this.parentColumnReader.columnDescriptor.toString(), start, 0, 0, timeToRead); timer.reset(); if (pageHeader.getType() == PageType.DICTIONARY_PAGE) { readDictionaryPage(pageHeader, parentColumnReader); } } while (pageHeader.getType() == PageType.DICTIONARY_PAGE); //TODO: Handle buffer allocation exception allocatePageData(pageHeader.getUncompressed_page_size()); int compressedSize = pageHeader.getCompressed_page_size(); int uncompressedSize = pageHeader.getUncompressed_page_size(); readPage(pageHeader, compressedSize, uncompressedSize, pageData); currentPageCount = pageHeader.data_page_header.num_values; final Encoding rlEncoding = METADATA_CONVERTER.getEncoding(pageHeader.data_page_header.repetition_level_encoding); final Encoding dlEncoding = METADATA_CONVERTER.getEncoding(pageHeader.data_page_header.definition_level_encoding); final Encoding valueEncoding 
= METADATA_CONVERTER.getEncoding(pageHeader.data_page_header.encoding); byteLength = pageHeader.uncompressed_page_size; final ByteBuffer pageDataBuffer = pageData.nioBuffer(0, LargeMemoryUtil.checkedCastToInt(pageData.capacity())); readPosInBytes = 0; if (parentColumnReader.getColumnDescriptor().getMaxRepetitionLevel() > 0) { repetitionLevels = rlEncoding.getValuesReader(parentColumnReader.columnDescriptor, ValuesType.REPETITION_LEVEL); repetitionLevels.initFromPage(currentPageCount, pageDataBuffer, (int) readPosInBytes); // we know that the first value will be a 0, at the end of each list of repeated values we will hit another 0 indicating // a new record, although we don't know the length until we hit it (and this is a one way stream of integers) so we // read the first zero here to simplify the reading processes, and start reading the first value the same as all // of the rest. Effectively we are 'reading' the non-existent value in front of the first allowing direct access to // the first list of repetition levels readPosInBytes = repetitionLevels.getNextOffset(); repetitionLevels.readInteger(); } if (parentColumnReader.columnDescriptor.getMaxDefinitionLevel() != 0){ parentColumnReader.currDefLevel = -1; definitionLevels = dlEncoding.getValuesReader(parentColumnReader.columnDescriptor, ValuesType.DEFINITION_LEVEL); definitionLevels.initFromPage(currentPageCount, pageDataBuffer, (int) readPosInBytes); readPosInBytes = definitionLevels.getNextOffset(); if (!valueEncoding.usesDictionary()) { valueReader = valueEncoding.getValuesReader(parentColumnReader.columnDescriptor, ValuesType.VALUES); valueReader.initFromPage(currentPageCount, pageDataBuffer, (int) readPosInBytes); } } if (parentColumnReader.columnDescriptor.getType() == PrimitiveType.PrimitiveTypeName.BOOLEAN) { valueReader = valueEncoding.getValuesReader(parentColumnReader.columnDescriptor, ValuesType.VALUES); valueReader.initFromPage(currentPageCount, pageDataBuffer, (int) readPosInBytes); } if 
(valueEncoding.usesDictionary()) { // initialize two of the dictionary readers, one is for determining the lengths of each value, the second is for // actually copying the values out into the vectors dictionaryLengthDeterminingReader = new DictionaryValuesReader(dictionary); dictionaryLengthDeterminingReader.initFromPage(currentPageCount, pageDataBuffer, (int) readPosInBytes); dictionaryValueReader = new DictionaryValuesReader(dictionary); dictionaryValueReader.initFromPage(currentPageCount, pageDataBuffer, (int) readPosInBytes); parentColumnReader.usingDictionary = true; } else { parentColumnReader.usingDictionary = false; } // readPosInBytes is used for actually reading the values after we determine how many will fit in the vector // readyToReadPosInBytes serves a similar purpose for the vector types where we must count up the values that will // fit one record at a time, such as for variable length data. Both operations must start in the same location after the // definition and repetition level data which is stored alongside the page data itself readyToReadPosInBytes = readPosInBytes; return true; }