Java Code Examples for org.apache.parquet.column.Encoding#usesDictionary()

The following examples show how to use org.apache.parquet.column.Encoding#usesDictionary(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: VectorizedPageIterator.java    From iceberg with Apache License 2.0 5 votes vote down vote up
/**
 * Prepares the values reader for a newly loaded data page.
 *
 * <p>Pages whose encoding uses a dictionary get a fresh
 * {@code VectorizedDictionaryEncodedParquetValuesReader} and an EAGER or LAZY decode mode; all
 * other pages get a fresh {@code ValuesAsBytesReader} and decode mode NONE.
 *
 * @param dataEncoding encoding of the values stored in the page
 * @param in stream the new reader is initialized from
 * @param valueCount number of values passed to the reader's {@code initFromPage}
 * @throws ParquetDecodingException if the encoding needs a dictionary but none is set, or if
 *     initializing the dictionary reader fails with an {@link IOException}
 */
@Override
protected void initDataReader(Encoding dataEncoding, ByteBufferInputStream in, int valueCount) {
  // Capture the reader that was active before this page replaces it.
  final ValuesReader priorReader = plainValuesReader;

  if (!dataEncoding.usesDictionary()) {
    plainValuesReader = new ValuesAsBytesReader();
    plainValuesReader.initFromPage(valueCount, in);
    dictionaryDecodeMode = DictionaryDecodeMode.NONE;
  } else {
    if (dictionary == null) {
      throw new ParquetDecodingException(
          "could not read page in col " + desc + " as the dictionary was missing for encoding " + dataEncoding);
    }
    try {
      dictionaryEncodedValuesReader =
          new VectorizedDictionaryEncodedParquetValuesReader(desc.getMaxDefinitionLevel(), setArrowValidityVector);
      dictionaryEncodedValuesReader.initFromPage(valueCount, in);
      // Integer columns, and columns with at least one non-dictionary page, are decoded eagerly.
      final boolean decodeEagerly = ParquetUtil.isIntType(desc.getPrimitiveType()) || !allPagesDictEncoded;
      dictionaryDecodeMode = decodeEagerly ? DictionaryDecodeMode.EAGER : DictionaryDecodeMode.LAZY;
    } catch (IOException e) {
      throw new ParquetDecodingException("could not read page in col " + desc, e);
    }
  }

  final boolean sequentialReadsRequired =
      CorruptDeltaByteArrays.requiresSequentialReads(writerVersion, dataEncoding);
  // instanceof is false for null, so no separate null check is needed.
  if (sequentialReadsRequired && priorReader instanceof RequiresPreviousReader) {
    // previous reader can only be set if reading sequentially
    ((RequiresPreviousReader) plainValuesReader).setPreviousReader(priorReader);
  }
}
 
Example 2
Source File: PageIterator.java    From iceberg with Apache License 2.0 5 votes vote down vote up
/**
 * Creates and initializes the values reader for the next data page.
 *
 * <p>Records the page encoding, picks a dictionary-based or plain values reader depending on
 * {@code dataEncoding.usesDictionary()}, and initializes it from {@code in}.
 *
 * @param dataEncoding encoding of the values stored in the page
 * @param in stream the values reader is initialized from
 * @param valueCount number of values passed to the reader's {@code initFromPage}
 * @throws ParquetDecodingException if the encoding needs a dictionary but none is set, or if
 *     initializing the reader fails with an {@link IOException}
 */
@Override
protected void initDataReader(Encoding dataEncoding, ByteBufferInputStream in, int valueCount) {
  // Keep a handle on the reader used for the previous page before it is replaced.
  final ValuesReader priorReader = values;

  this.valueEncoding = dataEncoding;

  // TODO: May want to change this so that this class is not dictionary-aware.
  // Wrappers could handle dictionary columns instead, but that is not possible today because
  // RLE must be read by getDictionaryBasedValuesReader.
  if (!dataEncoding.usesDictionary()) {
    this.values = dataEncoding.getValuesReader(desc, ValuesType.VALUES);
  } else if (dictionary != null) {
    this.values = dataEncoding.getDictionaryBasedValuesReader(desc, ValuesType.VALUES, dictionary);
  } else {
    throw new ParquetDecodingException(
        "could not read page in col " + desc + " as the dictionary was missing for encoding " + dataEncoding);
  }

  try {
    values.initFromPage(valueCount, in);
  } catch (IOException e) {
    throw new ParquetDecodingException("could not read page in col " + desc, e);
  }

  final boolean sequentialReadsRequired =
      CorruptDeltaByteArrays.requiresSequentialReads(writerVersion, dataEncoding);
  if (sequentialReadsRequired && priorReader instanceof RequiresPreviousReader) {
    // previous reader can only be set if reading sequentially
    ((RequiresPreviousReader) values).setPreviousReader(priorReader);
  }
}
 
Example 3
Source File: PageIterator.java    From iceberg with Apache License 2.0 5 votes vote down vote up
/**
 * Creates and initializes the values reader for the next data page.
 *
 * <p>Records the page encoding, selects a dictionary-backed reader when the encoding uses a
 * dictionary (a plain one otherwise), and initializes the chosen reader from {@code in}.
 *
 * @param dataEncoding encoding of the values stored in the page
 * @param in stream the values reader is initialized from
 * @param valueCount number of values passed to the reader's {@code initFromPage}
 * @throws ParquetDecodingException if the encoding needs a dictionary but none is set, or if
 *     initializing the reader fails with an {@link IOException}
 */
private void initDataReader(Encoding dataEncoding, ByteBufferInputStream in, int valueCount) {
  // The reader of the previous page, captured before it is overwritten below.
  final ValuesReader readerOfLastPage = values;

  this.valueEncoding = dataEncoding;

  // TODO: May want to change this so that this class is not dictionary-aware.
  // Wrappers could handle dictionary columns instead, but that is not possible today because
  // RLE must be read by getDictionaryBasedValuesReader.
  if (dataEncoding.usesDictionary()) {
    final boolean dictionaryMissing = dict == null;
    if (dictionaryMissing) {
      throw new ParquetDecodingException(
          "could not read page in col " + desc + " as the dictionary was missing for encoding " + dataEncoding);
    }
    this.values = dataEncoding.getDictionaryBasedValuesReader(desc, VALUES, dict);
  } else {
    this.values = dataEncoding.getValuesReader(desc, VALUES);
  }

  try {
    values.initFromPage(valueCount, in);
  } catch (IOException e) {
    throw new ParquetDecodingException("could not read page in col " + desc, e);
  }

  // instanceof already rejects null, so a separate null check is unnecessary.
  if (CorruptDeltaByteArrays.requiresSequentialReads(writerVersion, dataEncoding)
      && readerOfLastPage instanceof RequiresPreviousReader) {
    // previous reader can only be set if reading sequentially
    ((RequiresPreviousReader) values).setPreviousReader(readerOfLastPage);
  }
}
 
Example 4
Source File: PageReader.java    From dremio-oss with Apache License 2.0 5 votes vote down vote up
/**
 * Reports whether at least one of the given encodings uses a dictionary.
 *
 * @param encodings encodings to inspect
 * @return {@code true} if any element's {@code usesDictionary()} returns {@code true};
 *     {@code false} otherwise, including for an empty collection
 */
private boolean isDictionaryEncoded(Collection<Encoding> encodings) {
  return encodings.stream().anyMatch(Encoding::usesDictionary);
}
 
Example 5
Source File: AbstractColumnReader.java    From flink with Apache License 2.0 5 votes vote down vote up
/**
 * Sets up the decoding state for a freshly read data page.
 *
 * <p>Pages whose encoding uses a dictionary (PLAIN_DICTIONARY or RLE_DICTIONARY) are read through
 * a new {@code RunLengthDecoder} of dictionary ids. PLAIN pages are read from the remaining
 * stream of {@code in}. Every other encoding is rejected. {@code afterReadPage()} is invoked last.
 *
 * @param dataEncoding encoding of the page's values
 * @param in stream holding the page's values
 * @throws IOException if a dictionary-encoded page arrives while no dictionary is set, or if the
 *     dictionary id decoder cannot be initialized from the stream
 * @throws UnsupportedOperationException if the encoding is not one of the supported ones
 */
private void prepareNewPage(
		Encoding dataEncoding,
		ByteBufferInputStream in) throws IOException {
	this.endOfPageValueCount = valuesRead + pageValueCount;

	if (!dataEncoding.usesDictionary()) {
		// Non-dictionary pages: only PLAIN is supported.
		if (dataEncoding != Encoding.PLAIN) {
			throw new UnsupportedOperationException("Unsupported encoding: " + dataEncoding);
		}
		this.dictionaryIdsDecoder = null;
		LOG.debug("init from page at offset {} for length {}", in.position(), in.available());
		this.dataInputStream = in.remainingStream();
		this.isCurrentPageDictionaryEncoded = false;
	} else {
		if (dictionary == null) {
			throw new IOException(
					"could not read page in col " + descriptor +
							" as the dictionary was missing for encoding " + dataEncoding);
		}
		@SuppressWarnings("deprecation")
		Encoding legacyDictEncoding = Encoding.PLAIN_DICTIONARY; // local so the warning can be suppressed
		final boolean supportedDictEncoding =
				dataEncoding == legacyDictEncoding || dataEncoding == Encoding.RLE_DICTIONARY;
		if (!supportedDictEncoding) {
			throw new UnsupportedOperationException("Unsupported encoding: " + dataEncoding);
		}
		this.dataInputStream = null;
		this.dictionaryIdsDecoder = new RunLengthDecoder();
		try {
			this.dictionaryIdsDecoder.initFromStream(pageValueCount, in);
		} catch (IOException e) {
			throw new IOException("could not read dictionary in col " + descriptor, e);
		}
		this.isCurrentPageDictionaryEncoded = true;
	}

	afterReadPage();
}
 
Example 6
Source File: FallbackValuesWriter.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Returns the encoding reported by the current writer.
 *
 * <p>As a side effect, while no fallback has happened and the flag is still {@code false},
 * {@code initialUsedAndHadDictionary} is updated to whether that encoding uses a dictionary.
 *
 * @return the current writer's encoding
 */
@Override
public Encoding getEncoding() {
  final Encoding activeEncoding = currentWriter.getEncoding();
  // Once a fallback has happened or the flag has been set, it is left untouched.
  final boolean flagSettled = fellBackAlready || initialUsedAndHadDictionary;
  if (!flagSettled) {
    initialUsedAndHadDictionary = activeEncoding.usesDictionary();
  }
  return activeEncoding;
}
 
Example 7
Source File: ColumnReaderBase.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Creates, binds and initializes the values reader for the next data page.
 *
 * <p>Updates the current encoding and page value counters, picks a dictionary-based or plain
 * values reader, binds to the dictionary when both the encoding and the converter support it
 * (otherwise binds by the column's type), and finally initializes the reader from {@code in}.
 *
 * @param dataEncoding encoding of the values stored in the page
 * @param in stream the values reader is initialized from
 * @param valueCount number of values in the page
 * @throws ParquetDecodingException if the encoding needs a dictionary but none is set, or if
 *     initializing the reader fails with an {@link IOException}
 */
private void initDataReader(Encoding dataEncoding, ByteBufferInputStream in, int valueCount) {
  // The reader of the previous page, captured before dataColumn is replaced.
  final ValuesReader priorReader = this.dataColumn;

  this.currentEncoding = dataEncoding;
  this.pageValueCount = valueCount;
  this.endOfPageValueCount = readValues + pageValueCount;

  final boolean dictionaryEncoded = dataEncoding.usesDictionary();

  if (!dictionaryEncoded) {
    this.dataColumn = dataEncoding.getValuesReader(path, VALUES);
  } else if (dictionary != null) {
    this.dataColumn = dataEncoding.getDictionaryBasedValuesReader(path, VALUES, dictionary);
  } else {
    throw new ParquetDecodingException(
        "could not read page in col " + path + " as the dictionary was missing for encoding " + dataEncoding);
  }

  if (dictionaryEncoded && converter.hasDictionarySupport()) {
    bindToDictionary(dictionary);
  } else {
    bind(path.getType());
  }

  try {
    dataColumn.initFromPage(pageValueCount, in);
  } catch (IOException e) {
    throw new ParquetDecodingException("could not read page in col " + path, e);
  }

  // instanceof already rejects null, so a separate null check is unnecessary.
  if (CorruptDeltaByteArrays.requiresSequentialReads(writerVersion, dataEncoding)
      && priorReader instanceof RequiresPreviousReader) {
    // previous reader can only be set if reading sequentially
    ((RequiresPreviousReader) dataColumn).setPreviousReader(priorReader);
  }
}
 
Example 8
Source File: PageReader.java    From Bats with Apache License 2.0 4 votes vote down vote up
/**
 * Grab the next page and set up the level and value readers for it.
 *
 * <p>Resets the per-page counters, clears the buffers, calls {@code nextInternal()} (which is
 * expected to populate {@code pageData} and {@code pageHeader}), and then initializes the
 * repetition-level, definition-level and value or dictionary readers from the page buffer.
 * The time spent on this reader setup is added to {@code stats.timeDataPageDecode}.
 *
 * @return {@code true} if another page was present and prepared; {@code false} if the parent
 *     reader has already read at least the column chunk's value count, or if {@code pageData} or
 *     {@code pageHeader} is {@code null} after {@code nextInternal()}
 * @throws IOException declared by this method; presumably raised by {@code nextInternal()} or the
 *     readers' {@code initFromPage} calls (not confirmed from this snippet)
 */
public boolean next() throws IOException {
  Stopwatch timer = Stopwatch.createUnstarted();
  // Reset the per-page state; -1 marks "no page loaded yet".
  currentPageCount = -1;
  valuesRead = 0;
  valuesReadyToRead = 0;

  // TODO - the metadata for total size appears to be incorrect for impala generated files, need to find cause
  // and submit a bug report
  long totalValueCount = parentColumnReader.columnChunkMetaData.getValueCount();
  if(parentColumnReader.totalValuesRead >= totalValueCount) {
    return false;
  }
  clearBuffers();

  nextInternal();
  if(pageData == null || pageHeader == null){
    //TODO: Is this an error condition or a normal condition??
    return false;
  }

  // The timer is started only now, so it measures the reader setup below and not nextInternal().
  timer.start();
  currentPageCount = pageHeader.data_page_header.num_values;

  final Encoding rlEncoding = METADATA_CONVERTER.getEncoding(pageHeader.data_page_header.repetition_level_encoding);
  final Encoding dlEncoding = METADATA_CONVERTER.getEncoding(pageHeader.data_page_header.definition_level_encoding);
  final Encoding valueEncoding = METADATA_CONVERTER.getEncoding(pageHeader.data_page_header.encoding);

  byteLength = pageHeader.uncompressed_page_size;

  // A single stream over the whole page buffer; each initFromPage call below reads from it in turn,
  // and in.position() is sampled afterwards to track where the next section starts.
  final ByteBufferInputStream in = ByteBufferInputStream.wrap(pageData.nioBuffer(0, pageData.capacity()));

  readPosInBytes = 0;
  if (parentColumnReader.getColumnDescriptor().getMaxRepetitionLevel() > 0) {
    repetitionLevels = rlEncoding.getValuesReader(parentColumnReader.columnDescriptor, ValuesType.REPETITION_LEVEL);
    repetitionLevels.initFromPage(currentPageCount, in);
    // we know that the first value will be a 0, at the end of each list of repeated values we will hit another 0 indicating
    // a new record, although we don't know the length until we hit it (and this is a one way stream of integers) so we
    // read the first zero here to simplify the reading processes, and start reading the first value the same as all
    // of the rest. Effectively we are 'reading' the non-existent value in front of the first allowing direct access to
    // the first list of repetition levels
    readPosInBytes = in.position();
    repetitionLevels.readInteger();
  }
  if (parentColumnReader.columnDescriptor.getMaxDefinitionLevel() != 0) {
    parentColumnReader.currDefLevel = -1;
    definitionLevels = dlEncoding.getValuesReader(parentColumnReader.columnDescriptor, ValuesType.DEFINITION_LEVEL);
    definitionLevels.initFromPage(currentPageCount, in);
    readPosInBytes = in.position();
    // Non-dictionary values get their reader here; dictionary pages are handled further down.
    if (!valueEncoding.usesDictionary()) {
      valueReader = valueEncoding.getValuesReader(parentColumnReader.columnDescriptor, ValuesType.VALUES);
      valueReader.initFromPage(currentPageCount, in);
    }
  }
  // BOOLEAN columns that still have no value reader get one regardless of the definition level.
  // NOTE(review): valueReader is a field, so a reader left over from an earlier page would skip this - confirm
  // that clearBuffers() or the caller resets it.
  if (valueReader == null && parentColumnReader.columnDescriptor.getType() == PrimitiveType.PrimitiveTypeName.BOOLEAN) {
    valueReader = valueEncoding.getValuesReader(parentColumnReader.columnDescriptor, ValuesType.VALUES);
    valueReader.initFromPage(currentPageCount, in);
  }
  if (valueEncoding.usesDictionary()) {
    // initialize two of the dictionary readers, one is for determining the lengths of each value, the second is for
    // actually copying the values out into the vectors
    Preconditions.checkState(readPosInBytes < pageData.capacity());
    int index = (int)readPosInBytes;
    // Both readers are initialized from the same slice, which starts at readPosInBytes.
    ByteBuffer byteBuffer = pageData.nioBuffer(index, pageData.capacity() - index);
    dictionaryLengthDeterminingReader = new DictionaryValuesReader(dictionary);
    dictionaryLengthDeterminingReader.initFromPage(currentPageCount, ByteBufferInputStream.wrap(byteBuffer));
    dictionaryValueReader = new DictionaryValuesReader(dictionary);
    dictionaryValueReader.initFromPage(currentPageCount, ByteBufferInputStream.wrap(byteBuffer));
    parentColumnReader.usingDictionary = true;
  } else {
    parentColumnReader.usingDictionary = false;
  }
  // readPosInBytes is used for actually reading the values after we determine how many will fit in the vector
  // readyToReadPosInBytes serves a similar purpose for the vector types where we must count up the values that will
  // fit one record at a time, such as for variable length data. Both operations must start in the same location after the
  // definition and repetition level data which is stored alongside the page data itself
  readyToReadPosInBytes = readPosInBytes;
  // Record the setup time and count this page in the decode statistics.
  long timeDecode = timer.elapsed(TimeUnit.NANOSECONDS);
  stats.numDataPagesDecoded.incrementAndGet();
  stats.timeDataPageDecode.addAndGet(timeDecode);
  return true;
}
 
Example 9
Source File: PageReader.java    From dremio-oss with Apache License 2.0 4 votes vote down vote up
/**
 * Grab the next page and set up the level and value readers for it.
 *
 * <p>Reads page headers until a non-dictionary page is found (each dictionary page is handed to
 * {@code readDictionaryPage}), reads that page into {@code pageData}, and then initializes the
 * repetition-level, definition-level and value or dictionary readers at successive offsets of
 * the page buffer.
 *
 * @return {@code true} if another page was present and prepared; {@code false} if the data
 *     reader has no remainder or the parent reader has read exactly the column chunk's value count
 * @throws java.io.IOException declared by this method; presumably raised while reading the page
 *     header or page data, or while initializing the readers (not confirmed from this snippet)
 */
public boolean next() throws IOException {
  Stopwatch timer = Stopwatch.createUnstarted();
  // Reset the per-page state; -1 marks "no page loaded yet".
  currentPageCount = -1;
  valuesRead = 0;
  valuesReadyToRead = 0;

  // TODO - the metadata for total size appears to be incorrect for impala generated files, need to find cause
  // and submit a bug report
  if(!dataReader.hasRemainder() || parentColumnReader.totalValuesRead == parentColumnReader.columnChunkMetaData.getValueCount()) {
    return false;
  }
  clearBuffers();

  // next, we need to decompress the bytes
  // TODO - figure out if we need multiple dictionary pages, I believe it may be limited to one
  // I think we are clobbering parts of the dictionary if there can be multiple pages of dictionary
  // Keep reading headers, consuming dictionary pages, until a non-dictionary page header is read.
  do {
    long start=inputStream.getPos();
    // The timer covers only the header read; it is reset at the end of each iteration.
    timer.start();
    pageHeader = dataReader.readPageHeader();
    long timeToRead = timer.elapsed(TimeUnit.MICROSECONDS);
    this.updateStats(pageHeader, "Page Header Read", start, timeToRead, 0,0);
    logger.trace("ParquetTrace,{},{},{},{},{},{},{},{}","Page Header Read","",
        this.parentColumnReader.parentReader.fsPath,
        this.parentColumnReader.columnDescriptor.toString(), start, 0, 0, timeToRead);
    timer.reset();
    if (pageHeader.getType() == PageType.DICTIONARY_PAGE) {
      readDictionaryPage(pageHeader, parentColumnReader);
    }
  } while (pageHeader.getType() == PageType.DICTIONARY_PAGE);

  //TODO: Handle buffer allocation exception

  // Allocate a buffer sized for the uncompressed page, then read the page into it.
  allocatePageData(pageHeader.getUncompressed_page_size());
  int compressedSize = pageHeader.getCompressed_page_size();
  int uncompressedSize = pageHeader.getUncompressed_page_size();
  readPage(pageHeader, compressedSize, uncompressedSize, pageData);

  currentPageCount = pageHeader.data_page_header.num_values;

  final Encoding rlEncoding = METADATA_CONVERTER.getEncoding(pageHeader.data_page_header.repetition_level_encoding);

  final Encoding dlEncoding = METADATA_CONVERTER.getEncoding(pageHeader.data_page_header.definition_level_encoding);
  final Encoding valueEncoding = METADATA_CONVERTER.getEncoding(pageHeader.data_page_header.encoding);

  byteLength = pageHeader.uncompressed_page_size;

  // One NIO view over the whole page; every reader below is initialized at an explicit offset into it.
  final ByteBuffer pageDataBuffer = pageData.nioBuffer(0, LargeMemoryUtil.checkedCastToInt(pageData.capacity()));

  readPosInBytes = 0;
  if (parentColumnReader.getColumnDescriptor().getMaxRepetitionLevel() > 0) {
    repetitionLevels = rlEncoding.getValuesReader(parentColumnReader.columnDescriptor, ValuesType.REPETITION_LEVEL);
    repetitionLevels.initFromPage(currentPageCount, pageDataBuffer, (int) readPosInBytes);
    // we know that the first value will be a 0, at the end of each list of repeated values we will hit another 0 indicating
    // a new record, although we don't know the length until we hit it (and this is a one way stream of integers) so we
    // read the first zero here to simplify the reading processes, and start reading the first value the same as all
    // of the rest. Effectively we are 'reading' the non-existent value in front of the first allowing direct access to
    // the first list of repetition levels
    readPosInBytes = repetitionLevels.getNextOffset();
    repetitionLevels.readInteger();
  }
  if (parentColumnReader.columnDescriptor.getMaxDefinitionLevel() != 0){
    parentColumnReader.currDefLevel = -1;
    definitionLevels = dlEncoding.getValuesReader(parentColumnReader.columnDescriptor, ValuesType.DEFINITION_LEVEL);
    definitionLevels.initFromPage(currentPageCount, pageDataBuffer, (int) readPosInBytes);
    // The next section starts at the offset reported by the definition-level reader.
    readPosInBytes = definitionLevels.getNextOffset();
    // Non-dictionary values get their reader here; dictionary pages are handled further down.
    if (!valueEncoding.usesDictionary()) {
      valueReader = valueEncoding.getValuesReader(parentColumnReader.columnDescriptor, ValuesType.VALUES);
      valueReader.initFromPage(currentPageCount, pageDataBuffer, (int) readPosInBytes);
    }
  }
  // BOOLEAN columns always get a value reader here, regardless of the definition level; if one was
  // created in the branch above it is replaced.
  // NOTE(review): this runs even when valueEncoding uses a dictionary - confirm that is intended.
  if (parentColumnReader.columnDescriptor.getType() == PrimitiveType.PrimitiveTypeName.BOOLEAN) {
    valueReader = valueEncoding.getValuesReader(parentColumnReader.columnDescriptor, ValuesType.VALUES);
    valueReader.initFromPage(currentPageCount, pageDataBuffer, (int) readPosInBytes);
  }
  if (valueEncoding.usesDictionary()) {
    // initialize two of the dictionary readers, one is for determining the lengths of each value, the second is for
    // actually copying the values out into the vectors
    dictionaryLengthDeterminingReader = new DictionaryValuesReader(dictionary);
    dictionaryLengthDeterminingReader.initFromPage(currentPageCount, pageDataBuffer, (int) readPosInBytes);
    dictionaryValueReader = new DictionaryValuesReader(dictionary);
    dictionaryValueReader.initFromPage(currentPageCount, pageDataBuffer, (int) readPosInBytes);
    parentColumnReader.usingDictionary = true;
  } else {
    parentColumnReader.usingDictionary = false;
  }
  // readPosInBytes is used for actually reading the values after we determine how many will fit in the vector
  // readyToReadPosInBytes serves a similar purpose for the vector types where we must count up the values that will
  // fit one record at a time, such as for variable length data. Both operations must start in the same location after the
  // definition and repetition level data which is stored alongside the page data itself
  readyToReadPosInBytes = readPosInBytes;
  return true;
}