Java Code Examples for org.apache.parquet.bytes.ByteBufferInputStream#wrap()

The following examples show how to use org.apache.parquet.bytes.ByteBufferInputStream#wrap(). Each example is taken from an open-source project; the source file and license are noted above it.
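Before the project examples, here is a minimal, self-contained sketch of the overloads exercised below: wrap() accepts a single ByteBuffer, several buffers as varargs, or a List<ByteBuffer>, and exposes them as one contiguous InputStream. The class name and string contents here are illustrative only.

import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.charset.StandardCharsets;
import org.apache.parquet.bytes.ByteBufferInputStream;

class WrapSketch {
  static void demo() throws IOException {
    // Single-buffer overload
    ByteBufferInputStream single =
        ByteBufferInputStream.wrap(ByteBuffer.wrap("hello".getBytes(StandardCharsets.UTF_8)));
    System.out.println(single.available()); // 5

    // Varargs overload: both buffers read as one contiguous stream
    ByteBufferInputStream multi = ByteBufferInputStream.wrap(
        ByteBuffer.wrap("hello ".getBytes(StandardCharsets.UTF_8)),
        ByteBuffer.wrap("world".getBytes(StandardCharsets.UTF_8)));
    multi.skipFully(6);                    // skip "hello " across the buffer boundary
    System.out.println(multi.available()); // 5 bytes of "world" remain
  }
}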
Example 1
Source File: TestDictionary.java    From parquet-mr with Apache License 2.0
@Test
public void testZeroValues() throws IOException {
  FallbackValuesWriter<PlainIntegerDictionaryValuesWriter, PlainValuesWriter> cw = newPlainIntegerDictionaryValuesWriter(100, 100);
  cw.writeInteger(34);
  cw.writeInteger(34);
  getBytesAndCheckEncoding(cw, PLAIN_DICTIONARY);
  DictionaryValuesReader reader = initDicReader(cw, INT32);

  // pretend there are 100 nulls. what matters is offset = bytes.length.
  ByteBuffer bytes = ByteBuffer.wrap(new byte[] {0x00, 0x01, 0x02, 0x03}); // data doesn't matter
  ByteBufferInputStream stream = ByteBufferInputStream.wrap(bytes);
  stream.skipFully(stream.available());
  reader.initFromPage(100, stream);

  // Testing the deprecated behavior of passing the page buffer and offset directly
  reader = initDicReader(cw, INT32);
  int offset = bytes.remaining();
  reader.initFromPage(100, bytes, offset);
}
 
Example 2
Source File: ByteStreamSplitValuesReaderTest.java    From parquet-mr with Apache License 2.0
@Test
public void testSkip() throws Exception {
  byte[] byteData = new byte[16];
  for (int i = 0; i < 16; ++i) {
    byteData[i] = (byte) 0xFF;
  }
  byteData[3] = (byte) 0x00;
  byteData[7] = (byte) 0x00;
  byteData[11] = (byte) 0x10;
  byteData[15] = (byte) 0x40;
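  // BYTE_STREAM_SPLIT stores the i-th byte of every value in the i-th stream, so the fourth float's
  // bytes are byteData[3], byteData[7], byteData[11], byteData[15] = 0x40100000 little-endian = 2.25f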
  ByteBuffer buffer = ByteBuffer.wrap(byteData);
  ByteBufferInputStream stream = ByteBufferInputStream.wrap(buffer);

  ByteStreamSplitValuesReaderForFloat reader = new ByteStreamSplitValuesReaderForFloat();
  reader.initFromPage(4, stream);
  reader.skip(3);
  float f = reader.readFloat();
  assertEquals(2.25f, f, 0.0f);
}
 
Example 3
Source File: PageReader.java    From Bats with Apache License 2.0
/**
 * Enables Parquet column readers to reset the definition level reader to a specific state.
 * @param skipCount the number of rows to skip
 *
 * @throws IOException if an I/O error occurs while reading the page data
 */
void resetDefinitionLevelReader(int skipCount) throws IOException {
  Preconditions.checkState(parentColumnReader.columnDescriptor.getMaxDefinitionLevel() == 1);
  Preconditions.checkState(currentPageCount > 0);

  final Encoding rlEncoding = METADATA_CONVERTER.getEncoding(pageHeader.data_page_header.repetition_level_encoding);
  final Encoding dlEncoding = METADATA_CONVERTER.getEncoding(pageHeader.data_page_header.definition_level_encoding);

  final ByteBufferInputStream in = ByteBufferInputStream.wrap(pageData.nioBuffer(0, pageData.capacity()));

  if (parentColumnReader.getColumnDescriptor().getMaxRepetitionLevel() > 0) {
    repetitionLevels = rlEncoding.getValuesReader(parentColumnReader.columnDescriptor, ValuesType.REPETITION_LEVEL);
    repetitionLevels.initFromPage(currentPageCount, in);
    repetitionLevels.readInteger();
  }

  definitionLevels = dlEncoding.getValuesReader(parentColumnReader.columnDescriptor, ValuesType.DEFINITION_LEVEL);
  parentColumnReader.currDefLevel = -1;

  // Now reinitialize the underlying decoder
  definitionLevels.initFromPage(currentPageCount, in);

  // Skip values if requested by caller
  for (int idx = 0; idx < skipCount; ++idx) {
    definitionLevels.skip();
  }
}
 
Example 4
Source File: ByteStreamSplitValuesReaderTest.java    From parquet-mr with Apache License 2.0
private void testReader(byte[] input, float[] values) throws IOException {
  ByteBuffer buffer = ByteBuffer.wrap(input);
  ByteBufferInputStream stream = ByteBufferInputStream.wrap(buffer);
  ByteStreamSplitValuesReaderForFloat reader = new ByteStreamSplitValuesReaderForFloat();
  reader.initFromPage(values.length, stream);
  for (float expectedValue : values) {
    float f = reader.readFloat();
    assertEquals(expectedValue, f, 0.0f);
  }
}
 
Example 5
Source File: ByteStreamSplitValuesReaderTest.java    From parquet-mr with Apache License 2.0
@Test
public void testExtraReads() throws Exception {
  byte[] byteData = {(byte) 0x00, (byte) 0x00, (byte) 0x10, (byte) 0x40};
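  // each of the four byte streams holds one byte of the single float: 0x00 0x00 0x10 0x40 is 0x40100000 little-endian = 2.25f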
  ByteBuffer buffer = ByteBuffer.wrap(byteData);
  ByteBufferInputStream stream = ByteBufferInputStream.wrap(buffer);

  ByteStreamSplitValuesReaderForFloat reader = new ByteStreamSplitValuesReaderForFloat();
  reader.initFromPage(1, stream);
  float f = reader.readFloat();
  assertEquals(2.25f, f, 0.0f);
  try {
    reader.readFloat();
    Assert.fail("Expected an exception.");
  } catch (ParquetDecodingException ex) {}
}
 
Example 6
Source File: ByteStreamSplitValuesReaderTest.java    From parquet-mr with Apache License 2.0
@Test
public void testSkipOverflow() throws Exception {
  byte[] byteData = new byte[128];
  ByteBuffer buffer = ByteBuffer.wrap(byteData);
  ByteBufferInputStream stream = ByteBufferInputStream.wrap(buffer);

  ByteStreamSplitValuesReaderForFloat reader = new ByteStreamSplitValuesReaderForFloat();
  reader.initFromPage(32, stream);

  try {
    reader.skip(33);
    Assert.fail("Expected an exception.");
  } catch (ParquetDecodingException ex) {}
}
 
Example 7
Source File: ByteStreamSplitValuesReaderTest.java    From parquet-mr with Apache License 2.0
@Test
public void testSkipUnderflow() throws Exception {
  byte[] byteData = new byte[128];
  ByteBuffer buffer = ByteBuffer.wrap(byteData);
  ByteBufferInputStream stream = ByteBufferInputStream.wrap(buffer);

  ByteStreamSplitValuesReaderForFloat reader = new ByteStreamSplitValuesReaderForFloat();
  reader.initFromPage(32, stream);

  try {
    reader.skip(-1);
    Assert.fail("Expected an exception.");
  } catch (ParquetDecodingException ex) {}
}
 
Example 8
Source File: ByteStreamSplitValuesReaderTest.java    From parquet-mr with Apache License 2.0
private void testReader(byte[] input, double[] values) throws IOException {
  ByteBuffer buffer = ByteBuffer.wrap(input);
  ByteBufferInputStream stream = ByteBufferInputStream.wrap(buffer);
  ByteStreamSplitValuesReaderForDouble reader = new ByteStreamSplitValuesReaderForDouble();
  reader.initFromPage(values.length, stream);
  for (double expectedValue : values) {
    double d = reader.readDouble();
    assertEquals(expectedValue, d, 0.0);
  }
}
 
Example 9
Source File: TestValuesReaderImpl.java    From parquet-mr with Apache License 2.0
private void validateWithByteBufferInputStream(ValuesReader reader) throws IOException {
  ByteBufferInputStream bbis = ByteBufferInputStream.wrap(
      ByteBuffer.wrap("==padding==".getBytes()),
      ByteBuffer.wrap("The expected ".getBytes()),
      ByteBuffer.wrap("page content".getBytes()));
  bbis.skipFully(11);
  reader.initFromPage(25, bbis);
  assertEquals("The expected page content", reader.readBytes().toStringUsingUTF8());
}
 
Example 10
Source File: ParquetFileReader.java    From parquet-mr with Apache License 2.0
private static final ParquetMetadata readFooter(InputFile file, ParquetReadOptions options, SeekableInputStream f, ParquetMetadataConverter converter) throws IOException {
  long fileLen = file.getLength();
  LOG.debug("File length {}", fileLen);
  int FOOTER_LENGTH_SIZE = 4;
  if (fileLen < MAGIC.length + FOOTER_LENGTH_SIZE + MAGIC.length) { // MAGIC + data + footer + footerIndex + MAGIC
    throw new RuntimeException(file.toString() + " is not a Parquet file (too small length: " + fileLen + ")");
  }
  long footerLengthIndex = fileLen - FOOTER_LENGTH_SIZE - MAGIC.length;
  LOG.debug("reading footer index at {}", footerLengthIndex);

  f.seek(footerLengthIndex);
  int footerLength = readIntLittleEndian(f);
  byte[] magic = new byte[MAGIC.length];
  f.readFully(magic);
  if (!Arrays.equals(MAGIC, magic)) {
    throw new RuntimeException(file.toString() + " is not a Parquet file. expected magic number at tail " + Arrays.toString(MAGIC) + " but found " + Arrays.toString(magic));
  }
  long footerIndex = footerLengthIndex - footerLength;
  LOG.debug("read footer length: {}, footer index: {}", footerLength, footerIndex);
  if (footerIndex < MAGIC.length || footerIndex >= footerLengthIndex) {
    throw new RuntimeException("corrupted file: the footer index is not within the file: " + footerIndex);
  }
  f.seek(footerIndex);
  // Read all the footer bytes in one go to avoid multiple read operations,
  // since a single read can be quite time-consuming in HDFS.
  ByteBuffer footerBytesBuffer = ByteBuffer.allocate(footerLength);
  f.readFully(footerBytesBuffer);
  LOG.debug("Finished to read all footer bytes.");
  footerBytesBuffer.flip();
  InputStream footerBytesStream = ByteBufferInputStream.wrap(footerBytesBuffer);
  return converter.readParquetMetadata(footerBytesStream, options.getMetadataFilter());
}
 
Example 11
Source File: ParquetFileReader.java    From parquet-mr with Apache License 2.0
/**
 * @param f file to read the chunks from
 * @param builder used to build the chunk list for reading the pages of the different columns
 * @throws IOException if there is an error while reading from the stream
 */
public void readAll(SeekableInputStream f, ChunkListBuilder builder) throws IOException {
  List<Chunk> result = new ArrayList<Chunk>(chunks.size());
  f.seek(offset);

  int fullAllocations = length / options.getMaxAllocationSize();
  int lastAllocationSize = length % options.getMaxAllocationSize();

  int numAllocations = fullAllocations + (lastAllocationSize > 0 ? 1 : 0);
  List<ByteBuffer> buffers = new ArrayList<>(numAllocations);

  for (int i = 0; i < fullAllocations; i += 1) {
    buffers.add(options.getAllocator().allocate(options.getMaxAllocationSize()));
  }

  if (lastAllocationSize > 0) {
    buffers.add(options.getAllocator().allocate(lastAllocationSize));
  }

  for (ByteBuffer buffer : buffers) {
    f.readFully(buffer);
    buffer.flip();
  }

  // report in a counter the data we just scanned
  BenchmarkCounter.incrementBytesRead(length);
  ByteBufferInputStream stream = ByteBufferInputStream.wrap(buffers);
  for (int i = 0; i < chunks.size(); i++) {
    ChunkDescriptor descriptor = chunks.get(i);
    builder.add(descriptor, stream.sliceBuffers(descriptor.size), f);
  }
}
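
The sliceBuffers(n) call above is what lets one large sequential read be partitioned across the column chunks: assuming the usual parquet-mr semantics, it returns the next n bytes as zero-copy ByteBuffer slices and advances the stream. A minimal sketch (the byte values are illustrative):

import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.List;
import org.apache.parquet.bytes.ByteBufferInputStream;

class SliceBuffersSketch {
  static void demo() throws IOException {
    ByteBufferInputStream stream =
        ByteBufferInputStream.wrap(ByteBuffer.wrap(new byte[] {1, 2, 3, 4, 5, 6}));
    // Take the next 4 bytes as slices of the underlying buffer, without copying
    List<ByteBuffer> head = stream.sliceBuffers(4);
    // The stream has advanced past the sliced bytes
    System.out.println(stream.position()); // 4
  }
}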
 
Example 12
Source File: PageReader.java    From Bats with Apache License 2.0
/**
 * Grab the next page.
 *
 * @return whether another page was present and read
 * @throws IOException if an I/O error occurs while reading the page
 */
public boolean next() throws IOException {
  Stopwatch timer = Stopwatch.createUnstarted();
  currentPageCount = -1;
  valuesRead = 0;
  valuesReadyToRead = 0;

  // TODO - the metadata for total size appears to be incorrect for impala generated files, need to find cause
  // and submit a bug report
  long totalValueCount = parentColumnReader.columnChunkMetaData.getValueCount();
  if (parentColumnReader.totalValuesRead >= totalValueCount) {
    return false;
  }
  clearBuffers();

  nextInternal();
  if (pageData == null || pageHeader == null) {
    // TODO: Is this an error condition or a normal condition?
    return false;
  }

  timer.start();
  currentPageCount = pageHeader.data_page_header.num_values;

  final Encoding rlEncoding = METADATA_CONVERTER.getEncoding(pageHeader.data_page_header.repetition_level_encoding);
  final Encoding dlEncoding = METADATA_CONVERTER.getEncoding(pageHeader.data_page_header.definition_level_encoding);
  final Encoding valueEncoding = METADATA_CONVERTER.getEncoding(pageHeader.data_page_header.encoding);

  byteLength = pageHeader.uncompressed_page_size;

  final ByteBufferInputStream in = ByteBufferInputStream.wrap(pageData.nioBuffer(0, pageData.capacity()));

  readPosInBytes = 0;
  if (parentColumnReader.getColumnDescriptor().getMaxRepetitionLevel() > 0) {
    repetitionLevels = rlEncoding.getValuesReader(parentColumnReader.columnDescriptor, ValuesType.REPETITION_LEVEL);
    repetitionLevels.initFromPage(currentPageCount, in);
    // We know the first value will be 0. At the end of each list of repeated values we will hit another 0,
    // indicating a new record, although we don't know the list's length until we hit it (this is a one-way
    // stream of integers). So we read the first zero here to simplify the reading process and start reading
    // the first value the same as all of the rest: effectively we 'read' the non-existent value in front of
    // the first, allowing direct access to the first list of repetition levels.
    readPosInBytes = in.position();
    repetitionLevels.readInteger();
  }
  if (parentColumnReader.columnDescriptor.getMaxDefinitionLevel() != 0) {
    parentColumnReader.currDefLevel = -1;
    definitionLevels = dlEncoding.getValuesReader(parentColumnReader.columnDescriptor, ValuesType.DEFINITION_LEVEL);
    definitionLevels.initFromPage(currentPageCount, in);
    readPosInBytes = in.position();
    if (!valueEncoding.usesDictionary()) {
      valueReader = valueEncoding.getValuesReader(parentColumnReader.columnDescriptor, ValuesType.VALUES);
      valueReader.initFromPage(currentPageCount, in);
    }
  }
  if (valueReader == null && parentColumnReader.columnDescriptor.getType() == PrimitiveType.PrimitiveTypeName.BOOLEAN) {
    valueReader = valueEncoding.getValuesReader(parentColumnReader.columnDescriptor, ValuesType.VALUES);
    valueReader.initFromPage(currentPageCount, in);
  }
  if (valueEncoding.usesDictionary()) {
    // Initialize two dictionary readers: one determines the lengths of each value, the second
    // actually copies the values out into the vectors.
    Preconditions.checkState(readPosInBytes < pageData.capacity());
    int index = (int)readPosInBytes;
    ByteBuffer byteBuffer = pageData.nioBuffer(index, pageData.capacity() - index);
    dictionaryLengthDeterminingReader = new DictionaryValuesReader(dictionary);
    dictionaryLengthDeterminingReader.initFromPage(currentPageCount, ByteBufferInputStream.wrap(byteBuffer));
    dictionaryValueReader = new DictionaryValuesReader(dictionary);
    dictionaryValueReader.initFromPage(currentPageCount, ByteBufferInputStream.wrap(byteBuffer));
    parentColumnReader.usingDictionary = true;
  } else {
    parentColumnReader.usingDictionary = false;
  }
  // readPosInBytes is used for actually reading the values after we determine how many will fit in the vector.
  // readyToReadPosInBytes serves a similar purpose for the vector types where we must count up the values that
  // will fit one record at a time, such as for variable-length data. Both operations must start in the same
  // location, after the definition and repetition level data that is stored alongside the page data itself.
  readyToReadPosInBytes = readPosInBytes;
  long timeDecode = timer.elapsed(TimeUnit.NANOSECONDS);
  stats.numDataPagesDecoded.incrementAndGet();
  stats.timeDataPageDecode.addAndGet(timeDecode);
  return true;
}
 
Example 13
Source File: ParquetReaderUtils.java    From presto with Apache License 2.0
public static ByteBufferInputStream toInputStream(Slice slice)
{
    return ByteBufferInputStream.wrap(slice.toByteBuffer());
}
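
A hypothetical caller, assuming Airlift's Slices utility (the string is illustrative):

Slice slice = Slices.utf8Slice("page content");
ByteBufferInputStream in = ParquetReaderUtils.toInputStream(slice);
// 'in' now streams the slice's bytes via the single-buffer wrap() overload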
 
Example 14
Source File: RunLengthBitPackingHybridIntegrationTest.java    From parquet-mr with Apache License 2.0
private void doIntegrationTest(int bitWidth) throws Exception {
  long modValue = 1L << bitWidth;

  RunLengthBitPackingHybridEncoder encoder = new RunLengthBitPackingHybridEncoder(bitWidth, 1000, 64000, new DirectByteBufferAllocator());
  for (int i = 0; i < 100; i++) {
    encoder.writeInt((int) (i % modValue));
  }
  for (int i = 0; i < 100; i++) {
    encoder.writeInt((int) (77 % modValue));
  }
  for (int i = 0; i < 100; i++) {
    encoder.writeInt((int) (88 % modValue));
  }
  for (int i = 0; i < 1000; i++) {
    encoder.writeInt((int) (i % modValue));
    encoder.writeInt((int) (i % modValue));
    encoder.writeInt((int) (i % modValue));
  }
  for (int i = 0; i < 1000; i++) {
    encoder.writeInt((int) (17 % modValue));
  }
  ByteBuffer encodedBytes = encoder.toBytes().toByteBuffer();
  ByteBufferInputStream in = ByteBufferInputStream.wrap(encodedBytes);

  RunLengthBitPackingHybridDecoder decoder = new RunLengthBitPackingHybridDecoder(bitWidth, in);

  for (int i = 0; i < 100; i++) {
    assertEquals(i % modValue, decoder.readInt());
  }

  for (int i = 0; i < 100; i++) {
    assertEquals(77 % modValue, decoder.readInt());
  }

  for (int i = 0; i < 100; i++) {
    assertEquals(88 % modValue, decoder.readInt());
  }

  for (int i = 0; i < 1000; i++) {
    assertEquals(i % modValue, decoder.readInt());
    assertEquals(i % modValue, decoder.readInt());
    assertEquals(i % modValue, decoder.readInt());
  }

  for (int i = 0; i < 1000; i++) {
    assertEquals(17 % modValue, decoder.readInt());
  }
}
 
Example 15
Source File: ParquetFileReader.java    From parquet-mr with Apache License 2.0
/**
 * @param descriptor descriptor for the chunk
 * @param buffers ByteBuffers that contain the chunk
 * @param offsetIndex the offset index for this column; might be null
 */
public Chunk(ChunkDescriptor descriptor, List<ByteBuffer> buffers, OffsetIndex offsetIndex) {
  this.descriptor = descriptor;
  this.stream = ByteBufferInputStream.wrap(buffers);
  this.offsetIndex = offsetIndex;
}