org.apache.parquet.column.values.ValuesReader Java Examples

The following examples show how to use org.apache.parquet.column.values.ValuesReader. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: BitPackingPerfTest.java    From parquet-mr with Apache License 2.0 6 votes vote down vote up
/**
 * Reads {@code result.length} values from {@code bytes} ten times with the given reader and
 * prints the accumulated read time and throughput to standard output.
 *
 * @param bytes  the encoded bytes the reader is re-initialized from on every pass
 * @param result destination array; its length is the number of values read per pass
 * @param r      the reader being measured
 * @return the time spent initializing and reading, in nanoseconds, summed over all passes
 * @throws IOException propagated from the reader calls
 */
private static long readNTimes(byte[] bytes, int[] result, ValuesReader r)
    throws IOException {
  System.out.println();
  long t = 0;
  int N = 10;
  System.gc();
  System.out.print("                                             " + r.getClass().getSimpleName());
  System.out.print(" no gc <");
  for (int k = 0; k < N; k++) {
    long t2 = System.nanoTime();
    r.initFromPage(result.length, ByteBufferInputStream.wrap(ByteBuffer.wrap(bytes)));
    for (int i = 0; i < result.length; i++) {
      result[i] = r.readInteger();
    }
    long t3 = System.nanoTime();
    t += t3 - t2;
  }
  long micros = t / 1000;
  // Widen before multiplying so a large array cannot overflow int, and clamp the divisor so a
  // total below one microsecond does not cause a division by zero.
  long valuesRead = (long) N * result.length;
  long valuesPerMicro = valuesRead / Math.max(1L, micros);
  System.out.println("> read in " + micros + "µs " + valuesPerMicro + " values per µs");
  verify(result);
  return t;
}
 
Example #2
Source File: TestDictionary.java    From parquet-mr with Apache License 2.0 6 votes vote down vote up
/**
 * Round-trips floats through a dictionary writer that falls back to plain encoding, then
 * resets the page and the dictionary and round-trips again.
 */
@Test
public void testFloatDictionaryFallBack() throws IOException {
  final int initialSlabSize = 100;
  final int dictionaryByteLimit = 50;
  final FallbackValuesWriter<PlainFloatDictionaryValuesWriter, PlainValuesWriter> writer =
      newPlainFloatDictionaryValuesWriter(dictionaryByteLimit, initialSlabSize);

  // The writer fell back to plain encoding, so a plain reader is used to read the data back
  final ValuesReader plainReader = new PlainValuesReader.FloatPlainValuesReader();
  roundTripFloat(writer, plainReader, dictionaryByteLimit);

  // simulate cutting the page
  writer.reset();
  assertEquals(0, writer.getBufferedSize());
  writer.resetDictionary();

  roundTripFloat(writer, plainReader, dictionaryByteLimit);
}
 
Example #3
Source File: TestDictionary.java    From parquet-mr with Apache License 2.0 6 votes vote down vote up
/**
 * Round-trips integers through a dictionary writer that falls back to plain encoding, then
 * resets the page and the dictionary and round-trips again.
 */
@Test
public void testIntDictionaryFallBack() throws IOException {
  final int initialSlabSize = 100;
  final int dictionaryByteLimit = 50;
  final FallbackValuesWriter<PlainIntegerDictionaryValuesWriter, PlainValuesWriter> writer =
      newPlainIntegerDictionaryValuesWriter(dictionaryByteLimit, initialSlabSize);

  // The writer fell back to plain encoding, so a plain reader is used to read the data back
  final ValuesReader plainReader = new PlainValuesReader.IntegerPlainValuesReader();
  roundTripInt(writer, plainReader, dictionaryByteLimit);

  // simulate cutting the page
  writer.reset();
  assertEquals(0, writer.getBufferedSize());
  writer.resetDictionary();

  roundTripInt(writer, plainReader, dictionaryByteLimit);
}
 
Example #4
Source File: TestDictionary.java    From parquet-mr with Apache License 2.0 6 votes vote down vote up
/**
 * Round-trips doubles through a dictionary writer that falls back to plain encoding, then
 * resets the page and the dictionary and round-trips again.
 */
@Test
public void testDoubleDictionaryFallBack() throws IOException {
  final int initialSlabSize = 100;
  final int dictionaryByteLimit = 50;
  final FallbackValuesWriter<PlainDoubleDictionaryValuesWriter, PlainValuesWriter> writer =
      newPlainDoubleDictionaryValuesWriter(dictionaryByteLimit, initialSlabSize);

  // The writer fell back to plain encoding, so a plain reader is used to read the data back
  final ValuesReader plainReader = new PlainValuesReader.DoublePlainValuesReader();
  roundTripDouble(writer, plainReader, dictionaryByteLimit);

  // simulate cutting the page
  writer.reset();
  assertEquals(0, writer.getBufferedSize());
  writer.resetDictionary();

  roundTripDouble(writer, plainReader, dictionaryByteLimit);
}
 
Example #5
Source File: TestDictionary.java    From parquet-mr with Apache License 2.0 6 votes vote down vote up
/**
 * Round-trips longs through a dictionary writer that falls back to plain encoding, then
 * resets the page and the dictionary and round-trips again.
 */
@Test
public void testLongDictionaryFallBack() throws IOException {
  final int initialSlabSize = 100;
  final int dictionaryByteLimit = 50;
  final FallbackValuesWriter<PlainLongDictionaryValuesWriter, PlainValuesWriter> writer =
      newPlainLongDictionaryValuesWriter(dictionaryByteLimit, initialSlabSize);

  // The writer fell back to plain encoding, so a plain reader is used to read the data back
  final ValuesReader plainReader = new PlainValuesReader.LongPlainValuesReader();
  roundTripLong(writer, plainReader, dictionaryByteLimit);

  // simulate cutting the page
  writer.reset();
  assertEquals(0, writer.getBufferedSize());
  writer.resetDictionary();

  roundTripLong(writer, plainReader, dictionaryByteLimit);
}
 
Example #6
Source File: TestDictionary.java    From parquet-mr with Apache License 2.0 6 votes vote down vote up
/**
 * Writes three pages through one binary dictionary writer: the first is expected to be
 * dictionary encoded, the second and third plain. Each page is then read back and checked.
 */
@Test
public void testSecondPageFallBack() throws IOException {
  final int valuesPerPage = 1000;
  ValuesWriter writer = newPlainBinaryDictionaryValuesWriter(1000, 10000);

  writeRepeated(valuesPerPage, writer, "a");
  BytesInput dictionaryPage = getBytesAndCheckEncoding(writer, PLAIN_DICTIONARY);

  // distinct values are not efficient for a dictionary, so the writer falls back
  writeDistinct(valuesPerPage, writer, "b");
  BytesInput fallbackPage = getBytesAndCheckEncoding(writer, PLAIN);

  // still plain because the writer fell back on the previous page
  writeRepeated(valuesPerPage, writer, "a");
  BytesInput afterFallbackPage = getBytesAndCheckEncoding(writer, PLAIN);

  ValuesReader dictionaryReader = initDicReader(writer, BINARY);
  checkRepeated(valuesPerPage, dictionaryPage, dictionaryReader, "a");

  ValuesReader plainReader = new BinaryPlainValuesReader();
  checkDistinct(valuesPerPage, fallbackPage, plainReader, "b");
  checkRepeated(valuesPerPage, afterFallbackPage, plainReader, "a");
}
 
Example #7
Source File: TestDictionary.java    From parquet-mr with Apache License 2.0 6 votes vote down vote up
/**
 * Writes two pages through one binary dictionary writer, both expected to be plain encoded,
 * and reads them back with a plain reader.
 */
@Test
public void testFirstPageFallBack() throws IOException {
  final int valuesPerPage = 1000;
  ValuesWriter writer = newPlainBinaryDictionaryValuesWriter(10000, 10000);

  // distinct values are not efficient for a dictionary, so the writer falls back
  writeDistinct(valuesPerPage, writer, "a");
  BytesInput firstPage = getBytesAndCheckEncoding(writer, PLAIN);

  // still plain because the writer fell back on the first page
  writeRepeated(valuesPerPage, writer, "b");
  BytesInput secondPage = getBytesAndCheckEncoding(writer, PLAIN);

  ValuesReader plainReader = new BinaryPlainValuesReader();
  checkDistinct(valuesPerPage, firstPage, plainReader, "a");
  checkRepeated(valuesPerPage, secondPage, plainReader, "b");
}
 
Example #8
Source File: ColumnReaderBase.java    From parquet-mr with Apache License 2.0 6 votes vote down vote up
/**
 * Sets up the repetition-level, definition-level and data readers for a v1 data page. All
 * three are initialized, in that order, from a single input stream over the page bytes.
 *
 * @param page the v1 data page to read
 * @throws ParquetDecodingException if initializing a reader raises an {@link IOException}
 */
private void readPageV1(DataPageV1 page) {
  ValuesReader repetitionReader = page.getRlEncoding().getValuesReader(path, REPETITION_LEVEL);
  ValuesReader definitionReader = page.getDlEncoding().getValuesReader(path, DEFINITION_LEVEL);
  this.repetitionLevelColumn = new ValuesReaderIntIterator(repetitionReader);
  this.definitionLevelColumn = new ValuesReaderIntIterator(definitionReader);
  final int valuesInPage = page.getValueCount();
  try {
    final BytesInput pageBytes = page.getBytes();
    LOG.debug("page size {} bytes and {} values", pageBytes.size(), valuesInPage);

    LOG.debug("reading repetition levels at 0");
    final ByteBufferInputStream stream = pageBytes.toInputStream();
    repetitionReader.initFromPage(valuesInPage, stream);

    LOG.debug("reading definition levels at {}", stream.position());
    definitionReader.initFromPage(valuesInPage, stream);

    LOG.debug("reading data at {}", stream.position());
    initDataReader(page.getValueEncoding(), stream, valuesInPage);
  } catch (IOException ioe) {
    throw new ParquetDecodingException("could not read page " + page + " in col " + path, ioe);
  }
  newPageInitialized(page);
}
 
Example #9
Source File: AbstractColumnReader.java    From flink with Apache License 2.0 6 votes vote down vote up
/**
 * Sets up the decoders for a v1 data page: a repetition-level reader, a run-length decoder for
 * the definition levels, and then the value reader via {@code prepareNewPage}.
 *
 * @param page the v1 data page to read
 * @throws IOException if the page cannot be read; the underlying exception is kept as the cause
 * @throws UnsupportedOperationException if the definition levels are not RLE encoded while the
 *         column's maximum definition level is non-zero
 */
private void readPageV1(DataPageV1 page) throws IOException {
	this.pageValueCount = page.getValueCount();
	ValuesReader repetitionReader = page.getRlEncoding().getValuesReader(descriptor, REPETITION_LEVEL);

	// Initialize the decoders.
	if (page.getDlEncoding() != Encoding.RLE && descriptor.getMaxDefinitionLevel() != 0) {
		throw new UnsupportedOperationException("Unsupported encoding: " + page.getDlEncoding());
	}
	final int definitionBitWidth = BytesUtils.getWidthFromMaxInt(descriptor.getMaxDefinitionLevel());
	this.runLenDecoder = new RunLengthDecoder(definitionBitWidth);
	try {
		ByteBufferInputStream stream = page.getBytes().toInputStream();
		repetitionReader.initFromPage(pageValueCount, stream);
		this.runLenDecoder.initFromStream(pageValueCount, stream);
		prepareNewPage(page.getValueEncoding(), stream);
	} catch (IOException cause) {
		throw new IOException("could not read page " + page + " in col " + descriptor, cause);
	}
}
 
Example #10
Source File: PageIterator.java    From iceberg with Apache License 2.0 6 votes vote down vote up
/**
 * Initializes the repetition-level, definition-level and data readers from a v1 data page,
 * consuming a single input stream over the page bytes in that order.
 *
 * @param page the v1 data page to read
 * @throws ParquetDecodingException if initializing a reader raises an {@link IOException}
 */
private void initFromPage(DataPageV1 page) {
  this.triplesCount = page.getValueCount();
  ValuesReader repetitionReader = page.getRlEncoding().getValuesReader(desc, REPETITION_LEVEL);
  ValuesReader definitionReader = page.getDlEncoding().getValuesReader(desc, DEFINITION_LEVEL);
  this.repetitionLevels = new ValuesReaderIntIterator(repetitionReader);
  this.definitionLevels = new ValuesReaderIntIterator(definitionReader);
  try {
    final BytesInput pageBytes = page.getBytes();
    LOG.debug("page size {} bytes and {} records", pageBytes.size(), triplesCount);

    LOG.debug("reading repetition levels at 0");
    final ByteBufferInputStream stream = pageBytes.toInputStream();
    repetitionReader.initFromPage(triplesCount, stream);

    LOG.debug("reading definition levels at {}", stream.position());
    definitionReader.initFromPage(triplesCount, stream);

    LOG.debug("reading data at {}", stream.position());
    initDataReader(page.getValueEncoding(), stream, page.getValueCount());
  } catch (IOException ioe) {
    throw new ParquetDecodingException("could not read page " + page + " in col " + desc, ioe);
  }
}
 
Example #11
Source File: PageIterator.java    From iceberg with Apache License 2.0 6 votes vote down vote up
/**
 * Makes {@code page} the current page, initializes the readers for its page version, resets
 * the count of triples read and calls {@code advance()}.
 *
 * @param page the data page to read from; must not be null
 */
public void setPage(DataPage page) {
  Preconditions.checkNotNull(page, "Cannot read from null page");
  this.page = page;

  // The visitor is used only for dispatch on the page version; its return value is unused.
  final DataPage.Visitor<ValuesReader> versionDispatcher = new DataPage.Visitor<ValuesReader>() {
    @Override
    public ValuesReader visit(DataPageV1 v1Page) {
      initFromPage(v1Page);
      return null;
    }

    @Override
    public ValuesReader visit(DataPageV2 v2Page) {
      initFromPage(v2Page);
      return null;
    }
  };
  this.page.accept(versionDispatcher);

  this.triplesRead = 0;
  advance();
}
 
Example #12
Source File: BasePageIterator.java    From iceberg with Apache License 2.0 6 votes vote down vote up
/**
 * Makes {@code page} the current page, initializes the readers for its page version, resets
 * the count of triples read and recomputes {@code hasNext}.
 *
 * @param page the data page to read from; must not be null
 */
public void setPage(DataPage page) {
  Preconditions.checkNotNull(page, "Cannot read from null page");
  this.page = page;

  // The visitor is used only for dispatch on the page version; its return value is unused.
  final DataPage.Visitor<ValuesReader> versionDispatcher = new DataPage.Visitor<ValuesReader>() {
    @Override
    public ValuesReader visit(DataPageV1 v1Page) {
      initFromPage(v1Page);
      return null;
    }

    @Override
    public ValuesReader visit(DataPageV2 v2Page) {
      initFromPage(v2Page);
      return null;
    }
  };
  this.page.accept(versionDispatcher);

  this.triplesRead = 0;
  this.hasNext = triplesRead < triplesCount;
}
 
Example #13
Source File: PrimitiveColumnReader.java    From presto with Apache License 2.0 6 votes vote down vote up
/**
 * Initializes the repetition and definition level readers from a v1 data page and returns the
 * reader for the page's values. All three read from one input stream, in that order.
 *
 * @param page the v1 data page to read
 * @return the initialized reader for the page's values
 * @throws ParquetDecodingException if initializing a level reader raises an {@link IOException}
 */
private ValuesReader readPageV1(DataPageV1 page)
{
    ValuesReader repetitionLevels = page.getRepetitionLevelEncoding().getValuesReader(columnDescriptor, REPETITION_LEVEL);
    ValuesReader definitionLevels = page.getDefinitionLevelEncoding().getValuesReader(columnDescriptor, DEFINITION_LEVEL);
    repetitionReader = new LevelValuesReader(repetitionLevels);
    definitionReader = new LevelValuesReader(definitionLevels);
    try {
        ByteBufferInputStream stream = toInputStream(page.getSlice());
        int valueCount = page.getValueCount();
        repetitionLevels.initFromPage(valueCount, stream);
        definitionLevels.initFromPage(valueCount, stream);
        return initDataReader(page.getValueEncoding(), valueCount, stream);
    }
    catch (IOException ioe) {
        throw new ParquetDecodingException("Error reading parquet page " + page + " in column " + columnDescriptor, ioe);
    }
}
 
Example #14
Source File: TestDeltaByteArray.java    From parquet-mr with Apache License 2.0 6 votes vote down vote up
/**
 * Writes {@code values} with a delta byte array writer and reads the output back as two
 * consecutive runs of delta-binary-packed ints, checking the first three of each run
 * (prefix lengths, then suffix lengths).
 */
@Test
public void testLengths() throws IOException {
  final int bufferSize = 64 * 1024;
  DeltaByteArrayWriter writer = new DeltaByteArrayWriter(bufferSize, bufferSize, new DirectByteBufferAllocator());
  ValuesReader prefixReader = new DeltaBinaryPackingValuesReader();

  Utils.writeData(writer, values);
  ByteBufferInputStream encoded = writer.getBytes().toInputStream();

  // test prefix lengths
  int[] prefixLengths = Utils.readInts(prefixReader, encoded, values.length);
  Assert.assertEquals(0, prefixLengths[0]);
  Assert.assertEquals(7, prefixLengths[1]);
  Assert.assertEquals(7, prefixLengths[2]);

  // test suffix lengths, read from the same stream with a fresh reader
  ValuesReader suffixReader = new DeltaBinaryPackingValuesReader();
  int[] suffixLengths = Utils.readInts(suffixReader, encoded, values.length);
  Assert.assertEquals(10, suffixLengths[0]);
  Assert.assertEquals(0, suffixLengths[1]);
  Assert.assertEquals(7, suffixLengths[2]);
}
 
Example #15
Source File: BasePageIterator.java    From iceberg with Apache License 2.0 6 votes vote down vote up
/**
 * Initializes the repetition-level reader, then the definition-level reader (via
 * {@code initDefinitionLevelsReader}), then the data reader from a v1 data page. All of them
 * consume one input stream over the page bytes.
 *
 * @param initPage the v1 data page to read
 * @throws ParquetDecodingException if initialization raises an {@link IOException}
 */
protected void initFromPage(DataPageV1 initPage) {
  this.triplesCount = initPage.getValueCount();
  ValuesReader repetitionReader =
      initPage.getRlEncoding().getValuesReader(desc, ValuesType.REPETITION_LEVEL);
  this.repetitionLevels = new ValuesReaderIntIterator(repetitionReader);
  try {
    final BytesInput pageBytes = initPage.getBytes();
    LOG.debug("page size {} bytes and {} records", pageBytes.size(), triplesCount);

    LOG.debug("reading repetition levels at 0");
    final ByteBufferInputStream stream = pageBytes.toInputStream();
    repetitionReader.initFromPage(triplesCount, stream);

    LOG.debug("reading definition levels at {}", stream.position());
    initDefinitionLevelsReader(initPage, desc, stream, triplesCount);

    LOG.debug("reading data at {}", stream.position());
    initDataReader(initPage.getValueEncoding(), stream, initPage.getValueCount());
  } catch (IOException ioe) {
    throw new ParquetDecodingException("could not read page " + initPage + " in col " + desc, ioe);
  }
}
 
Example #16
Source File: PrimitiveColumnReader.java    From presto with Apache License 2.0 6 votes vote down vote up
/**
 * Creates the values reader for the given encoding and initializes it from the stream.
 *
 * @param dataEncoding encoding of the page's values
 * @param valueCount number of values in the page
 * @param in stream positioned at the start of the page's values
 * @return the initialized values reader
 * @throws ParquetDecodingException if the encoding uses a dictionary but none is set, or if
 *         initializing the reader raises an {@link IOException}
 */
private ValuesReader initDataReader(ParquetEncoding dataEncoding, int valueCount, ByteBufferInputStream in)
{
    ValuesReader reader;
    if (!dataEncoding.usesDictionary()) {
        reader = dataEncoding.getValuesReader(columnDescriptor, VALUES);
    }
    else if (dictionary != null) {
        reader = dataEncoding.getDictionaryBasedValuesReader(columnDescriptor, VALUES, dictionary);
    }
    else {
        throw new ParquetDecodingException("Dictionary is missing for Page");
    }

    try {
        reader.initFromPage(valueCount, in);
    }
    catch (IOException ioe) {
        throw new ParquetDecodingException("Error reading parquet page in column " + columnDescriptor, ioe);
    }
    return reader;
}
 
Example #17
Source File: ColumnReaderBase.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Installs and initializes the data reader for a new page: records the encoding and value
 * counts, creates the reader, binds the converter, initializes the reader from the stream and,
 * where sequential reads are required, hands it the reader of the previous page.
 *
 * @param dataEncoding encoding of the page's values
 * @param in stream positioned at the start of the page's values
 * @param valueCount number of values in the page
 * @throws ParquetDecodingException if the encoding uses a dictionary but none is set, or if
 *         initializing the reader raises an {@link IOException}
 */
private void initDataReader(Encoding dataEncoding, ByteBufferInputStream in, int valueCount) {
  // Captured before dataColumn is replaced below.
  final ValuesReader readerOfPreviousPage = this.dataColumn;

  this.currentEncoding = dataEncoding;
  this.pageValueCount = valueCount;
  this.endOfPageValueCount = readValues + pageValueCount;

  final boolean dictionaryEncoded = dataEncoding.usesDictionary();
  if (!dictionaryEncoded) {
    this.dataColumn = dataEncoding.getValuesReader(path, VALUES);
  } else if (dictionary != null) {
    this.dataColumn = dataEncoding.getDictionaryBasedValuesReader(path, VALUES, dictionary);
  } else {
    throw new ParquetDecodingException(
        "could not read page in col " + path + " as the dictionary was missing for encoding " + dataEncoding);
  }

  if (dictionaryEncoded && converter.hasDictionarySupport()) {
    bindToDictionary(dictionary);
  } else {
    bind(path.getType());
  }

  try {
    dataColumn.initFromPage(pageValueCount, in);
  } catch (IOException ioe) {
    throw new ParquetDecodingException("could not read page in col " + path, ioe);
  }

  // previous reader can only be set if reading sequentially; instanceof is false for null
  if (CorruptDeltaByteArrays.requiresSequentialReads(writerVersion, dataEncoding)
      && readerOfPreviousPage instanceof RequiresPreviousReader) {
    ((RequiresPreviousReader) dataColumn).setPreviousReader(readerOfPreviousPage);
  }
}
 
Example #18
Source File: TestDictionary.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Writes the integers 0..99 through {@code cw}, checking the reported encoding after every
 * write, then reads the bytes back with {@code reader} three times: sequentially, skipping
 * every other value, and skipping several values at a time.
 *
 * @param cw the fallback writer under test
 * @param reader reader used to decode the bytes produced by {@code cw}
 * @param maxDictionaryByteSize dictionary size limit; divided by 4 (presumably the byte width
 *        of an int) to get the number of values expected before the fallback
 * @throws IOException propagated from the writer and reader calls
 */
private void roundTripInt(FallbackValuesWriter<PlainIntegerDictionaryValuesWriter, PlainValuesWriter> cw,  ValuesReader reader, int maxDictionaryByteSize) throws IOException {
  int fallBackThreshold = maxDictionaryByteSize / 4;
  for (int i = 0; i < 100; i++) {
    cw.writeInteger(i);
    // Expected value goes first so a failure message reports expected/actual correctly.
    if (i < fallBackThreshold) {
      assertEquals(PLAIN_DICTIONARY, cw.getEncoding());
    } else {
      assertEquals(PLAIN, cw.getEncoding());
    }
  }

  reader.initFromPage(100, cw.getBytes().toInputStream());

  for (int i = 0; i < 100; i++) {
    assertEquals(i, reader.readInteger());
  }

  // Test skip with plain encoding
  reader.initFromPage(100, cw.getBytes().toInputStream());
  for (int i = 0; i < 100; i += 2) {
    assertEquals(i, reader.readInteger());
    reader.skip();
  }

  // Test skip-n with plain encoding
  reader.initFromPage(100, cw.getBytes().toInputStream());
  int skipCount;
  for (int i = 0; i < 100; i += skipCount + 1) {
    // skip half of what remains, so the stride shrinks toward the end of the page
    skipCount = (100 - i) / 2;
    assertEquals(i, reader.readInteger());
    reader.skip(skipCount);
  }
}
 
Example #19
Source File: TestDictionary.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Writes the doubles 0..99 through {@code cw}, checking the reported encoding after every
 * write, then reads the bytes back with {@code reader} three times: sequentially, skipping
 * every other value, and skipping several values at a time.
 *
 * @param cw the fallback writer under test
 * @param reader reader used to decode the bytes produced by {@code cw}
 * @param maxDictionaryByteSize dictionary size limit; divided by 8 (presumably the byte width
 *        of a double) to get the number of values expected before the fallback
 * @throws IOException propagated from the writer and reader calls
 */
private void roundTripDouble(FallbackValuesWriter<PlainDoubleDictionaryValuesWriter, PlainValuesWriter> cw,  ValuesReader reader, int maxDictionaryByteSize) throws IOException {
  int fallBackThreshold = maxDictionaryByteSize / 8;
  for (double i = 0; i < 100; i++) {
    cw.writeDouble(i);
    // Expected value goes first so a failure message reports expected/actual correctly.
    if (i < fallBackThreshold) {
      assertEquals(PLAIN_DICTIONARY, cw.getEncoding());
    } else {
      assertEquals(PLAIN, cw.getEncoding());
    }
  }

  reader.initFromPage(100, cw.getBytes().toInputStream());

  for (double i = 0; i < 100; i++) {
    assertEquals(i, reader.readDouble(), 0.00001);
  }

  // Test skip with plain encoding
  reader.initFromPage(100, cw.getBytes().toInputStream());
  for (int i = 0; i < 100; i += 2) {
    assertEquals(i, reader.readDouble(), 0.0);
    reader.skip();
  }

  // Test skip-n with plain encoding
  reader.initFromPage(100, cw.getBytes().toInputStream());
  int skipCount;
  for (int i = 0; i < 100; i += skipCount + 1) {
    // skip half of what remains, so the stride shrinks toward the end of the page
    skipCount = (100 - i) / 2;
    assertEquals(i, reader.readDouble(), 0.0);
    reader.skip(skipCount);
  }
}
 
Example #20
Source File: TestBitPackingColumn.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * For every packing type, encodes {@code vals}, compares the encoded bytes with
 * {@code expected}, decodes them back and compares with {@code vals}, and then re-reads the
 * bytes with single skips and with multi-value skips.
 *
 * @param bitLength bit width used to derive the writer/reader bound
 * @param vals values to encode
 * @param expected expected string form of the encoded bytes, as rendered by
 *        {@code TestBitPacking.toString}
 * @throws IOException propagated from the writer and reader calls
 */
private void validateEncodeDecode(int bitLength, int[] vals, String expected) throws IOException {
  for (PACKING_TYPE type : PACKING_TYPE.values()) {
    LOG.debug("{}", type);
    final int bound = (int)Math.pow(2, bitLength) - 1;
    ValuesWriter w = type.getWriter(bound);
    for (int i : vals) {
      w.writeInteger(i);
    }
    byte[] bytes = w.getBytes().toByteArray();
    // Parameterized like the neighbouring log calls instead of concatenating the message.
    LOG.debug("vals ({}): {}", bitLength, TestBitPacking.toString(vals));
    LOG.debug("bytes: {}", TestBitPacking.toString(bytes));
    assertEquals(type.toString(), expected, TestBitPacking.toString(bytes));
    ValuesReader r = type.getReader(bound);
    r.initFromPage(vals.length, ByteBufferInputStream.wrap(ByteBuffer.wrap(bytes)));
    int[] result = new int[vals.length];
    for (int i = 0; i < result.length; i++) {
      result[i] = r.readInteger();
    }
    LOG.debug("result: {}", TestBitPacking.toString(result));
    assertArrayEquals(type + " result: " + TestBitPacking.toString(result), vals, result);

    // Test skipping
    r.initFromPage(vals.length, ByteBufferInputStream.wrap(ByteBuffer.wrap(bytes)));
    for (int i = 0; i < vals.length; i += 2) {
      assertEquals(vals[i], r.readInteger());
      r.skip();
    }

    // Test n-skipping
    r.initFromPage(vals.length, ByteBufferInputStream.wrap(ByteBuffer.wrap(bytes)));
    int skipCount;
    for (int i = 0; i < vals.length; i += skipCount + 1) {
      // skip half of what remains, so the stride shrinks toward the end of the page
      skipCount = (vals.length - i) / 2;
      assertEquals(vals[i], r.readInteger());
      r.skip(skipCount);
    }
  }
}
 
Example #21
Source File: TestDictionary.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Writes the longs 0..99 through {@code cw}, checking the reported encoding after every write,
 * then reads the bytes back with {@code reader} three times: sequentially, skipping every
 * other value, and skipping several values at a time.
 *
 * @param cw the fallback writer under test
 * @param reader reader used to decode the bytes produced by {@code cw}
 * @param maxDictionaryByteSize dictionary size limit; divided by 8 (presumably the byte width
 *        of a long) to get the number of values expected before the fallback
 * @throws IOException propagated from the writer and reader calls
 */
private void roundTripLong(FallbackValuesWriter<PlainLongDictionaryValuesWriter, PlainValuesWriter> cw,  ValuesReader reader, int maxDictionaryByteSize) throws IOException {
  int fallBackThreshold = maxDictionaryByteSize / 8;
  for (long i = 0; i < 100; i++) {
    cw.writeLong(i);
    // Expected value goes first so a failure message reports expected/actual correctly.
    if (i < fallBackThreshold) {
      assertEquals(PLAIN_DICTIONARY, cw.getEncoding());
    } else {
      assertEquals(PLAIN, cw.getEncoding());
    }
  }

  reader.initFromPage(100, cw.getBytes().toInputStream());

  for (long i = 0; i < 100; i++) {
    assertEquals(i, reader.readLong());
  }

  // Test skip with plain encoding
  reader.initFromPage(100, cw.getBytes().toInputStream());
  for (int i = 0; i < 100; i += 2) {
    assertEquals(i, reader.readLong());
    reader.skip();
  }

  // Test skip-n with plain encoding
  reader.initFromPage(100, cw.getBytes().toInputStream());
  int skipCount;
  for (int i = 0; i < 100; i += skipCount + 1) {
    // skip half of what remains, so the stride shrinks toward the end of the page
    skipCount = (100 - i) / 2;
    assertEquals(i, reader.readLong());
    reader.skip(skipCount);
  }
}
 
Example #22
Source File: TestDictionary.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Writes 100 distinct strings through a binary dictionary writer, checking that the reported
 * encoding switches from dictionary to plain once the accumulated data size reaches the
 * dictionary limit, then reads everything back with a plain reader.
 */
@Test
public void testBinaryDictionaryFallBack() throws IOException {
  final int initialSlabSize = 100;
  final int dictionaryByteLimit = 50;
  final ValuesWriter writer = newPlainBinaryDictionaryValuesWriter(dictionaryByteLimit, initialSlabSize);
  final int fallBackThreshold = dictionaryByteLimit;

  int writtenSize = 0;
  for (long i = 0; i < 100; i++) {
    Binary value = Binary.fromString("str" + i);
    writer.writeBytes(value);
    writtenSize += (value.length() + 4);
    if (writtenSize >= fallBackThreshold) {
      assertEquals(PLAIN, writer.getEncoding());
    } else {
      assertEquals(PLAIN_DICTIONARY, writer.getEncoding());
    }
  }

  // The writer fell back to plain encoding, so a plain reader is used to read the data back
  ValuesReader plainReader = new BinaryPlainValuesReader();
  plainReader.initFromPage(100, writer.getBytes().toInputStream());

  for (long i = 0; i < 100; i++) {
    assertEquals(Binary.fromString("str" + i), plainReader.readBytes());
  }

  // simulate cutting the page
  writer.reset();
  assertEquals(0, writer.getBufferedSize());
}
 
Example #23
Source File: TestDictionary.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Exercises {@code skip()} and {@code skip(n)} on binary values, first while the writer is
 * still dictionary encoding and then again after it has fallen back to plain encoding.
 */
@Test
public void testSkipInBinaryDictionary() throws Exception {
  ValuesWriter writer = newPlainBinaryDictionaryValuesWriter(1000, 10000);
  writeRepeated(100, writer, "a");
  writeDistinct(100, writer, "b");
  assertEquals(PLAIN_DICTIONARY, writer.getEncoding());

  // Test skip and skip-n with dictionary encoding
  ByteBufferInputStream dictionaryEncoded = writer.getBytes().toInputStream();
  DictionaryValuesReader dictionaryReader = initDicReader(writer, BINARY);
  dictionaryReader.initFromPage(200, dictionaryEncoded);
  for (int i = 0; i < 100; i += 2) {
    assertEquals(Binary.fromString("a" + i % 10), dictionaryReader.readBytes());
    dictionaryReader.skip();
  }
  int skipCount;
  for (int i = 0; i < 100; i += skipCount + 1) {
    skipCount = (100 - i) / 2;
    assertEquals(Binary.fromString("b" + i), dictionaryReader.readBytes());
    dictionaryReader.skip(skipCount);
  }

  // Ensure fallback
  writeDistinct(1000, writer, "c");
  assertEquals(PLAIN, writer.getEncoding());

  // Test skip and skip-n with plain encoding (after fallback)
  ValuesReader plainReader = new BinaryPlainValuesReader();
  plainReader.initFromPage(1200, writer.getBytes().toInputStream());
  // jump over the 200 "a"/"b" values written before the fallback
  plainReader.skip(200);
  for (int i = 0; i < 100; i += 2) {
    assertEquals("c" + i, plainReader.readBytes().toStringUsingUTF8());
    plainReader.skip();
  }
  for (int i = 100; i < 1000; i += skipCount + 1) {
    skipCount = (1000 - i) / 2;
    assertEquals(Binary.fromString("c" + i), plainReader.readBytes());
    plainReader.skip(skipCount);
  }
}
 
Example #24
Source File: TestDictionary.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Writes the floats 0..99 through {@code cw}, checking the reported encoding after every
 * write, then reads the bytes back with {@code reader} three times: sequentially, skipping
 * every other value, and skipping several values at a time.
 *
 * @param cw the fallback writer under test
 * @param reader reader used to decode the bytes produced by {@code cw}
 * @param maxDictionaryByteSize dictionary size limit; divided by 4 (presumably the byte width
 *        of a float) to get the number of values expected before the fallback
 * @throws IOException propagated from the writer and reader calls
 */
private void roundTripFloat(FallbackValuesWriter<PlainFloatDictionaryValuesWriter, PlainValuesWriter> cw,  ValuesReader reader, int maxDictionaryByteSize) throws IOException {
  int fallBackThreshold = maxDictionaryByteSize / 4;
  for (float i = 0; i < 100; i++) {
    cw.writeFloat(i);
    // Expected value goes first so a failure message reports expected/actual correctly.
    if (i < fallBackThreshold) {
      assertEquals(PLAIN_DICTIONARY, cw.getEncoding());
    } else {
      assertEquals(PLAIN, cw.getEncoding());
    }
  }

  reader.initFromPage(100, cw.getBytes().toInputStream());

  for (float i = 0; i < 100; i++) {
    assertEquals(i, reader.readFloat(), 0.00001);
  }

  // Test skip with plain encoding
  reader.initFromPage(100, cw.getBytes().toInputStream());
  for (int i = 0; i < 100; i += 2) {
    assertEquals(i, reader.readFloat(), 0.0f);
    reader.skip();
  }

  // Test skip-n with plain encoding
  reader.initFromPage(100, cw.getBytes().toInputStream());
  int skipCount;
  for (int i = 0; i < 100; i += skipCount + 1) {
    // skip half of what remains, so the stride shrinks toward the end of the page
    skipCount = (100 - i) / 2;
    assertEquals(i, reader.readFloat(), 0.0f);
    reader.skip(skipCount);
  }
}
 
Example #25
Source File: TestDeltaLengthByteArray.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Writes {@code values} with a delta-length byte array writer and checks that the ints read
 * back by a delta binary packing reader equal the length of each value.
 */
@Test
public void testLengths() throws IOException {
  DeltaLengthByteArrayValuesWriter writer = getDeltaLengthByteArrayValuesWriter();
  ValuesReader lengthReader = new DeltaBinaryPackingValuesReader();

  Utils.writeData(writer, values);
  int[] lengths = Utils.readInts(lengthReader, writer.getBytes().toInputStream(), values.length);

  int index = 0;
  while (index < lengths.length) {
    Assert.assertEquals(values[index].length(), lengths[index]);
    index++;
  }
}
 
Example #26
Source File: VectorizedPageIterator.java    From iceberg with Apache License 2.0 5 votes vote down vote up
/**
 * Initializes the reader for a page's values. Dictionary-encoded pages get a vectorized
 * dictionary reader and an eager or lazy decode mode; all other pages get a plain
 * {@code ValuesAsBytesReader} and decode mode {@code NONE}.
 *
 * @param dataEncoding encoding of the page's values
 * @param in stream positioned at the start of the page's values
 * @param valueCount number of values in the page
 * @throws ParquetDecodingException if the encoding uses a dictionary but none is set, or if
 *         initializing the dictionary reader raises an {@link IOException}
 */
@Override
protected void initDataReader(Encoding dataEncoding, ByteBufferInputStream in, int valueCount) {
  // Captured before plainValuesReader may be replaced below.
  final ValuesReader readerBeforeThisPage = plainValuesReader;
  if (!dataEncoding.usesDictionary()) {
    plainValuesReader = new ValuesAsBytesReader();
    plainValuesReader.initFromPage(valueCount, in);
    dictionaryDecodeMode = DictionaryDecodeMode.NONE;
  } else {
    if (dictionary == null) {
      throw new ParquetDecodingException(
          "could not read page in col " + desc + " as the dictionary was missing for encoding " + dataEncoding);
    }
    try {
      dictionaryEncodedValuesReader =
          new VectorizedDictionaryEncodedParquetValuesReader(desc.getMaxDefinitionLevel(), setArrowValidityVector);
      dictionaryEncodedValuesReader.initFromPage(valueCount, in);
      final boolean decodeEagerly = ParquetUtil.isIntType(desc.getPrimitiveType()) || !allPagesDictEncoded;
      dictionaryDecodeMode = decodeEagerly ? DictionaryDecodeMode.EAGER : DictionaryDecodeMode.LAZY;
    } catch (IOException ioe) {
      throw new ParquetDecodingException("could not read page in col " + desc, ioe);
    }
  }

  // previous reader can only be set if reading sequentially; instanceof is false for null
  if (CorruptDeltaByteArrays.requiresSequentialReads(writerVersion, dataEncoding)
      && readerBeforeThisPage instanceof RequiresPreviousReader) {
    ((RequiresPreviousReader) plainValuesReader).setPreviousReader(readerBeforeThisPage);
  }
}
 
Example #27
Source File: PageIterator.java    From iceberg with Apache License 2.0 5 votes vote down vote up
/**
 * Creates the values reader for the given encoding, initializes it from the stream and, where
 * sequential reads are required, hands it the reader of the previous page.
 *
 * @param dataEncoding encoding of the page's values
 * @param in stream positioned at the start of the page's values
 * @param valueCount number of values in the page
 * @throws ParquetDecodingException if the encoding uses a dictionary but none is set, or if
 *         initializing the reader raises an {@link IOException}
 */
private void initDataReader(Encoding dataEncoding, ByteBufferInputStream in, int valueCount) {
    // Captured before values is replaced below.
    final ValuesReader readerOfPreviousPage = values;

    this.valueEncoding = dataEncoding;

    // TODO: May want to change this so that this class is not dictionary-aware.
    // For dictionary columns, this class could rely on wrappers to correctly handle dictionaries
    // This isn't currently possible because RLE must be read by getDictionaryBasedValuesReader
    if (!dataEncoding.usesDictionary()) {
      this.values = dataEncoding.getValuesReader(desc, VALUES);
    } else if (dict != null) {
      this.values = dataEncoding.getDictionaryBasedValuesReader(desc, VALUES, dict);
    } else {
      throw new ParquetDecodingException(
          "could not read page in col " + desc + " as the dictionary was missing for encoding " + dataEncoding);
    }

    // Converter binding (bindToDictionary / bind) is disabled in this class.

    try {
      values.initFromPage(valueCount, in);
    } catch (IOException ioe) {
      throw new ParquetDecodingException("could not read page in col " + desc, ioe);
    }

    // previous reader can only be set if reading sequentially; instanceof is false for null
    if (CorruptDeltaByteArrays.requiresSequentialReads(writerVersion, dataEncoding)
        && readerOfPreviousPage instanceof RequiresPreviousReader) {
      ((RequiresPreviousReader) values).setPreviousReader(readerOfPreviousPage);
    }
  }
 
Example #28
Source File: PageIterator.java    From iceberg with Apache License 2.0 5 votes vote down vote up
/**
 * Creates the definition-level reader for a v1 data page, exposes it through
 * {@code definitionLevels} and initializes it from the stream.
 *
 * @param dataPageV1 page whose definition-level encoding selects the reader
 * @param desc descriptor of the column being read
 * @param in stream positioned at the start of the definition levels
 * @param triplesCount number of values to initialize the reader for
 * @throws IOException propagated from the reader's initialization
 */
@Override
protected void initDefinitionLevelsReader(DataPageV1 dataPageV1, ColumnDescriptor desc, ByteBufferInputStream in,
                                          int triplesCount) throws IOException {
  ValuesReader definitionReader =
      dataPageV1.getDlEncoding().getValuesReader(desc, ValuesType.DEFINITION_LEVEL);
  this.definitionLevels = new ValuesReaderIntIterator(definitionReader);
  definitionReader.initFromPage(triplesCount, in);
}
 
Example #29
Source File: PageIterator.java    From iceberg with Apache License 2.0 5 votes vote down vote up
/**
 * Creates the values reader for the given encoding, initializes it from the stream and, where
 * sequential reads are required, hands it the reader of the previous page.
 *
 * @param dataEncoding encoding of the page's values
 * @param in stream positioned at the start of the page's values
 * @param valueCount number of values in the page
 * @throws ParquetDecodingException if the encoding uses a dictionary but none is set, or if
 *         initializing the reader raises an {@link IOException}
 */
@Override
  protected void initDataReader(Encoding dataEncoding, ByteBufferInputStream in, int valueCount) {
    // Captured before values is replaced below.
    final ValuesReader readerOfPreviousPage = values;

    this.valueEncoding = dataEncoding;

    // TODO: May want to change this so that this class is not dictionary-aware.
    // For dictionary columns, this class could rely on wrappers to correctly handle dictionaries
    // This isn't currently possible because RLE must be read by getDictionaryBasedValuesReader
    if (!dataEncoding.usesDictionary()) {
      this.values = dataEncoding.getValuesReader(desc, ValuesType.VALUES);
    } else if (dictionary != null) {
      this.values = dataEncoding.getDictionaryBasedValuesReader(desc, ValuesType.VALUES, dictionary);
    } else {
      throw new ParquetDecodingException(
          "could not read page in col " + desc + " as the dictionary was missing for encoding " + dataEncoding);
    }

    // Converter binding (bindToDictionary / bind) is disabled in this class.

    try {
      values.initFromPage(valueCount, in);
    } catch (IOException ioe) {
      throw new ParquetDecodingException("could not read page in col " + desc, ioe);
    }

    // previous reader can only be set if reading sequentially
    final boolean sequentialReadsRequired =
        CorruptDeltaByteArrays.requiresSequentialReads(writerVersion, dataEncoding);
    if (sequentialReadsRequired && readerOfPreviousPage instanceof RequiresPreviousReader) {
      ((RequiresPreviousReader) values).setPreviousReader(readerOfPreviousPage);
    }
  }
 
Example #30
Source File: BenchmarkReadingRandomIntegers.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Benchmark body: reads {@code rleBytes} ten times, each time with a fresh
 * {@code RunLengthBitPackingHybridValuesReader} created with the argument 32.
 */
@BenchmarkOptions(benchmarkRounds = 20, warmupRounds = 10)
@Test
public void readingRLE() throws IOException {
  final int passes = 10;
  for (int pass = 0; pass < passes; pass++) {
    ValuesReader rleReader = new RunLengthBitPackingHybridValuesReader(32);
    readData(rleReader, rleBytes);
  }
}