org.apache.parquet.bytes.BytesUtils Java Examples
The following examples show how to use
org.apache.parquet.bytes.BytesUtils.
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: TestParquetMetadataConverter.java From parquet-mr with Apache License 2.0 | 6 votes |
private void testIntegerStats(StatsHelper helper) { // make fake stats and verify the size check IntStatistics stats = new IntStatistics(); stats.incrementNumNulls(3004); int min = Integer.MIN_VALUE; int max = Integer.MAX_VALUE; stats.updateStats(min); stats.updateStats(max); org.apache.parquet.format.Statistics formatStats = helper.toParquetStatistics(stats); Assert.assertEquals("Min should match", min, BytesUtils.bytesToInt(formatStats.getMin())); Assert.assertEquals("Max should match", max, BytesUtils.bytesToInt(formatStats.getMax())); Assert.assertEquals("Num nulls should match", 3004, formatStats.getNull_count()); }
Example #2
Source File: RunLengthBitPackingHybridEncoder.java From parquet-mr with Apache License 2.0 | 6 votes |
private void writeRleRun() throws IOException { // we may have been working on a bit-packed-run // so close that run if it exists before writing this // rle-run endPreviousBitPackedRun(); // write the rle-header (lsb of 0 signifies a rle run) BytesUtils.writeUnsignedVarInt(repeatCount << 1, baos); // write the repeated-value BytesUtils.writeIntLittleEndianPaddedOnBitWidth(baos, previousValue, bitWidth); // reset the repeat count repeatCount = 0; // throw away all the buffered values, they were just repeats and they've been written numBufferedValues = 0; }
Example #3
Source File: TestRunLengthBitPackingHybridEncoder.java From parquet-mr with Apache License 2.0 | 6 votes |
@Test public void testBitPackingOnly() throws Exception { RunLengthBitPackingHybridEncoder encoder = getRunLengthBitPackingHybridEncoder(); for (int i = 0; i < 100; i++) { encoder.writeInt(i % 3); } ByteArrayInputStream is = new ByteArrayInputStream(encoder.toBytes().toByteArray()); // header = ((104/8) << 1) | 1 = 27 assertEquals(27, BytesUtils.readUnsignedVarInt(is)); List<Integer> values = unpack(3, 104, is); for (int i = 0; i < 100; i++) { assertEquals(i % 3, (int) values.get(i)); } // end of stream assertEquals(-1, is.read()); }
Example #4
Source File: TestByteBasedBitPackingEncoder.java From parquet-mr with Apache License 2.0 | 6 votes |
@Test public void testSlabBoundary() { for (int i = 0; i <= 32; i++) { final ByteBasedBitPackingEncoder encoder = new ByteBasedBitPackingEncoder(i, Packer.BIG_ENDIAN); // make sure to write through the progression of slabs final int totalValues = 191 * 1024 * 8 + 10; for (int j = 0; j < totalValues; j++) { try { encoder.writeInt(j); } catch (Exception e) { throw new RuntimeException(i + ": error writing " + j, e); } } assertEquals(BytesUtils.paddedByteCountFromBits(totalValues * i), encoder.getBufferSize()); assertEquals(i == 0 ? 1 : 9, encoder.getNumSlabs()); } }
Example #5
Source File: TestRunLengthBitPackingHybridEncoder.java From parquet-mr with Apache License 2.0 | 6 votes |
@Test public void testPaddingZerosOnUnfinishedBitPackedRuns() throws Exception { RunLengthBitPackingHybridEncoder encoder = getRunLengthBitPackingHybridEncoder(5, 5, 10); for (int i = 0; i < 9; i++) { encoder.writeInt(i+1); } ByteArrayInputStream is = new ByteArrayInputStream(encoder.toBytes().toByteArray()); // header = ((16/8) << 1) | 1 = 5 assertEquals(5, BytesUtils.readUnsignedVarInt(is)); List<Integer> values = unpack(5, 16, is); assertEquals(Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 0, 0, 0, 0, 0, 0), values); assertEquals(-1, is.read()); }
Example #6
Source File: TestRunLengthBitPackingHybridEncoder.java From parquet-mr with Apache License 2.0 | 6 votes |
@Test public void testRepeatedZeros() throws Exception { // previousValue is initialized to 0 // make sure that repeated 0s at the beginning // of the stream don't trip up the repeat count RunLengthBitPackingHybridEncoder encoder = getRunLengthBitPackingHybridEncoder(); for (int i = 0; i < 10; i++) { encoder.writeInt(0); } ByteArrayInputStream is = new ByteArrayInputStream(encoder.toBytes().toByteArray()); // header = 10 << 1 = 20 assertEquals(20, BytesUtils.readUnsignedVarInt(is)); // payload = 4 assertEquals(0, BytesUtils.readIntLittleEndianOnOneByte(is)); // end of stream assertEquals(-1, is.read()); }
Example #7
Source File: TestRunLengthBitPackingHybridEncoder.java From parquet-mr with Apache License 2.0 | 6 votes |
@Test public void testRLEOnly() throws Exception { RunLengthBitPackingHybridEncoder encoder = getRunLengthBitPackingHybridEncoder(); for (int i = 0; i < 100; i++) { encoder.writeInt(4); } for (int i = 0; i < 100; i++) { encoder.writeInt(5); } ByteArrayInputStream is = new ByteArrayInputStream(encoder.toBytes().toByteArray()); // header = 100 << 1 = 200 assertEquals(200, BytesUtils.readUnsignedVarInt(is)); // payload = 4 assertEquals(4, BytesUtils.readIntLittleEndianOnOneByte(is)); // header = 100 << 1 = 200 assertEquals(200, BytesUtils.readUnsignedVarInt(is)); // payload = 5 assertEquals(5, BytesUtils.readIntLittleEndianOnOneByte(is)); // end of stream assertEquals(-1, is.read()); }
Example #8
Source File: DeltaBinaryPackingValuesReader.java From parquet-mr with Apache License 2.0 | 6 votes |
/** * eagerly loads all the data into memory */ @Override public void initFromPage(int valueCount, ByteBufferInputStream stream) throws IOException { this.in = stream; long startPos = in.position(); this.config = DeltaBinaryPackingConfig.readConfig(in); this.totalValueCount = BytesUtils.readUnsignedVarInt(in); allocateValuesBuffer(); bitWidths = new int[config.miniBlockNumInABlock]; //read first value from header valuesBuffer[valuesBuffered++] = BytesUtils.readZigZagVarLong(in); while (valuesBuffered < totalValueCount) { //values Buffered could be more than totalValueCount, since we flush on a mini block basis loadNewBlockToBuffer(); } updateNextOffset((int) (in.position() - startPos)); }
Example #9
Source File: AbstractColumnReader.java From flink with Apache License 2.0 | 6 votes |
private void readPageV1(DataPageV1 page) throws IOException { this.pageValueCount = page.getValueCount(); ValuesReader rlReader = page.getRlEncoding().getValuesReader(descriptor, REPETITION_LEVEL); // Initialize the decoders. if (page.getDlEncoding() != Encoding.RLE && descriptor.getMaxDefinitionLevel() != 0) { throw new UnsupportedOperationException("Unsupported encoding: " + page.getDlEncoding()); } int bitWidth = BytesUtils.getWidthFromMaxInt(descriptor.getMaxDefinitionLevel()); this.runLenDecoder = new RunLengthDecoder(bitWidth); try { BytesInput bytes = page.getBytes(); ByteBufferInputStream in = bytes.toInputStream(); rlReader.initFromPage(pageValueCount, in); this.runLenDecoder.initFromStream(pageValueCount, in); prepareNewPage(page.getValueEncoding(), in); } catch (IOException e) { throw new IOException("could not read page " + page + " in col " + descriptor, e); } }
Example #10
Source File: TestParquetMetadataConverter.java From parquet-mr with Apache License 2.0 | 6 votes |
private void testLongStats(StatsHelper helper) { // make fake stats and verify the size check LongStatistics stats = new LongStatistics(); stats.incrementNumNulls(3004); long min = Long.MIN_VALUE; long max = Long.MAX_VALUE; stats.updateStats(min); stats.updateStats(max); org.apache.parquet.format.Statistics formatStats = helper.toParquetStatistics(stats); Assert.assertEquals("Min should match", min, BytesUtils.bytesToLong(formatStats.getMin())); Assert.assertEquals("Max should match", max, BytesUtils.bytesToLong(formatStats.getMax())); Assert.assertEquals("Num nulls should match", 3004, formatStats.getNull_count()); }
Example #11
Source File: TestParquetMetadataConverter.java From parquet-mr with Apache License 2.0 | 6 votes |
private void testFloatStats(StatsHelper helper) { // make fake stats and verify the size check FloatStatistics stats = new FloatStatistics(); stats.incrementNumNulls(3004); float min = Float.MIN_VALUE; float max = Float.MAX_VALUE; stats.updateStats(min); stats.updateStats(max); org.apache.parquet.format.Statistics formatStats = helper.toParquetStatistics(stats); Assert.assertEquals("Min should match", min, Float.intBitsToFloat(BytesUtils.bytesToInt(formatStats.getMin())), 0.000001); Assert.assertEquals("Max should match", max, Float.intBitsToFloat(BytesUtils.bytesToInt(formatStats.getMax())), 0.000001); Assert.assertEquals("Num nulls should match", 3004, formatStats.getNull_count()); }
Example #12
Source File: TestParquetMetadataConverter.java From parquet-mr with Apache License 2.0 | 6 votes |
private void testDoubleStats(StatsHelper helper) { // make fake stats and verify the size check DoubleStatistics stats = new DoubleStatistics(); stats.incrementNumNulls(3004); double min = Double.MIN_VALUE; double max = Double.MAX_VALUE; stats.updateStats(min); stats.updateStats(max); org.apache.parquet.format.Statistics formatStats = helper.toParquetStatistics(stats); Assert.assertEquals("Min should match", min, Double.longBitsToDouble(BytesUtils.bytesToLong(formatStats.getMin())), 0.000001); Assert.assertEquals("Max should match", max, Double.longBitsToDouble(BytesUtils.bytesToLong(formatStats.getMax())), 0.000001); Assert.assertEquals("Num nulls should match", 3004, formatStats.getNull_count()); }
Example #13
Source File: TestParquetMetadataConverter.java From parquet-mr with Apache License 2.0 | 6 votes |
private void testBooleanStats(StatsHelper helper) { // make fake stats and verify the size check BooleanStatistics stats = new BooleanStatistics(); stats.incrementNumNulls(3004); boolean min = Boolean.FALSE; boolean max = Boolean.TRUE; stats.updateStats(min); stats.updateStats(max); org.apache.parquet.format.Statistics formatStats = helper.toParquetStatistics(stats); Assert.assertEquals("Min should match", min, BytesUtils.bytesToBool(formatStats.getMin())); Assert.assertEquals("Max should match", max, BytesUtils.bytesToBool(formatStats.getMax())); Assert.assertEquals("Num nulls should match", 3004, formatStats.getNull_count()); }
Example #14
Source File: DeltaBinaryPackingValuesReader.java From parquet-mr with Apache License 2.0 | 6 votes |
private void loadNewBlockToBuffer() throws IOException { try { minDeltaInCurrentBlock = BytesUtils.readZigZagVarLong(in); } catch (IOException e) { throw new ParquetDecodingException("can not read min delta in current block", e); } readBitWidthsForMiniBlocks(); // mini block is atomic for reading, we read a mini block when there are more values left int i; for (i = 0; i < config.miniBlockNumInABlock && valuesBuffered < totalValueCount; i++) { BytePackerForLong packer = Packer.LITTLE_ENDIAN.newBytePackerForLong(bitWidths[i]); unpackMiniBlock(packer); } //calculate values from deltas unpacked for current block int valueUnpacked=i*config.miniBlockSizeInValues; for (int j = valuesBuffered-valueUnpacked; j < valuesBuffered; j++) { int index = j; valuesBuffer[index] += minDeltaInCurrentBlock + valuesBuffer[index - 1]; } }
Example #15
Source File: TestParquetMetadataConverter.java From parquet-mr with Apache License 2.0 | 5 votes |
/**
 * Checks how format statistics are converted when nothing is set, when only
 * min/max are set, and when only the null count is set.
 */
@Test
public void testMissingValuesFromStats() {
  ParquetMetadataConverter converter = new ParquetMetadataConverter();
  PrimitiveType int32Type = Types.required(PrimitiveTypeName.INT32).named("test_int32");
  org.apache.parquet.format.Statistics thriftStats = new org.apache.parquet.format.Statistics();

  // case 1: nothing set
  Statistics<?> converted = converter.fromParquetStatistics(Version.FULL_VERSION, thriftStats, int32Type);
  assertFalse(converted.isNumNullsSet());
  assertFalse(converted.hasNonNullValue());
  assertTrue(converted.isEmpty());
  assertEquals(-1, converted.getNumNulls());

  // case 2: only min and max set
  thriftStats.clear();
  thriftStats.setMin(BytesUtils.intToBytes(-100));
  thriftStats.setMax(BytesUtils.intToBytes(100));
  converted = converter.fromParquetStatistics(Version.FULL_VERSION, thriftStats, int32Type);
  assertFalse(converted.isNumNullsSet());
  assertTrue(converted.hasNonNullValue());
  assertFalse(converted.isEmpty());
  assertEquals(-1, converted.getNumNulls());
  assertEquals(-100, converted.genericGetMin());
  assertEquals(100, converted.genericGetMax());

  // case 3: only the null count set
  thriftStats.clear();
  thriftStats.setNull_count(2000);
  converted = converter.fromParquetStatistics(Version.FULL_VERSION, thriftStats, int32Type);
  assertTrue(converted.isNumNullsSet());
  assertFalse(converted.hasNonNullValue());
  assertFalse(converted.isEmpty());
  assertEquals(2000, converted.getNumNulls());
}
Example #16
Source File: ParquetFileWriter.java From parquet-mr with Apache License 2.0 | 5 votes |
/**
 * Writes the file footer: the converted metadata, then the footer length as a
 * little-endian int, then the magic bytes.
 *
 * @param footer the metadata to serialize
 * @param out the stream to write to
 * @throws IOException if writing to the stream fails
 */
private static void serializeFooter(ParquetMetadata footer, PositionOutputStream out) throws IOException {
  final long footerIndex = out.getPos();

  org.apache.parquet.format.FileMetaData parquetMetadata =
      new ParquetMetadataConverter().toParquetMetadata(CURRENT_VERSION, footer);
  writeFileMetaData(parquetMetadata, out);

  // nothing is written between these reads, so the position is read once and reused
  final long footerEnd = out.getPos();
  final long footerLength = footerEnd - footerIndex;
  LOG.debug("{}: footer length = {}" , footerEnd, footerLength);

  BytesUtils.writeIntLittleEndian(out, (int) footerLength);
  out.write(MAGIC);
}
Example #17
Source File: TestRunLengthBitPackingHybridEncoder.java From parquet-mr with Apache License 2.0 | 5 votes |
@Test public void testBitPackingOverflow() throws Exception { RunLengthBitPackingHybridEncoder encoder = getRunLengthBitPackingHybridEncoder(); for (int i = 0; i < 1000; i++) { encoder.writeInt(i % 3); } ByteArrayInputStream is = new ByteArrayInputStream(encoder.toBytes().toByteArray()); // 504 is the max number of values in a bit packed run // that still has a header of 1 byte // header = ((504/8) << 1) | 1 = 127 assertEquals(127, BytesUtils.readUnsignedVarInt(is)); List<Integer> values = unpack(3, 504, is); for (int i = 0; i < 504; i++) { assertEquals(i % 3, (int) values.get(i)); } // there should now be 496 values in another bit-packed run // header = ((496/8) << 1) | 1 = 125 assertEquals(125, BytesUtils.readUnsignedVarInt(is)); values = unpack(3, 496, is); for (int i = 0; i < 496; i++) { assertEquals((i + 504) % 3, (int) values.get(i)); } // end of stream assertEquals(-1, is.read()); }
Example #18
Source File: TestColumnIndexFilter.java From parquet-mr with Apache License 2.0 | 5 votes |
/**
 * Adds a non-null page with the given null count and double min/max values.
 *
 * @param nullCount the null count recorded for the page
 * @param min the page minimum, stored as the bytes of its long bit pattern
 * @param max the page maximum, stored as the bytes of its long bit pattern
 * @return this builder
 */
CIBuilder addPage(long nullCount, double min, double max) {
  nullPages.add(false);
  nullCounts.add(nullCount);

  byte[] minBytes = BytesUtils.longToBytes(Double.doubleToLongBits(min));
  minValues.add(ByteBuffer.wrap(minBytes));

  byte[] maxBytes = BytesUtils.longToBytes(Double.doubleToLongBits(max));
  maxValues.add(ByteBuffer.wrap(maxBytes));

  return this;
}
Example #19
Source File: TestColumnIndexBuilder.java From parquet-mr with Apache License 2.0 | 5 votes |
/**
 * Converts the given integers to byte buffers, one per value.
 *
 * @param values the values to convert; a {@code null} element becomes an empty buffer
 * @return the buffers in the same order as {@code values}
 */
private static List<ByteBuffer> toBBList(Integer... values) {
  List<ByteBuffer> result = new ArrayList<>(values.length);
  for (int idx = 0; idx < values.length; idx++) {
    Integer current = values[idx];
    // nulls are represented by a zero-length buffer
    result.add(current == null
        ? ByteBuffer.allocate(0)
        : ByteBuffer.wrap(BytesUtils.intToBytes(current)));
  }
  return result;
}
Example #20
Source File: TestColumnIndexBuilder.java From parquet-mr with Apache License 2.0 | 5 votes |
/**
 * Converts the given doubles to byte buffers, one per value, using the bytes of
 * each value's long bit pattern.
 *
 * @param values the values to convert; a {@code null} element becomes an empty buffer
 * @return the buffers in the same order as {@code values}
 */
private static List<ByteBuffer> toBBList(Double... values) {
  List<ByteBuffer> result = new ArrayList<>(values.length);
  for (int idx = 0; idx < values.length; idx++) {
    Double current = values[idx];
    // nulls are represented by a zero-length buffer
    result.add(current == null
        ? ByteBuffer.allocate(0)
        : ByteBuffer.wrap(BytesUtils.longToBytes(Double.doubleToLongBits(current))));
  }
  return result;
}
Example #21
Source File: DictionaryReader.java From presto with Apache License 2.0 | 5 votes |
/**
 * Reads the bit width from the first byte of the page and creates the decoder
 * for the rest of the stream.
 *
 * @param valueCount not read by this method
 * @param in the page data; its first byte is read as the bit width
 * @throws IOException if the bit width cannot be read
 */
@Override
public void initFromPage(int valueCount, ByteBufferInputStream in) throws IOException {
  // the bit width is read first, then the same stream is handed to the decoder
  decoder = new RunLengthBitPackingHybridDecoder(BytesUtils.readIntLittleEndianOnOneByte(in), in);
}
Example #22
Source File: TestColumnIndexBuilder.java From parquet-mr with Apache License 2.0 | 5 votes |
/**
 * Converts the given booleans to byte buffers, one per value.
 *
 * @param values the values to convert; a {@code null} element becomes an empty buffer
 * @return the buffers in the same order as {@code values}
 */
private static List<ByteBuffer> toBBList(Boolean... values) {
  List<ByteBuffer> result = new ArrayList<>(values.length);
  for (int idx = 0; idx < values.length; idx++) {
    Boolean current = values[idx];
    // nulls are represented by a zero-length buffer
    result.add(current == null
        ? ByteBuffer.allocate(0)
        : ByteBuffer.wrap(BytesUtils.booleanToBytes(current)));
  }
  return result;
}
Example #23
Source File: TestParquetMetadataConverter.java From parquet-mr with Apache License 2.0 | 5 votes |
/**
 * Builds a three-page INT64 column index, converts it to the format
 * representation and back, and checks the result; also checks that a null index
 * and unsupported types convert to {@code null}.
 */
@Test
public void testColumnIndexConversion() {
  PrimitiveType int64Type = Types.required(PrimitiveTypeName.INT64).named("test_int64");
  ColumnIndexBuilder indexBuilder = ColumnIndexBuilder.getBuilder(int64Type, Integer.MAX_VALUE);

  // page 0: 16 nulls, values -100 and 100
  Statistics<?> pageStats = Statistics.createStats(int64Type);
  pageStats.incrementNumNulls(16);
  pageStats.updateStats(-100L);
  pageStats.updateStats(100L);
  indexBuilder.add(pageStats);

  // page 1: 111 nulls and no values
  pageStats = Statistics.createStats(int64Type);
  pageStats.incrementNumNulls(111);
  indexBuilder.add(pageStats);

  // page 2: values 200 and 500, null count never incremented
  pageStats = Statistics.createStats(int64Type);
  pageStats.updateStats(200L);
  pageStats.updateStats(500L);
  indexBuilder.add(pageStats);

  // convert to the format representation and back again
  org.apache.parquet.format.ColumnIndex parquetColumnIndex =
      ParquetMetadataConverter.toParquetColumnIndex(int64Type, indexBuilder.build());
  ColumnIndex columnIndex = ParquetMetadataConverter.fromParquetColumnIndex(int64Type, parquetColumnIndex);

  assertEquals(BoundaryOrder.ASCENDING, columnIndex.getBoundaryOrder());
  assertTrue(Arrays.asList(false, true, false).equals(columnIndex.getNullPages()));
  assertTrue(Arrays.asList(16L, 111L, 0L).equals(columnIndex.getNullCounts()));
  // the all-null page is expected to carry empty min/max buffers
  assertTrue(Arrays.asList(
      ByteBuffer.wrap(BytesUtils.longToBytes(-100L)),
      ByteBuffer.allocate(0),
      ByteBuffer.wrap(BytesUtils.longToBytes(200L))).equals(columnIndex.getMinValues()));
  assertTrue(Arrays.asList(
      ByteBuffer.wrap(BytesUtils.longToBytes(100L)),
      ByteBuffer.allocate(0),
      ByteBuffer.wrap(BytesUtils.longToBytes(500L))).equals(columnIndex.getMaxValues()));

  assertNull("Should handle null column index",
      ParquetMetadataConverter.toParquetColumnIndex(
          Types.required(PrimitiveTypeName.INT32).named("test_int32"), null));
  assertNull("Should ignore unsupported types",
      ParquetMetadataConverter.toParquetColumnIndex(
          Types.required(PrimitiveTypeName.INT96).named("test_int96"), columnIndex));
  assertNull("Should ignore unsupported types",
      ParquetMetadataConverter.fromParquetColumnIndex(
          Types.required(PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY)
              .length(12).as(OriginalType.INTERVAL).named("test_interval"),
          parquetColumnIndex));
}
Example #24
Source File: TestColumnIndexFilter.java From parquet-mr with Apache License 2.0 | 5 votes |
/**
 * Adds a non-null page with the given null count and int min/max values.
 *
 * @param nullCount the null count recorded for the page
 * @param min the page minimum
 * @param max the page maximum
 * @return this builder
 */
CIBuilder addPage(long nullCount, int min, int max) {
  nullPages.add(false);
  nullCounts.add(nullCount);

  byte[] minBytes = BytesUtils.intToBytes(min);
  minValues.add(ByteBuffer.wrap(minBytes));

  byte[] maxBytes = BytesUtils.intToBytes(max);
  maxValues.add(ByteBuffer.wrap(maxBytes));

  return this;
}
Example #25
Source File: TestRunLengthBitPackingHybridEncoder.java From parquet-mr with Apache License 2.0 | 5 votes |
@Test public void testTransitionFromBitPackingToRle() throws Exception { RunLengthBitPackingHybridEncoder encoder = getRunLengthBitPackingHybridEncoder(); // 5 obviously bit-packed values encoder.writeInt(0); encoder.writeInt(1); encoder.writeInt(0); encoder.writeInt(1); encoder.writeInt(0); // three repeated values, that ought to be bit-packed as well encoder.writeInt(2); encoder.writeInt(2); encoder.writeInt(2); // lots more repeated values, that should be rle-encoded for (int i = 0; i < 100; i++) { encoder.writeInt(2); } ByteArrayInputStream is = new ByteArrayInputStream(encoder.toBytes().toByteArray()); // header = ((8/8) << 1) | 1 = 3 assertEquals(3, BytesUtils.readUnsignedVarInt(is)); List<Integer> values = unpack(3, 8, is); assertEquals(Arrays.asList(0, 1, 0, 1, 0, 2, 2, 2), values); // header = 100 << 1 = 200 assertEquals(200, BytesUtils.readUnsignedVarInt(is)); // payload = 2 assertEquals(2, BytesUtils.readIntLittleEndianOnOneByte(is)); // end of stream assertEquals(-1, is.read()); }
Example #26
Source File: TestColumnIndexBuilder.java From parquet-mr with Apache License 2.0 | 5 votes |
/**
 * Converts the given floats to byte buffers, one per value, using the bytes of
 * each value's int bit pattern.
 *
 * @param values the values to convert; a {@code null} element becomes an empty buffer
 * @return the buffers in the same order as {@code values}
 */
private static List<ByteBuffer> toBBList(Float... values) {
  List<ByteBuffer> result = new ArrayList<>(values.length);
  for (int idx = 0; idx < values.length; idx++) {
    Float current = values[idx];
    // nulls are represented by a zero-length buffer
    result.add(current == null
        ? ByteBuffer.allocate(0)
        : ByteBuffer.wrap(BytesUtils.intToBytes(Float.floatToIntBits(current))));
  }
  return result;
}
Example #27
Source File: TestRunLengthBitPackingHybridEncoder.java From parquet-mr with Apache License 2.0 | 5 votes |
@Test public void testBitWidthZero() throws Exception { RunLengthBitPackingHybridEncoder encoder = getRunLengthBitPackingHybridEncoder(0, 5, 10); for (int i = 0; i < 10; i++) { encoder.writeInt(0); } ByteArrayInputStream is = new ByteArrayInputStream(encoder.toBytes().toByteArray()); // header = 10 << 1 = 20 assertEquals(20, BytesUtils.readUnsignedVarInt(is)); // end of stream assertEquals(-1, is.read()); }
Example #28
Source File: ColumnReaderBase.java From parquet-mr with Apache License 2.0 | 5 votes |
/**
 * Creates an iterator over the levels stored in the given bytes.
 *
 * @param maxLevel the maximum level; 0 yields a {@code NullIntIterator}
 * @param bytes the encoded levels
 * @return an iterator over the levels
 * @throws ParquetDecodingException if the levels cannot be read
 */
private IntIterator newRLEIterator(int maxLevel, BytesInput bytes) {
  try {
    if (maxLevel != 0) {
      // the bit width is derived from the max level
      int bitWidth = BytesUtils.getWidthFromMaxInt(maxLevel);
      RunLengthBitPackingHybridDecoder levelDecoder =
          new RunLengthBitPackingHybridDecoder(bitWidth, bytes.toInputStream());
      return new RLEIntIterator(levelDecoder);
    }
    return new NullIntIterator();
  } catch (IOException e) {
    throw new ParquetDecodingException("could not read levels in page for col " + path, e);
  }
}
Example #29
Source File: BinaryPlainValuesReader.java From parquet-mr with Apache License 2.0 | 5 votes |
/**
 * Skips the next value: reads its little-endian int length and skips that many bytes.
 *
 * @throws ParquetDecodingException if the length cannot be read or the bytes cannot be skipped
 */
@Override
public void skip() {
  try {
    // the length prefix is read first, then that many bytes are skipped
    in.skipFully(BytesUtils.readIntLittleEndian(in));
  } catch (IOException | RuntimeException e) {
    throw new ParquetDecodingException("could not skip bytes at offset " + in.position(), e);
  }
}
Example #30
Source File: BinaryPlainValuesReader.java From parquet-mr with Apache License 2.0 | 5 votes |
/**
 * Reads the next value: a little-endian int length followed by a slice of that
 * many bytes.
 *
 * @return a {@code Binary} built from the slice of the input
 * @throws ParquetDecodingException if the length or the bytes cannot be read
 */
@Override
public Binary readBytes() {
  try {
    // the length prefix is read first, then a slice of that length is taken from the stream
    return Binary.fromConstantByteBuffer(in.slice(BytesUtils.readIntLittleEndian(in)));
  } catch (IOException | RuntimeException e) {
    throw new ParquetDecodingException("could not read bytes at offset " + in.position(), e);
  }
}