org.apache.parquet.bytes.BytesInput Java Examples

The following examples show how to use org.apache.parquet.bytes.BytesInput. They are drawn from open-source projects; the source file, project, and license are noted above each example.
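Before the examples, here is a minimal sketch of the core BytesInput API. The class name BytesInputBasics and the sample values are illustrative assumptions, not taken from any project quoted below.

import java.io.ByteArrayOutputStream;

import org.apache.parquet.bytes.BytesInput;

public class BytesInputBasics {
  public static void main(String[] args) throws Exception {
    // Wrap an existing byte array without copying it.
    BytesInput a = BytesInput.from(new byte[] {1, 2, 3});

    // Capture whatever has been written to an output stream so far.
    ByteArrayOutputStream baos = new ByteArrayOutputStream();
    baos.write(new byte[] {4, 5});
    BytesInput b = BytesInput.from(baos);

    // Concatenation is lazy; bytes are only materialized when consumed.
    BytesInput both = BytesInput.concat(a, b);
    System.out.println(both.size());               // 5
    System.out.println(both.toByteArray().length); // 5, as a fresh copy
  }
}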
Example #1
Source File: AbstractColumnReader.java    From flink with Apache License 2.0
private void readPageV1(DataPageV1 page) throws IOException {
	this.pageValueCount = page.getValueCount();
	ValuesReader rlReader = page.getRlEncoding().getValuesReader(descriptor, REPETITION_LEVEL);

	// Initialize the decoders.
	if (page.getDlEncoding() != Encoding.RLE && descriptor.getMaxDefinitionLevel() != 0) {
		throw new UnsupportedOperationException("Unsupported encoding: " + page.getDlEncoding());
	}
	int bitWidth = BytesUtils.getWidthFromMaxInt(descriptor.getMaxDefinitionLevel());
	this.runLenDecoder = new RunLengthDecoder(bitWidth);
	try {
		BytesInput bytes = page.getBytes();
		ByteBufferInputStream in = bytes.toInputStream();
		rlReader.initFromPage(pageValueCount, in);
		this.runLenDecoder.initFromStream(pageValueCount, in);
		prepareNewPage(page.getValueEncoding(), in);
	} catch (IOException e) {
		throw new IOException("could not read page " + page + " in col " + descriptor, e);
	}
}
 
Example #2
Source File: RunLengthBitPackingHybridEncoder.java    From parquet-mr with Apache License 2.0
public BytesInput toBytes() throws IOException {
  Preconditions.checkArgument(!toBytesCalled,
      "You cannot call toBytes() more than once without calling reset()");

  // write anything that is buffered / queued up for an rle-run
  if (repeatCount >= 8) {
    writeRleRun();
  } else if (numBufferedValues > 0) {
    // pad the remainder of the group with zeros so it can be bit-packed
    for (int i = numBufferedValues; i < 8; i++) {
      bufferedValues[i] = 0;
    }
    writeOrAppendBitPackedRun();
    endPreviousBitPackedRun();
  } else {
    endPreviousBitPackedRun();
  }

  toBytesCalled = true;
  return BytesInput.from(baos);
}
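As a hedged sketch of how the toBytes() output is typically consumed, the fragment below round-trips a few values through this encoder and the matching RunLengthBitPackingHybridDecoder. The bit width, slab sizes, and use of HeapByteBufferAllocator are assumptions for illustration, not taken from the source above.

import org.apache.parquet.bytes.BytesInput;
import org.apache.parquet.bytes.HeapByteBufferAllocator;
import org.apache.parquet.column.values.rle.RunLengthBitPackingHybridDecoder;
import org.apache.parquet.column.values.rle.RunLengthBitPackingHybridEncoder;

public class RleRoundTrip {
  public static void main(String[] args) throws Exception {
    // Encode three small values with a bit width of 3.
    RunLengthBitPackingHybridEncoder encoder =
        new RunLengthBitPackingHybridEncoder(3, 64, 1024, new HeapByteBufferAllocator());
    encoder.writeInt(5);
    encoder.writeInt(5);
    encoder.writeInt(1);
    BytesInput rle = encoder.toBytes(); // no further writes until reset()

    // Decode them back with the same bit width.
    RunLengthBitPackingHybridDecoder decoder =
        new RunLengthBitPackingHybridDecoder(3, rle.toInputStream());
    System.out.println(decoder.readInt()); // 5
    System.out.println(decoder.readInt()); // 5
    System.out.println(decoder.readInt()); // 1
  }
}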
 
Example #3
Source File: PageIterator.java    From iceberg with Apache License 2.0
private void initFromPage(DataPageV1 page) {
  this.triplesCount = page.getValueCount();
  ValuesReader rlReader = page.getRlEncoding().getValuesReader(desc, REPETITION_LEVEL);
  ValuesReader dlReader = page.getDlEncoding().getValuesReader(desc, DEFINITION_LEVEL);
  this.repetitionLevels = new ValuesReaderIntIterator(rlReader);
  this.definitionLevels = new ValuesReaderIntIterator(dlReader);
  try {
    BytesInput bytes = page.getBytes();
    LOG.debug("page size {} bytes and {} records", bytes.size(), triplesCount);
    LOG.debug("reading repetition levels at 0");
    ByteBufferInputStream in = bytes.toInputStream();
    rlReader.initFromPage(triplesCount, in);
    LOG.debug("reading definition levels at {}", in.position());
    dlReader.initFromPage(triplesCount, in);
    LOG.debug("reading data at {}", in.position());
    initDataReader(page.getValueEncoding(), in, page.getValueCount());
  } catch (IOException e) {
    throw new ParquetDecodingException("could not read page " + page + " in col " + desc, e);
  }
}
 
Example #4
Source File: ParquetRecordWriter.java    From dremio-oss with Apache License 2.0
@SuppressWarnings("deprecation")
private static BytesCompressor toDeprecatedBytesCompressor(final BytesInputCompressor compressor) {
  return new BytesCompressor() {
    @Override
    public BytesInput compress(BytesInput bytes) throws IOException {
      return compressor.compress(bytes);
    }

    @Override
    public CompressionCodecName getCodecName() {
      return compressor.getCodecName();
    }

    @Override
    public void release() {
      compressor.release();
    }
  };
}
 
Example #5
Source File: BasePageIterator.java    From iceberg with Apache License 2.0
protected void initFromPage(DataPageV1 initPage) {
  this.triplesCount = initPage.getValueCount();
  ValuesReader rlReader = initPage.getRlEncoding().getValuesReader(desc, ValuesType.REPETITION_LEVEL);
  this.repetitionLevels = new ValuesReaderIntIterator(rlReader);
  try {
    BytesInput bytes = initPage.getBytes();
    LOG.debug("page size {} bytes and {} records", bytes.size(), triplesCount);
    LOG.debug("reading repetition levels at 0");
    ByteBufferInputStream in = bytes.toInputStream();
    rlReader.initFromPage(triplesCount, in);
    LOG.debug("reading definition levels at {}", in.position());
    initDefinitionLevelsReader(initPage, desc, in, triplesCount);
    LOG.debug("reading data at {}", in.position());
    initDataReader(initPage.getValueEncoding(), in, initPage.getValueCount());
  } catch (IOException e) {
    throw new ParquetDecodingException("could not read page " + initPage + " in col " + desc, e);
  }
}
 
Example #6
Source File: AvroParquetConvertCreator.java    From datacollector with Apache License 2.0
@Override
protected void addNecessaryJarsToJob(Configuration conf) {
  MapreduceUtils.addJarsToJob(conf,
      SemanticVersion.class,
      ParquetWriter.class,
      AvroParquetWriter.class,
      AvroParquetWriterBuilder190Int96.class,
      AvroSchemaConverter190Int96Avro18.class,
      FsInput.class,
      CompressionCodec.class,
      ParquetProperties.class,
      BytesInput.class,
      AvroToParquetConverterUtil.class,
      AvroLogicalTypeSupport.class
  );
}
 
Example #7
Source File: ByteBasedBitPackingEncoder.java    From parquet-mr with Apache License 2.0
/**
 * Writes an int using the requested number of bits.
 * Accepts only values less than 2^bitWidth.
 * @param value the value to write
 * @throws IOException if there is an exception while writing
 */
public void writeInt(int value) throws IOException {
  input[inputSize] = value;
  ++inputSize;
  if (inputSize == VALUES_WRITTEN_AT_A_TIME) {
    pack();
    if (packedPosition == slabSize) {
      slabs.add(BytesInput.from(packed));
      totalFullSlabSize += slabSize;
      if (slabSize < bitWidth * MAX_SLAB_SIZE_MULT) {
        slabSize *= 2;
      }
      initPackedSlab();
    }
  }
}
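For context, a small hedged sketch of driving this encoder end to end; the bit width and values are illustrative assumptions.

import org.apache.parquet.bytes.BytesInput;
import org.apache.parquet.column.values.bitpacking.ByteBasedBitPackingEncoder;
import org.apache.parquet.column.values.bitpacking.Packer;

public class BitPackingSketch {
  public static void main(String[] args) throws Exception {
    // All values must be less than 2^3 for a bit width of 3.
    ByteBasedBitPackingEncoder encoder =
        new ByteBasedBitPackingEncoder(3, Packer.LITTLE_ENDIAN);
    for (int v : new int[] {0, 1, 2, 7, 7, 3}) {
      encoder.writeInt(v);
    }
    BytesInput packed = encoder.toBytes();
    System.out.println(packed.size()); // padded up to whole bytes
  }
}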
 
Example #8
Source File: TestDictionary.java    From parquet-mr with Apache License 2.0
@Test
public void testBinaryDictionary() throws IOException {
  int COUNT = 100;
  ValuesWriter cw = newPlainBinaryDictionaryValuesWriter(200, 10000);
  writeRepeated(COUNT, cw, "a");
  BytesInput bytes1 = getBytesAndCheckEncoding(cw, PLAIN_DICTIONARY);
  writeRepeated(COUNT, cw, "b");
  BytesInput bytes2 = getBytesAndCheckEncoding(cw, PLAIN_DICTIONARY);
  // now we will fall back
  writeDistinct(COUNT, cw, "c");
  BytesInput bytes3 = getBytesAndCheckEncoding(cw, PLAIN);

  DictionaryValuesReader cr = initDicReader(cw, BINARY);
  checkRepeated(COUNT, bytes1, cr, "a");
  checkRepeated(COUNT, bytes2, cr, "b");
  BinaryPlainValuesReader cr2 = new BinaryPlainValuesReader();
  checkDistinct(COUNT, bytes3, cr2, "c");
}
 
Example #9
Source File: DataPageV2.java    From parquet-mr with Apache License 2.0
public DataPageV2(
    int rowCount, int nullCount, int valueCount,
    BytesInput repetitionLevels, BytesInput definitionLevels,
    Encoding dataEncoding, BytesInput data,
    int uncompressedSize,
    Statistics<?> statistics,
    boolean isCompressed) {
  super(Math.toIntExact(repetitionLevels.size() + definitionLevels.size() + data.size()), uncompressedSize, valueCount);
  this.rowCount = rowCount;
  this.nullCount = nullCount;
  this.repetitionLevels = repetitionLevels;
  this.definitionLevels = definitionLevels;
  this.dataEncoding = dataEncoding;
  this.data = data;
  this.statistics = statistics;
  this.isCompressed = isCompressed;
}
 
Example #10
Source File: DataPageV2.java    From parquet-mr with Apache License 2.0
private DataPageV2(
    int rowCount, int nullCount, int valueCount, long firstRowIndex,
    BytesInput repetitionLevels, BytesInput definitionLevels,
    Encoding dataEncoding, BytesInput data,
    int uncompressedSize,
    Statistics<?> statistics,
    boolean isCompressed) {
  super(Math.toIntExact(repetitionLevels.size() + definitionLevels.size() + data.size()), uncompressedSize,
      valueCount, firstRowIndex);
  this.rowCount = rowCount;
  this.nullCount = nullCount;
  this.repetitionLevels = repetitionLevels;
  this.definitionLevels = definitionLevels;
  this.dataEncoding = dataEncoding;
  this.data = data;
  this.statistics = statistics;
  this.isCompressed = isCompressed;
}
 
Example #11
Source File: ParquetFileWriter.java    From parquet-mr with Apache License 2.0
/**
 * Writes a single page
 * @param valueCount count of values
 * @param uncompressedPageSize the size of the data once uncompressed
 * @param bytes the compressed data for the page without header
 * @param statistics the statistics of the page
 * @param rowCount the number of rows in the page
 * @param rlEncoding encoding of the repetition level
 * @param dlEncoding encoding of the definition level
 * @param valuesEncoding encoding of values
 * @throws IOException if any I/O error occurs during writing the file
 */
public void writeDataPage(
    int valueCount, int uncompressedPageSize,
    BytesInput bytes,
    Statistics statistics,
    long rowCount,
    Encoding rlEncoding,
    Encoding dlEncoding,
    Encoding valuesEncoding) throws IOException {
  long beforeHeader = out.getPos();
  innerWriteDataPage(valueCount, uncompressedPageSize, bytes, statistics, rlEncoding, dlEncoding, valuesEncoding);

  offsetIndexBuilder.add((int) (out.getPos() - beforeHeader), rowCount);
}
 
Example #12
Source File: MemPageWriter.java    From parquet-mr with Apache License 2.0
@Override
public void writePage(BytesInput bytesInput, int valueCount, Statistics statistics, Encoding rlEncoding, Encoding dlEncoding, Encoding valuesEncoding)
    throws IOException {
  if (valueCount == 0) {
    throw new ParquetEncodingException("illegal page of 0 values");
  }
  memSize += bytesInput.size();
  pages.add(new DataPageV1(BytesInput.copy(bytesInput), valueCount, (int)bytesInput.size(), statistics, rlEncoding, dlEncoding, valuesEncoding));
  totalValueCount += valueCount;
  LOG.debug("page written for {} bytes and {} records", bytesInput.size(), valueCount);
}
 
Example #13
Source File: ParquetCompressor.java    From presto with Apache License 2.0
@Override
public ParquetDataOutput compress(BytesInput bytesInput)
        throws IOException
{
    ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream();
    try (GZIPOutputStream outputStream = new GZIPOutputStream(byteArrayOutputStream)) {
        outputStream.write(bytesInput.toByteArray(), 0, toIntExact(bytesInput.size()));
    }
    byte[] bytes = byteArrayOutputStream.toByteArray();
    return createDataOutput(Slices.wrappedBuffer(bytes, 0, bytes.length));
}
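For symmetry, a hedged sketch of the reverse operation with plain java.util.zip; the gunzip helper and its enclosing class are hypothetical, not part of the Presto source above.

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.util.zip.GZIPInputStream;

import org.apache.parquet.bytes.BytesInput;

final class GzipUtil {
  // Hypothetical helper: inflate GZIP-compressed page bytes back into a BytesInput.
  static BytesInput gunzip(byte[] compressed) throws IOException {
    try (GZIPInputStream in = new GZIPInputStream(new ByteArrayInputStream(compressed))) {
      return BytesInput.from(in.readAllBytes()); // Java 9+; copies eagerly
    }
  }
}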
 
Example #14
Source File: ByteBasedBitPackingEncoder.java    From parquet-mr with Apache License 2.0
/**
 * @return the bytes representing the packed values
 * @throws IOException if there is an exception while creating the BytesInput
 */
public BytesInput toBytes() throws IOException {
  int packedByteLength = packedPosition + BytesUtils.paddedByteCountFromBits(inputSize * bitWidth);

  LOG.debug("writing {} bytes", (totalFullSlabSize + packedByteLength));
  if (inputSize > 0) {
    for (int i = inputSize; i < input.length; i++) {
      input[i] = 0;
    }
    pack();
  }
  return concat(concat(slabs), BytesInput.from(packed, 0, packedByteLength));
}
 
Example #15
Source File: DictionaryValuesWriter.java    From parquet-mr with Apache License 2.0
@Override
public BytesInput getBytes() {
  int maxDicId = getDictionarySize() - 1;
  LOG.debug("max dic id {}", maxDicId);
  int bitWidth = BytesUtils.getWidthFromMaxInt(maxDicId);
  int initialSlabSize =
      CapacityByteArrayOutputStream.initialSlabSizeHeuristic(MIN_INITIAL_SLAB_SIZE, maxDictionaryByteSize, 10);

  RunLengthBitPackingHybridEncoder encoder =
      new RunLengthBitPackingHybridEncoder(bitWidth, initialSlabSize, maxDictionaryByteSize, this.allocator);
  encoders.add(encoder);
  IntIterator iterator = encodedValues.iterator();
  try {
    while (iterator.hasNext()) {
      encoder.writeInt(iterator.next());
    }
    // encodes the bit width
    byte[] bytesHeader = new byte[] { (byte) bitWidth };
    BytesInput rleEncodedBytes = encoder.toBytes();
    LOG.debug("rle encoded bytes {}", rleEncodedBytes.size());
    BytesInput bytes = concat(BytesInput.from(bytesHeader), rleEncodedBytes);
    // remember size of dictionary when we last wrote a page
    lastUsedDictionarySize = getDictionarySize();
    lastUsedDictionaryByteSize = dictionaryByteSize;
    return bytes;
  } catch (IOException e) {
    throw new ParquetEncodingException("could not encode the values", e);
  }
}
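The page layout written here, a single header byte carrying the bit width followed by the RLE/bit-packed ids, is what a reader undoes. A hedged fragment continuing from the bytes value above, assumed to run inside a method that may throw IOException:

// Reader-side sketch: the first byte is the bit width, the rest is the payload.
ByteBufferInputStream in = bytes.toInputStream();
int pageBitWidth = in.read();
RunLengthBitPackingHybridDecoder ids =
    new RunLengthBitPackingHybridDecoder(pageBitWidth, in);
int firstDictionaryId = ids.readInt();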
 
Example #16
Source File: ParquetCompressor.java    From presto with Apache License 2.0 5 votes vote down vote up
@Override
public ParquetDataOutput compress(BytesInput bytesInput)
        throws IOException
{
    int minCompressionBufferSize = compressor.maxCompressedLength(toIntExact(bytesInput.size()));
    byte[] compressionBuffer = new byte[minCompressionBufferSize];
    byte[] bytes = bytesInput.toByteArray();
    // TODO compressedDataSize > bytes.length?
    int compressedDataSize = compressor.compress(bytes, 0, bytes.length, compressionBuffer, 0, compressionBuffer.length);
    return createDataOutput(Slices.wrappedBuffer(compressionBuffer, 0, compressedDataSize));
}
 
Example #17
Source File: DeltaLengthByteArrayValuesWriter.java    From parquet-mr with Apache License 2.0
@Override
public BytesInput getBytes() {
  try {
    out.flush();
  } catch (IOException e) {
    throw new ParquetEncodingException("could not write page", e);
  }
  LOG.debug("writing a buffer of size {}", arrayOut.size());
  return BytesInput.concat(lengthWriter.getBytes(), BytesInput.from(arrayOut));
}
 
Example #18
Source File: ColumnReaderBase.java    From parquet-mr with Apache License 2.0
private IntIterator newRLEIterator(int maxLevel, BytesInput bytes) {
  try {
    if (maxLevel == 0) {
      return new NullIntIterator();
    }
    return new RLEIntIterator(
        new RunLengthBitPackingHybridDecoder(
            BytesUtils.getWidthFromMaxInt(maxLevel),
            bytes.toInputStream()));
  } catch (IOException e) {
    throw new ParquetDecodingException("could not read levels in page for col " + path, e);
  }
}
 
Example #19
Source File: RunLengthBitPackingHybridValuesWriter.java    From parquet-mr with Apache License 2.0
@Override
public BytesInput getBytes() {
  try {
    // prepend the length of the column
    BytesInput rle = encoder.toBytes();
    return BytesInput.concat(BytesInput.fromInt(Math.toIntExact(rle.size())), rle);
  } catch (IOException e) {
    throw new ParquetEncodingException(e);
  }
}
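The four prepended bytes are a little-endian length, which a reader strips before decoding. A hedged sketch of that counterpart; openDecoder and its enclosing class are hypothetical names.

import java.io.IOException;

import org.apache.parquet.bytes.ByteBufferInputStream;
import org.apache.parquet.bytes.BytesInput;
import org.apache.parquet.bytes.BytesUtils;
import org.apache.parquet.column.values.rle.RunLengthBitPackingHybridDecoder;

final class RleValuesUtil {
  // Hypothetical counterpart: skip the 4-byte little-endian length prefix,
  // then decode the RLE/bit-packed payload with a known bit width.
  static RunLengthBitPackingHybridDecoder openDecoder(BytesInput page, int bitWidth) throws IOException {
    ByteBufferInputStream in = page.toInputStream();
    int rleLength = BytesUtils.readIntLittleEndian(in); // real readers use this to bound the section
    return new RunLengthBitPackingHybridDecoder(bitWidth, in);
  }
}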
 
Example #20
Source File: DeltaBinaryPackingValuesWriterForLong.java    From parquet-mr with Apache License 2.0
/**
 * getBytes will trigger flushing the block buffer; DO NOT write after getBytes() is called without calling reset().
 *
 * @return a BytesInput that contains the encoded page data
 */
@Override
public BytesInput getBytes() {
  // The Page Header should include: blockSizeInValues, numberOfMiniBlocks, totalValueCount
  if (deltaValuesToFlush != 0) {
    flushBlockBuffer();
  }
  return BytesInput.concat(
          config.toBytesInput(),
          BytesInput.fromUnsignedVarInt(totalValueCount),
          BytesInput.fromZigZagVarLong(firstValue),
          BytesInput.from(baos));
}
 
Example #21
Source File: TestColumnChunkPageWriteStore.java    From parquet-mr with Apache License 2.0
private int intValue(BytesInput in) throws IOException {
  ByteArrayOutputStream baos = new ByteArrayOutputStream();
  in.writeAllTo(baos);
  LittleEndianDataInputStream os = new LittleEndianDataInputStream(new ByteArrayInputStream(baos.toByteArray()));
  int i = os.readInt();
  os.close();
  return i;
}
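The same little-endian read can be expressed more compactly with parquet's own BytesUtils; an equivalent hedged one-liner:

private int intValue(BytesInput in) throws IOException {
  // Reads 4 little-endian bytes directly from the BytesInput's stream.
  return BytesUtils.readIntLittleEndian(in.toInputStream());
}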
 
Example #22
Source File: DeltaBinaryPackingValuesWriterForInteger.java    From parquet-mr with Apache License 2.0
/**
 * getBytes will trigger flushing the block buffer; DO NOT write after getBytes() is called without calling reset().
 *
 * @return a BytesInput that contains the encoded page data
 */
@Override
public BytesInput getBytes() {
  // The Page Header should include: blockSizeInValues, numberOfMiniBlocks, totalValueCount
  if (deltaValuesToFlush != 0) {
    flushBlockBuffer();
  }
  return BytesInput.concat(
          config.toBytesInput(),
          BytesInput.fromUnsignedVarInt(totalValueCount),
          BytesInput.fromZigZagVarInt(firstValue),
          BytesInput.from(baos));
}
 
Example #23
Source File: CompressionConverter.java    From parquet-mr with Apache License 2.0
private byte[] translatePageLoad(TransParquetFileReader reader, boolean isCompressed, CompressionCodecFactory.BytesInputCompressor compressor,
                                 CompressionCodecFactory.BytesInputDecompressor decompressor, int payloadLength, int rawDataLength) throws IOException {
  BytesInput data = readBlock(payloadLength, reader);
  if (isCompressed) {
    data = decompressor.decompress(data, rawDataLength);
  }
  BytesInput newCompressedData = compressor.compress(data);
  return newCompressedData.toByteArray();
}
 
Example #24
Source File: DataPageV2.java    From parquet-mr with Apache License 2.0
/**
 * @param rowCount count of rows
 * @param nullCount count of nulls
 * @param valueCount count of values
 * @param firstRowIndex the index of the first row in this page
 * @param repetitionLevels RLE encoded repetition levels
 * @param definitionLevels RLE encoded definition levels
 * @param dataEncoding encoding for the data
 * @param data data encoded with dataEncoding
 * @param statistics optional statistics for this page
 * @return an uncompressed page
 */
public static DataPageV2 uncompressed(
    int rowCount, int nullCount, int valueCount, long firstRowIndex,
    BytesInput repetitionLevels, BytesInput definitionLevels,
    Encoding dataEncoding, BytesInput data,
    Statistics<?> statistics) {
  return new DataPageV2(
      rowCount, nullCount, valueCount, firstRowIndex,
      repetitionLevels, definitionLevels,
      dataEncoding, data,
      Math.toIntExact(repetitionLevels.size() + definitionLevels.size() + data.size()),
      statistics,
      false);
}
 
Example #25
Source File: TestParquetFileWriter.java    From parquet-mr with Apache License 2.0
@Test
public void testBloomFilterWriteRead() throws Exception {
  MessageType schema = MessageTypeParser.parseMessageType("message test { required binary foo; }");
  File testFile = temp.newFile();
  testFile.delete();
  Path path = new Path(testFile.toURI());
  Configuration configuration = new Configuration();
  configuration.set("parquet.bloom.filter.column.names", "foo");
  String[] colPath = {"foo"};
  ColumnDescriptor col = schema.getColumnDescription(colPath);
  BinaryStatistics stats1 = new BinaryStatistics();
  ParquetFileWriter w = new ParquetFileWriter(configuration, schema, path);
  w.start();
  w.startBlock(3);
  w.startColumn(col, 5, CODEC);
  w.writeDataPage(2, 4, BytesInput.from(BYTES1), stats1, BIT_PACKED, BIT_PACKED, PLAIN);
  w.writeDataPage(3, 4, BytesInput.from(BYTES1), stats1, BIT_PACKED, BIT_PACKED, PLAIN);
  w.endColumn();
  BloomFilter blockSplitBloomFilter = new BlockSplitBloomFilter(0);
  blockSplitBloomFilter.insertHash(blockSplitBloomFilter.hash(Binary.fromString("hello")));
  blockSplitBloomFilter.insertHash(blockSplitBloomFilter.hash(Binary.fromString("world")));
  w.addBloomFilter("foo", blockSplitBloomFilter);
  w.endBlock();
  w.end(new HashMap<>());
  ParquetMetadata readFooter = ParquetFileReader.readFooter(configuration, path);
  ParquetFileReader r = new ParquetFileReader(configuration, readFooter.getFileMetaData(), path,
    Arrays.asList(readFooter.getBlocks().get(0)), Arrays.asList(schema.getColumnDescription(colPath)));
  BloomFilterReader bloomFilterReader = r.getBloomFilterDataReader(readFooter.getBlocks().get(0));
  BloomFilter bloomFilter = bloomFilterReader.readBloomFilter(readFooter.getBlocks().get(0).getColumns().get(0));
  assertTrue(bloomFilter.findHash(blockSplitBloomFilter.hash(Binary.fromString("hello"))));
  assertTrue(bloomFilter.findHash(blockSplitBloomFilter.hash(Binary.fromString("world"))));
}
 
Example #26
Source File: ByteBitPackingValuesWriter.java    From parquet-mr with Apache License 2.0
@Override
public BytesInput getBytes() {
  try {
    return encoder.toBytes();
  } catch (IOException e) {
    throw new ParquetEncodingException(e);
  }
}
 
Example #27
Source File: FallbackValuesWriter.java    From parquet-mr with Apache License 2.0
@Override
public BytesInput getBytes() {
  if (!fellBackAlready && firstPage) {
    // we use the first page to decide if we're going to use this encoding
    BytesInput bytes = initialWriter.getBytes();
    if (!initialWriter.isCompressionSatisfying(rawDataByteSize, bytes.size())) {
      fallBack();
    } else {
      return bytes;
    }
  }
  return currentWriter.getBytes();
}
 
Example #28
Source File: TestZstandardCodec.java    From parquet-mr with Apache License 2.0
private BytesInput compress(ZstandardCodec codec, BytesInput bytes) throws IOException {
  ByteArrayOutputStream compressedOutBuffer = new ByteArrayOutputStream((int)bytes.size());
  CompressionOutputStream cos = codec.createOutputStream(compressedOutBuffer, null);
  bytes.writeAllTo(cos);
  cos.close();
  return BytesInput.from(compressedOutBuffer);
}
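A hypothetical counterpart to the helper above, inflating the bytes back through the codec's input stream; the decompress name and the null Decompressor mirror the compress side but are assumptions, not part of the quoted test.

private BytesInput decompress(ZstandardCodec codec, BytesInput compressed, int uncompressedSize) throws IOException {
  InputStream is = codec.createInputStream(new ByteArrayInputStream(compressed.toByteArray()), null);
  byte[] decompressed = new byte[uncompressedSize];
  new DataInputStream(is).readFully(decompressed); // read exactly the expected length
  is.close();
  return BytesInput.from(decompressed);
}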
 
Example #29
Source File: FixedLenByteArrayPlainValuesWriter.java    From parquet-mr with Apache License 2.0
@Override
public BytesInput getBytes() {
  try {
    out.flush();
  } catch (IOException e) {
    throw new ParquetEncodingException("could not write page", e);
  }
  LOG.debug("writing a buffer of size {}", arrayOut.size());
  return BytesInput.from(arrayOut);
}
 
Example #30
Source File: MemPageWriter.java    From parquet-mr with Apache License 2.0
@Override
public void writePageV2(int rowCount, int nullCount, int valueCount,
    BytesInput repetitionLevels, BytesInput definitionLevels,
    Encoding dataEncoding, BytesInput data, Statistics<?> statistics) throws IOException {
  if (valueCount == 0) {
    throw new ParquetEncodingException("illegal page of 0 values");
  }
  long size = repetitionLevels.size() + definitionLevels.size() + data.size();
  memSize += size;
  pages.add(DataPageV2.uncompressed(rowCount, nullCount, valueCount, copy(repetitionLevels), copy(definitionLevels), dataEncoding, copy(data), statistics));
  totalValueCount += valueCount;
  LOG.debug("page written for {} bytes and {} records", size, valueCount);
}