Java Code Examples for org.apache.parquet.bytes.BytesInput#size()

The following examples show how to use org.apache.parquet.bytes.BytesInput#size(). The snippets are taken from open-source projects; the source file, project, and license are listed above each example.
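
Before the project examples, here is a minimal, self-contained sketch of the call itself. It is an illustration written for this page (class and variable names are invented), not code from any of the projects below.

import org.apache.parquet.bytes.BytesInput;

public class BytesInputSizeSketch {
    public static void main(String[] args) {
        byte[] payload = new byte[] {1, 2, 3, 4};
        // BytesInput.from(byte[]) wraps the array; size() reports its length in bytes as a long
        BytesInput bytes = BytesInput.from(payload);
        System.out.println(bytes.size()); // 4
        // concat() sums the sizes of its parts
        BytesInput combined = BytesInput.concat(BytesInput.from(payload), BytesInput.from(payload));
        System.out.println(combined.size()); // 8
    }
}
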
Example 1
Source File: ParquetDataOutput.java    From presto with Apache License 2.0
static ParquetDataOutput createDataOutput(BytesInput bytesInput)
{
    requireNonNull(bytesInput, "bytesInput is null");
    return new ParquetDataOutput()
    {
        @Override
        public long size()
        {
            return bytesInput.size();
        }

        @Override
        public void writeData(SliceOutput sliceOutput)
        {
            try {
                bytesInput.writeAllTo(sliceOutput);
            }
            catch (IOException e) {
                throw new RuntimeException(e);
            }
        }
    };
}
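
A hypothetical caller of the factory above could look like the following sketch. It assumes Airlift's DynamicSliceOutput as the SliceOutput implementation and is not taken from the Presto sources.

import io.airlift.slice.DynamicSliceOutput;
import org.apache.parquet.bytes.BytesInput;

// Hypothetical usage of createDataOutput(BytesInput)
static void writeToSlice() {
    BytesInput input = BytesInput.from(new byte[] {10, 20, 30});
    ParquetDataOutput output = createDataOutput(input);
    // size() on the wrapper delegates straight to BytesInput#size()
    DynamicSliceOutput sliceOutput = new DynamicSliceOutput(Math.toIntExact(output.size()));
    output.writeData(sliceOutput);
    // sliceOutput now holds the same three bytes
}
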
 
Example 2
Source File: ParquetColumnChunkPageWriteStore.java    From Bats with Apache License 2.0
@Override
public void writePage(BytesInput bytes,
                      int valueCount,
                      Statistics statistics,
                      Encoding rlEncoding,
                      Encoding dlEncoding,
                      Encoding valuesEncoding) throws IOException {
  long uncompressedSize = bytes.size();
  // Parquet library creates bad metadata if the uncompressed or compressed size of a page exceeds Integer.MAX_VALUE
  if (uncompressedSize > Integer.MAX_VALUE) {
    throw new ParquetEncodingException(
        "Cannot write page larger than Integer.MAX_VALUE bytes: " +
            uncompressedSize);
  }
  BytesInput compressedBytes = compressor.compress(bytes);
  long compressedSize = compressedBytes.size();
  if (compressedSize > Integer.MAX_VALUE) {
    throw new ParquetEncodingException(
        "Cannot write compressed page larger than Integer.MAX_VALUE bytes: "
            + compressedSize);
  }
  parquetMetadataConverter.writeDataPageHeader(
      (int)uncompressedSize,
      (int)compressedSize,
      valueCount,
      statistics,
      rlEncoding,
      dlEncoding,
      valuesEncoding,
      buf);
  this.uncompressedLength += uncompressedSize;
  this.compressedLength += compressedSize;
  this.totalValueCount += valueCount;
  this.pageCount += 1;
  this.totalStatistics.mergeStatistics(statistics);
  compressedBytes.writeAllTo(buf);
  rlEncodings.add(rlEncoding);
  dlEncodings.add(dlEncoding);
  dataEncodings.add(valuesEncoding);
}
 
Example 3
Source File: ParquetColumnChunkPageWriteStore.java    From Bats with Apache License 2.0
@Override
public void writeDictionaryPage(DictionaryPage dictionaryPage) throws IOException {
  if (this.dictionaryPage != null) {
    throw new ParquetEncodingException("Only one dictionary page is allowed");
  }
  BytesInput dictionaryBytes = dictionaryPage.getBytes();
  int uncompressedSize = (int)dictionaryBytes.size();
  BytesInput compressedBytes = compressor.compress(dictionaryBytes);
  this.dictionaryPage = new DictionaryPage(BytesInput.copy(compressedBytes), uncompressedSize,
      dictionaryPage.getDictionarySize(), dictionaryPage.getEncoding());
}
 
Example 4
Source File: MemPageWriter.java    From parquet-mr with Apache License 2.0
@Override
public void writePage(BytesInput bytesInput, int valueCount, Statistics statistics, Encoding rlEncoding, Encoding dlEncoding, Encoding valuesEncoding)
    throws IOException {
  if (valueCount == 0) {
    throw new ParquetEncodingException("illegal page of 0 values");
  }
  memSize += bytesInput.size();
  pages.add(new DataPageV1(BytesInput.copy(bytesInput), valueCount, (int)bytesInput.size(), statistics, rlEncoding, dlEncoding, valuesEncoding));
  totalValueCount += valueCount;
  LOG.debug("page written for {} bytes and {} records", bytesInput.size(), valueCount);
}
 
Example 5
Source File: MemPageWriter.java    From parquet-mr with Apache License 2.0
@Override
public void writePageV2(int rowCount, int nullCount, int valueCount,
    BytesInput repetitionLevels, BytesInput definitionLevels,
    Encoding dataEncoding, BytesInput data, Statistics<?> statistics) throws IOException {
  if (valueCount == 0) {
    throw new ParquetEncodingException("illegal page of 0 values");
  }
  long size = repetitionLevels.size() + definitionLevels.size() + data.size();
  memSize += size;
  pages.add(DataPageV2.uncompressed(rowCount, nullCount, valueCount, copy(repetitionLevels), copy(definitionLevels), dataEncoding, copy(data), statistics));
  totalValueCount += valueCount;
  LOG.debug("page written for {} bytes and {} records", size, valueCount);
}
 
Example 6
Source File: ParquetFileWriter.java    From parquet-mr with Apache License 2.0
/**
 * writes a single page
 * @param valueCount count of values
 * @param uncompressedPageSize the size of the data once uncompressed
 * @param bytes the compressed data for the page without header
 * @param rlEncoding encoding of the repetition level
 * @param dlEncoding encoding of the definition level
 * @param valuesEncoding encoding of values
 * @throws IOException if there is an error while writing
 */
@Deprecated
public void writeDataPage(
    int valueCount, int uncompressedPageSize,
    BytesInput bytes,
    Encoding rlEncoding,
    Encoding dlEncoding,
    Encoding valuesEncoding) throws IOException {
  state = state.write();
  // We are unable to build indexes without rowCount so skip them for this column
  offsetIndexBuilder = OffsetIndexBuilder.getNoOpBuilder();
  columnIndexBuilder = ColumnIndexBuilder.getNoOpBuilder();
  long beforeHeader = out.getPos();
  LOG.debug("{}: write data page: {} values", beforeHeader, valueCount);
  int compressedPageSize = (int)bytes.size();
  metadataConverter.writeDataPageV1Header(
      uncompressedPageSize, compressedPageSize,
      valueCount,
      rlEncoding,
      dlEncoding,
      valuesEncoding,
      out);
  long headerSize = out.getPos() - beforeHeader;
  this.uncompressedLength += uncompressedPageSize + headerSize;
  this.compressedLength += compressedPageSize + headerSize;
  LOG.debug("{}: write data page content {}", out.getPos(), compressedPageSize);
  bytes.writeAllTo(out);
  encodingStatsBuilder.addDataEncoding(valuesEncoding);
  currentEncodings.add(rlEncoding);
  currentEncodings.add(dlEncoding);
  currentEncodings.add(valuesEncoding);
}
 
Example 7
Source File: ColumnChunkPageWriteStore.java    From parquet-mr with Apache License 2.0
@Override
public void writeDictionaryPage(DictionaryPage dictionaryPage) throws IOException {
  if (this.dictionaryPage != null) {
    throw new ParquetEncodingException("Only one dictionary page is allowed");
  }
  BytesInput dictionaryBytes = dictionaryPage.getBytes();
  int uncompressedSize = (int)dictionaryBytes.size();
  BytesInput compressedBytes = compressor.compress(dictionaryBytes);
  this.dictionaryPage = new DictionaryPage(BytesInput.copy(compressedBytes), uncompressedSize, dictionaryPage.getDictionarySize(), dictionaryPage.getEncoding());
}
 
Example 8
Source File: TestZstandardCodec.java    From parquet-mr with Apache License 2.0
private BytesInput compress(ZstandardCodec codec, BytesInput bytes) throws IOException {
  ByteArrayOutputStream compressedOutBuffer = new ByteArrayOutputStream((int)bytes.size());
  CompressionOutputStream cos = codec.createOutputStream(compressedOutBuffer, null);
  bytes.writeAllTo(cos);
  cos.close();
  return BytesInput.from(compressedOutBuffer);
}
 
Example 9
Source File: PrimitiveColumnWriter.java    From presto with Apache License 2.0
private void flushCurrentPageToBuffer()
        throws IOException
{
    ImmutableList.Builder<ParquetDataOutput> outputDataStreams = ImmutableList.builder();

    BytesInput bytes = primitiveValueWriter.getBytes();
    ParquetDataOutput repetitions = createDataOutput(copy(repetitionLevelEncoder.toBytes()));
    ParquetDataOutput definitions = createDataOutput(copy(definitionLevelEncoder.toBytes()));

    // encodings.add must be called after primitiveValueWriter.getBytes() and before primitiveValueWriter.reset()
    encodings.add(primitiveValueWriter.getEncoding());

    long uncompressedSize = bytes.size() + repetitions.size() + definitions.size();

    ParquetDataOutput data;
    long compressedSize;
    if (compressor != null) {
        data = compressor.compress(bytes);
        compressedSize = data.size() + repetitions.size() + definitions.size();
    }
    else {
        data = createDataOutput(copy(bytes));
        compressedSize = uncompressedSize;
    }

    ByteArrayOutputStream pageHeaderOutputStream = new ByteArrayOutputStream();

    Statistics<?> statistics = primitiveValueWriter.getStatistics();
    statistics.incrementNumNulls(currentPageNullCounts);

    columnStatistics.mergeStatistics(statistics);

    parquetMetadataConverter.writeDataPageV2Header((int) uncompressedSize,
            (int) compressedSize,
            currentPageRows,
            currentPageNullCounts,
            currentPageRowCount,
            statistics,
            primitiveValueWriter.getEncoding(),
            (int) repetitions.size(),
            (int) definitions.size(),
            pageHeaderOutputStream);

    ParquetDataOutput pageHeader = createDataOutput(Slices.wrappedBuffer(pageHeaderOutputStream.toByteArray()));
    outputDataStreams.add(pageHeader);
    outputDataStreams.add(repetitions);
    outputDataStreams.add(definitions);
    outputDataStreams.add(data);

    List<ParquetDataOutput> dataOutputs = outputDataStreams.build();

    // update total stats
    totalCompressedSize += pageHeader.size() + compressedSize;
    totalUnCompressedSize += pageHeader.size() + uncompressedSize;
    totalRows += currentPageRows;

    pageBuffer.addAll(dataOutputs);

    // reset page stats
    currentPageRows = 0;
    currentPageNullCounts = 0;
    currentPageRowCount = 0;

    definitionLevelEncoder.reset();
    repetitionLevelEncoder.reset();
    primitiveValueWriter.reset();
}
 
Example 10
Source File: ParquetFileWriter.java    From parquet-mr with Apache License 2.0
private void innerWriteDataPage(
    int valueCount, int uncompressedPageSize,
    BytesInput bytes,
    Statistics statistics,
    Encoding rlEncoding,
    Encoding dlEncoding,
    Encoding valuesEncoding) throws IOException {
  state = state.write();
  long beforeHeader = out.getPos();
  if (firstPageOffset == -1) {
    firstPageOffset = beforeHeader;
  }
  LOG.debug("{}: write data page: {} values", beforeHeader, valueCount);
  int compressedPageSize = (int) bytes.size();
  if (pageWriteChecksumEnabled) {
    crc.reset();
    crc.update(bytes.toByteArray());
    metadataConverter.writeDataPageV1Header(
      uncompressedPageSize, compressedPageSize,
      valueCount,
      rlEncoding,
      dlEncoding,
      valuesEncoding,
      (int) crc.getValue(),
      out);
  } else {
    metadataConverter.writeDataPageV1Header(
      uncompressedPageSize, compressedPageSize,
      valueCount,
      rlEncoding,
      dlEncoding,
      valuesEncoding,
      out);
  }
  long headerSize = out.getPos() - beforeHeader;
  this.uncompressedLength += uncompressedPageSize + headerSize;
  this.compressedLength += compressedPageSize + headerSize;
  LOG.debug("{}: write data page content {}", out.getPos(), compressedPageSize);
  bytes.writeAllTo(out);

  // Copying the statistics if it is not initialized yet so we have the correct typed one
  if (currentStatistics == null) {
    currentStatistics = statistics.copy();
  } else {
    currentStatistics.mergeStatistics(statistics);
  }

  columnIndexBuilder.add(statistics);

  encodingStatsBuilder.addDataEncoding(valuesEncoding);
  currentEncodings.add(rlEncoding);
  currentEncodings.add(dlEncoding);
  currentEncodings.add(valuesEncoding);
}
 
Example 11
Source File: ParquetFileWriter.java    From parquet-mr with Apache License 2.0
/**
 * Writes a column chunk at once
 * @param descriptor the descriptor of the column
 * @param valueCount the value count in this column
 * @param compressionCodecName the name of the compression codec used for compressing the pages
 * @param dictionaryPage the dictionary page for this column chunk (might be null)
 * @param bytes the encoded pages including page headers to be written as is
 * @param uncompressedTotalPageSize total uncompressed size (without page headers)
 * @param compressedTotalPageSize total compressed size (without page headers)
 * @param totalStats accumulated statistics for the column chunk
 * @param columnIndexBuilder the builder object for the column index
 * @param offsetIndexBuilder the builder object for the offset index
 * @param bloomFilter the bloom filter for this column
 * @param rlEncodings the RL encodings used in this column chunk
 * @param dlEncodings the DL encodings used in this column chunk
 * @param dataEncodings the data encodings used in this column chunk
 * @throws IOException if there is an error while writing
 */
void writeColumnChunk(ColumnDescriptor descriptor,
    long valueCount,
    CompressionCodecName compressionCodecName,
    DictionaryPage dictionaryPage,
    BytesInput bytes,
    long uncompressedTotalPageSize,
    long compressedTotalPageSize,
    Statistics<?> totalStats,
    ColumnIndexBuilder columnIndexBuilder,
    OffsetIndexBuilder offsetIndexBuilder,
    BloomFilter bloomFilter,
    Set<Encoding> rlEncodings,
    Set<Encoding> dlEncodings,
    List<Encoding> dataEncodings) throws IOException {
  startColumn(descriptor, valueCount, compressionCodecName);

  state = state.write();
  if (dictionaryPage != null) {
    writeDictionaryPage(dictionaryPage);
  }

  if (bloomFilter != null) {
    // write the bloom filter only if at least one data page is not dictionary encoded
    boolean isWriteBloomFilter = false;
    for (Encoding encoding : dataEncodings) {
      if (encoding != Encoding.RLE_DICTIONARY) {
        isWriteBloomFilter = true;
        break;
      }
    }
    if (isWriteBloomFilter) {
      currentBloomFilters.put(String.join(".", descriptor.getPath()), bloomFilter);
    }
  }
  LOG.debug("{}: write data pages", out.getPos());
  long headersSize = bytes.size() - compressedTotalPageSize;
  this.uncompressedLength += uncompressedTotalPageSize + headersSize;
  this.compressedLength += compressedTotalPageSize + headersSize;
  LOG.debug("{}: write data pages content", out.getPos());
  firstPageOffset = out.getPos();
  bytes.writeAllTo(out);
  encodingStatsBuilder.addDataEncodings(dataEncodings);
  if (rlEncodings.isEmpty()) {
    encodingStatsBuilder.withV2Pages();
  }
  currentEncodings.addAll(rlEncodings);
  currentEncodings.addAll(dlEncodings);
  currentEncodings.addAll(dataEncodings);
  currentStatistics = totalStats;

  this.columnIndexBuilder = columnIndexBuilder;
  this.offsetIndexBuilder = offsetIndexBuilder;

  endColumn();
}
 
Example 12
Source File: ColumnChunkPageWriteStore.java    From parquet-mr with Apache License 2.0
@Override
public void writePage(BytesInput bytes,
                      int valueCount,
                      int rowCount,
                      Statistics statistics,
                      Encoding rlEncoding,
                      Encoding dlEncoding,
                      Encoding valuesEncoding) throws IOException {
  long uncompressedSize = bytes.size();
  if (uncompressedSize > Integer.MAX_VALUE) {
    throw new ParquetEncodingException(
        "Cannot write page larger than Integer.MAX_VALUE bytes: " +
            uncompressedSize);
  }
  BytesInput compressedBytes = compressor.compress(bytes);
  long compressedSize = compressedBytes.size();
  if (compressedSize > Integer.MAX_VALUE) {
    throw new ParquetEncodingException(
        "Cannot write compressed page larger than Integer.MAX_VALUE bytes: "
            + compressedSize);
  }
  tempOutputStream.reset();
  if (pageWriteChecksumEnabled) {
    crc.reset();
    crc.update(compressedBytes.toByteArray());
    parquetMetadataConverter.writeDataPageV1Header(
      (int)uncompressedSize,
      (int)compressedSize,
      valueCount,
      rlEncoding,
      dlEncoding,
      valuesEncoding,
      (int) crc.getValue(),
      tempOutputStream);
  } else {
    parquetMetadataConverter.writeDataPageV1Header(
      (int)uncompressedSize,
      (int)compressedSize,
      valueCount,
      rlEncoding,
      dlEncoding,
      valuesEncoding,
      tempOutputStream);
  }
  this.uncompressedLength += uncompressedSize;
  this.compressedLength += compressedSize;
  this.totalValueCount += valueCount;
  this.pageCount += 1;

  // Copying the statistics if it is not initialized yet so we have the correct typed one
  if (totalStatistics == null) {
    totalStatistics = statistics.copy();
  } else {
    totalStatistics.mergeStatistics(statistics);
  }

  columnIndexBuilder.add(statistics);
  offsetIndexBuilder.add(toIntWithCheck(tempOutputStream.size() + compressedSize), rowCount);

  // by concatenating before collecting instead of collecting twice,
  // we only allocate one buffer to copy into instead of multiple.
  buf.collect(BytesInput.concat(BytesInput.from(tempOutputStream), compressedBytes));
  rlEncodings.add(rlEncoding);
  dlEncodings.add(dlEncoding);
  dataEncodings.add(valuesEncoding);
}
 
Example 13
Source File: DictionaryPage.java    From parquet-mr with Apache License 2.0
/**
 * creates an uncompressed page
 * @param bytes the content of the page
 * @param dictionarySize the value count in the dictionary
 * @param encoding the encoding used
 */
public DictionaryPage(BytesInput bytes, int dictionarySize, Encoding encoding) {
  this(bytes, (int)bytes.size(), dictionarySize, encoding); // TODO: fix sizes long or int
}
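
The TODO in Example 13 points at a detail that runs through most of the examples above: BytesInput#size() returns a long, while page headers and the DictionaryPage constructor take int sizes, so callers either cast directly or check the range first (as Examples 2 and 12 do). A hypothetical overflow-safe helper, written for this page rather than taken from parquet-mr, might look like this:

// Narrow BytesInput#size() to an int, failing loudly instead of overflowing
static int sizeAsInt(BytesInput bytes) {
    long size = bytes.size();
    if (size > Integer.MAX_VALUE) {
        throw new ParquetEncodingException("Page of " + size + " bytes does not fit in an int");
    }
    return (int) size;
}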