Java Code Examples for org.apache.parquet.column.statistics.Statistics#copy()

The following examples show how to use org.apache.parquet.column.statistics.Statistics#copy(). All four are taken from the Apache parquet-mr project and illustrate the same pattern: a writer copies the first page's Statistics to obtain a correctly typed accumulator, then merges each subsequent page's statistics into it.
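Before the full examples, here is a minimal, self-contained sketch of that pattern. It is not taken from parquet-mr itself; Statistics.createStats, the INT32 column type, and the column name "col" are assumptions made only so the demo compiles on its own.

import org.apache.parquet.column.statistics.Statistics;
import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName;
import org.apache.parquet.schema.Types;

public class StatisticsCopyDemo {
  public static void main(String[] args) {
    // Per-page statistics for a hypothetical required INT32 column.
    Statistics<?> page1 = Statistics.createStats(
        Types.required(PrimitiveTypeName.INT32).named("col"));
    page1.updateStats(3);
    page1.updateStats(7);

    Statistics<?> page2 = Statistics.createStats(
        Types.required(PrimitiveTypeName.INT32).named("col"));
    page2.updateStats(-1);

    // copy() returns an independent, correctly typed Statistics instance,
    // so merging later pages cannot mutate page1's statistics.
    Statistics<?> chunkStats = page1.copy();
    chunkStats.mergeStatistics(page2);

    System.out.println(chunkStats); // accumulated range should be min -1, max 7
  }
}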
Example 1
Source File: ParquetFileWriter.java    From parquet-mr with Apache License 2.0
private void innerWriteDataPage(
    int valueCount, int uncompressedPageSize,
    BytesInput bytes,
    Statistics<?> statistics,
    Encoding rlEncoding,
    Encoding dlEncoding,
    Encoding valuesEncoding) throws IOException {
  state = state.write();
  long beforeHeader = out.getPos();
  if (firstPageOffset == -1) {
    firstPageOffset = beforeHeader;
  }
  LOG.debug("{}: write data page: {} values", beforeHeader, valueCount);
  int compressedPageSize = (int) bytes.size();
  if (pageWriteChecksumEnabled) {
    crc.reset();
    crc.update(bytes.toByteArray());
    metadataConverter.writeDataPageV1Header(
      uncompressedPageSize, compressedPageSize,
      valueCount,
      rlEncoding,
      dlEncoding,
      valuesEncoding,
      (int) crc.getValue(),
      out);
  } else {
    metadataConverter.writeDataPageV1Header(
      uncompressedPageSize, compressedPageSize,
      valueCount,
      rlEncoding,
      dlEncoding,
      valuesEncoding,
      out);
  }
  long headerSize = out.getPos() - beforeHeader;
  this.uncompressedLength += uncompressedPageSize + headerSize;
  this.compressedLength += compressedPageSize + headerSize;
  LOG.debug("{}: write data page content {}", out.getPos(), compressedPageSize);
  bytes.writeAllTo(out);

  // Copy the statistics if they are not initialized yet so we have the correctly typed instance
  if (currentStatistics == null) {
    currentStatistics = statistics.copy();
  } else {
    currentStatistics.mergeStatistics(statistics);
  }

  columnIndexBuilder.add(statistics);

  encodingStatsBuilder.addDataEncoding(valuesEncoding);
  currentEncodings.add(rlEncoding);
  currentEncodings.add(dlEncoding);
  currentEncodings.add(valuesEncoding);
}
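The copy near the end is the point of this example: currentStatistics starts out null because the writer does not know the column's concrete Statistics subclass (IntStatistics, BinaryStatistics, and so on) up front. Copying the first page's statistics produces an independent accumulator of the right concrete type, and every later page is folded in with mergeStatistics() rather than aliasing the caller's mutable object.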
 
Example 2
Source File: ParquetFileWriter.java    From parquet-mr with Apache License 2.0
/**
 * Writes a single v2 data page
 * @param rowCount count of rows
 * @param nullCount count of nulls
 * @param valueCount count of values
 * @param repetitionLevels repetition level bytes
 * @param definitionLevels definition level bytes
 * @param dataEncoding encoding for data
 * @param compressedData compressed data bytes
 * @param uncompressedDataSize the size of uncompressed data
 * @param statistics the statistics of the page
 * @throws IOException if an I/O error occurs while writing the file
 */
public void writeDataPageV2(int rowCount, int nullCount, int valueCount,
                            BytesInput repetitionLevels,
                            BytesInput definitionLevels,
                            Encoding dataEncoding,
                            BytesInput compressedData,
                            int uncompressedDataSize,
                            Statistics<?> statistics) throws IOException {
  state = state.write();
  int rlByteLength = toIntWithCheck(repetitionLevels.size());
  int dlByteLength = toIntWithCheck(definitionLevels.size());

  int compressedSize = toIntWithCheck(
    compressedData.size() + repetitionLevels.size() + definitionLevels.size()
  );

  int uncompressedSize = toIntWithCheck(
    uncompressedDataSize + repetitionLevels.size() + definitionLevels.size()
  );

  long beforeHeader = out.getPos();
  if (firstPageOffset == -1) {
    firstPageOffset = beforeHeader;
  }

  metadataConverter.writeDataPageV2Header(
    uncompressedSize, compressedSize,
    valueCount, nullCount, rowCount,
    dataEncoding,
    rlByteLength,
    dlByteLength,
    out);

  long headersSize = out.getPos() - beforeHeader;
  this.uncompressedLength += uncompressedSize + headersSize;
  this.compressedLength += compressedSize + headersSize;

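  // Copy the statistics if they are not initialized yet so we have the correctly typed instance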
  if (currentStatistics == null) {
    currentStatistics = statistics.copy();
  } else {
    currentStatistics.mergeStatistics(statistics);
  }

  columnIndexBuilder.add(statistics);
  currentEncodings.add(dataEncoding);
  encodingStatsBuilder.addDataEncoding(dataEncoding);

  BytesInput.concat(repetitionLevels, definitionLevels, compressedData)
    .writeAllTo(out);

  offsetIndexBuilder.add((int) (out.getPos() - beforeHeader), rowCount);
}
 
Example 3
Source File: ColumnChunkPageWriteStore.java    From parquet-mr with Apache License 2.0
@Override
public void writePage(BytesInput bytes,
                      int valueCount,
                      int rowCount,
                      Statistics<?> statistics,
                      Encoding rlEncoding,
                      Encoding dlEncoding,
                      Encoding valuesEncoding) throws IOException {
  long uncompressedSize = bytes.size();
  if (uncompressedSize > Integer.MAX_VALUE) {
    throw new ParquetEncodingException(
        "Cannot write page larger than Integer.MAX_VALUE bytes: " +
            uncompressedSize);
  }
  BytesInput compressedBytes = compressor.compress(bytes);
  long compressedSize = compressedBytes.size();
  if (compressedSize > Integer.MAX_VALUE) {
    throw new ParquetEncodingException(
        "Cannot write compressed page larger than Integer.MAX_VALUE bytes: "
            + compressedSize);
  }
  tempOutputStream.reset();
  if (pageWriteChecksumEnabled) {
    crc.reset();
    crc.update(compressedBytes.toByteArray());
    parquetMetadataConverter.writeDataPageV1Header(
      (int)uncompressedSize,
      (int)compressedSize,
      valueCount,
      rlEncoding,
      dlEncoding,
      valuesEncoding,
      (int) crc.getValue(),
      tempOutputStream);
  } else {
    parquetMetadataConverter.writeDataPageV1Header(
      (int)uncompressedSize,
      (int)compressedSize,
      valueCount,
      rlEncoding,
      dlEncoding,
      valuesEncoding,
      tempOutputStream);
  }
  this.uncompressedLength += uncompressedSize;
  this.compressedLength += compressedSize;
  this.totalValueCount += valueCount;
  this.pageCount += 1;

  // Copy the statistics if they are not initialized yet so we have the correctly typed instance
  if (totalStatistics == null) {
    totalStatistics = statistics.copy();
  } else {
    totalStatistics.mergeStatistics(statistics);
  }

  columnIndexBuilder.add(statistics);
  offsetIndexBuilder.add(toIntWithCheck(tempOutputStream.size() + compressedSize), rowCount);

  // by concatenating before collecting instead of collecting twice,
  // we only allocate one buffer to copy into instead of multiple.
  buf.collect(BytesInput.concat(BytesInput.from(tempOutputStream), compressedBytes));
  rlEncodings.add(rlEncoding);
  dlEncodings.add(dlEncoding);
  dataEncodings.add(valuesEncoding);
}
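When pageWriteChecksumEnabled is set, the v1 writers in Examples 1 and 3 checksum the compressed page bytes with java.util.zip.CRC32 before writing the header. A standalone illustration of those reset/update/getValue calls:

import java.util.zip.CRC32;

public class PageChecksum {
  // Mirrors the crc.reset() / crc.update() / crc.getValue() sequence above.
  static int pageCrc(byte[] compressedPageBytes) {
    CRC32 crc = new CRC32();
    crc.reset();                      // safe to reuse one instance per writer
    crc.update(compressedPageBytes);  // checksum the compressed page bytes
    // getValue() returns the unsigned 32-bit CRC in a long; the page header
    // stores it as a signed int, hence the cast.
    return (int) crc.getValue();
  }
}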
 
Example 4
Source File: ColumnChunkPageWriteStore.java    From parquet-mr with Apache License 2.0
@Override
public void writePageV2(
    int rowCount, int nullCount, int valueCount,
    BytesInput repetitionLevels, BytesInput definitionLevels,
    Encoding dataEncoding, BytesInput data,
    Statistics<?> statistics) throws IOException {
  int rlByteLength = toIntWithCheck(repetitionLevels.size());
  int dlByteLength = toIntWithCheck(definitionLevels.size());
  int uncompressedSize = toIntWithCheck(
      data.size() + repetitionLevels.size() + definitionLevels.size()
  );
  // TODO: decide if we compress
  BytesInput compressedData = compressor.compress(data);
  int compressedSize = toIntWithCheck(
      compressedData.size() + repetitionLevels.size() + definitionLevels.size()
  );
  tempOutputStream.reset();
  parquetMetadataConverter.writeDataPageV2Header(
      uncompressedSize, compressedSize,
      valueCount, nullCount, rowCount,
      dataEncoding,
      rlByteLength,
      dlByteLength,
      tempOutputStream);
  this.uncompressedLength += uncompressedSize;
  this.compressedLength += compressedSize;
  this.totalValueCount += valueCount;
  this.pageCount += 1;

  // Copy the statistics if they are not initialized yet so we have the correctly typed instance
  if (totalStatistics == null) {
    totalStatistics = statistics.copy();
  } else {
    totalStatistics.mergeStatistics(statistics);
  }

  columnIndexBuilder.add(statistics);
  offsetIndexBuilder.add(toIntWithCheck((long) tempOutputStream.size() + compressedSize), rowCount);

  // by concatenating before collecting instead of collecting twice,
  // we only allocate one buffer to copy into instead of multiple.
  buf.collect(
      BytesInput.concat(
          BytesInput.from(tempOutputStream),
          repetitionLevels,
          definitionLevels,
          compressedData)
  );
  dataEncodings.add(dataEncoding);
}
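Across all four examples the Statistics#copy() usage is identical: the first page's statistics are copied into a correctly typed accumulator (currentStatistics or totalStatistics), later pages are merged into it with mergeStatistics(), and the untouched per-page statistics are still passed to columnIndexBuilder.add() so the column index records per-page min/max values.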