org.apache.parquet.column.statistics.Statistics Java Examples

The following examples show how to use org.apache.parquet.column.statistics.Statistics. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: CheckParquet251Command.java    From parquet-mr with Apache License 2.0 6 votes vote down vote up
/**
 * Reads the statistics attached to a data page, whichever page version it is.
 *
 * @param page the data page to inspect; it is asked to dispatch to the matching visitor method
 * @param <T> the value type the caller expects the statistics to describe
 * @return the result of {@code getStatistics()} on the page, cast to {@code Statistics<T>}
 */
private static <T extends Comparable<T>>
Statistics<T> getStatisticsFromPageHeader(DataPage page) {
  final DataPage.Visitor<Statistics<T>> statisticsExtractor = new DataPage.Visitor<Statistics<T>>() {
    @Override
    @SuppressWarnings("unchecked")
    public Statistics<T> visit(DataPageV1 v1Page) {
      // Unchecked cast: T is chosen by the caller and is not verified here.
      return (Statistics<T>) v1Page.getStatistics();
    }

    @Override
    @SuppressWarnings("unchecked")
    public Statistics<T> visit(DataPageV2 v2Page) {
      // Unchecked cast: T is chosen by the caller and is not verified here.
      return (Statistics<T>) v2Page.getStatistics();
    }
  };
  return page.accept(statisticsExtractor);
}
 
Example #2
Source File: ColumnWriterV2.java    From parquet-mr with Apache License 2.0 6 votes vote down vote up
/**
 * Forwards one page to {@code pageWriter.writePageV2}.
 *
 * @param rowCount number of rows passed through to the page writer
 * @param valueCount number of values passed through to the page writer
 * @param statistics page statistics; its null count is also passed as a separate argument
 * @param repetitionLevels writer whose bytes are passed as the repetition levels
 * @param definitionLevels writer whose bytes are passed as the definition levels
 * @param values writer supplying both the value bytes and the value encoding
 * @throws IOException declared by this method; presumably raised by the page writer
 */
@Override
void writePage(int rowCount, int valueCount, Statistics<?> statistics, ValuesWriter repetitionLevels,
    ValuesWriter definitionLevels, ValuesWriter values) throws IOException {
  // TODO: rework this API. The bytes shall be retrieved before the encoding (encoding might be different otherwise)
  final BytesInput valueBytes = values.getBytes();
  final Encoding valueEncoding = values.getEncoding();

  // Math.toIntExact throws ArithmeticException instead of truncating a null count that does not fit an int.
  final int nullCount = Math.toIntExact(statistics.getNumNulls());
  final BytesInput repetitionLevelBytes = repetitionLevels.getBytes();
  final BytesInput definitionLevelBytes = definitionLevels.getBytes();

  pageWriter.writePageV2(
      rowCount,
      nullCount,
      valueCount,
      repetitionLevelBytes,
      definitionLevelBytes,
      valueEncoding,
      valueBytes,
      statistics);
}
 
Example #3
Source File: ColumnIndexBuilder.java    From parquet-mr with Apache License 2.0 6 votes vote down vote up
/**
 * Records the statistics of one more page in this builder.
 * <p>
 * A page without any non-null value is only flagged in {@code nullPages}; for every other
 * page the min/max pair is stored, the page index is remembered and the size of both values
 * is added to {@code minMaxSize}. The null count is recorded and the page index advanced in
 * either case.
 *
 * @param stats
 *          the statistics to be added
 */
public void add(Statistics<?> stats) {
  if (!stats.hasNonNullValue()) {
    // Nothing but nulls: no min/max is stored for this page.
    nullPages.add(true);
  } else {
    nullPages.add(false);
    final Object pageMin = stats.genericGetMin();
    final Object pageMax = stats.genericGetMax();
    addMinMax(pageMin, pageMax);
    pageIndexes.add(nextPageIndex);
    minMaxSize += sizeOf(pageMin);
    minMaxSize += sizeOf(pageMax);
  }
  nullCounts.add(stats.getNumNulls());
  ++nextPageIndex;
}
 
Example #4
Source File: DataPageV2.java    From presto with Apache License 2.0 6 votes vote down vote up
/**
 * Creates a V2 data page.
 * <p>
 * The three slices must be non-null; {@code dataEncoding} and {@code statistics} are stored
 * without a null check.
 *
 * @param rowCount stored as the page's row count
 * @param nullCount stored as the page's null count
 * @param valueCount passed to the superclass constructor
 * @param repetitionLevels repetition levels slice, must not be null
 * @param definitionLevels definition levels slice, must not be null
 * @param dataEncoding encoding stored for the data
 * @param slice page data slice, must not be null
 * @param uncompressedSize passed to the superclass constructor
 * @param statistics statistics stored for the page
 * @param isCompressed stored compression flag
 * @throws NullPointerException if any of the three slices is null
 */
public DataPageV2(
        int rowCount,
        int nullCount,
        int valueCount,
        Slice repetitionLevels,
        Slice definitionLevels,
        ParquetEncoding dataEncoding,
        Slice slice,
        int uncompressedSize,
        Statistics<?> statistics,
        boolean isCompressed)
{
    super(uncompressedSize, valueCount);

    // Null-checked members, validated in declaration order of the parameters.
    this.repetitionLevels = requireNonNull(repetitionLevels, "repetitionLevels slice is null");
    this.definitionLevels = requireNonNull(definitionLevels, "definitionLevels slice is null");
    this.slice = requireNonNull(slice, "slice is null");

    // Members stored as given.
    this.rowCount = rowCount;
    this.nullCount = nullCount;
    this.dataEncoding = dataEncoding;
    this.statistics = statistics;
    this.isCompressed = isCompressed;
}
 
Example #5
Source File: ParquetMetricsRowGroupFilter.java    From iceberg with Apache License 2.0 6 votes vote down vote up
/**
 * Decides whether any row can satisfy {@code ref >= lit} based on the column statistics.
 *
 * @return {@code ROWS_CANNOT_MATCH} when no value count is known for the column, when the
 *         statistics hold no non-null value, or when the column maximum is below the literal;
 *         {@code ROWS_MIGHT_MATCH} otherwise, including when statistics are missing or empty
 */
@Override
public <T> Boolean gtEq(BoundReference<T> ref, Literal<T> lit) {
  final Integer fieldId = ref.fieldId();

  if (valueCounts.get(fieldId) == null) {
    // the column is not present and is all nulls
    return ROWS_CANNOT_MATCH;
  }

  final Statistics<?> columnStats = stats.get(fieldId);
  if (columnStats == null || columnStats.isEmpty()) {
    // Without usable statistics nothing can be ruled out.
    return ROWS_MIGHT_MATCH;
  }
  if (!columnStats.hasNonNullValue()) {
    return ROWS_CANNOT_MATCH;
  }

  final T upperBound = max(columnStats, fieldId);
  if (lit.comparator().compare(upperBound, lit.value()) < 0) {
    return ROWS_CANNOT_MATCH;
  }
  return ROWS_MIGHT_MATCH;
}
 
Example #6
Source File: HdfsOffsetComputer.java    From garmadon with Apache License 2.0 6 votes vote down vote up
/**
 * Finds the largest {@code kafka_offset} maximum recorded in the Parquet footers of the given files.
 * <p>
 * Each footer block contributes the statistics maximum of its first column whose path segments
 * all equal {@code "kafka_offset"}, or {@code NO_OFFSET} when it has no such value.
 *
 * @param dateFinalEventPartitionFile files of one partition; only the map values are read
 * @return the largest contribution over all blocks, or {@code NO_OFFSET} when there is no block
 * @throws RuntimeException wrapping the {@link IOException} raised while opening a file or reading its footer
 */
protected Long getMaxOffset(Map<String, FinalEventPartitionFile> dateFinalEventPartitionFile) {
    // Get max offset from all files for a partition
    return dateFinalEventPartitionFile.values().stream()
        .flatMap(partitionFile -> {
            // The reader is closed once the block list has been obtained from the footer.
            try (ParquetFileReader footerReader = ParquetFileReader.open(fs.getConf(), partitionFile.getFilePath())) {
                return footerReader.getFooter().getBlocks().stream();
            } catch (IOException ioFailure) {
                throw new RuntimeException(ioFailure);
            }
        })
        .map(block -> block.getColumns().stream()
            .filter(chunk -> Arrays.stream(chunk.getPath().toArray())
                .allMatch(segment -> segment.equals("kafka_offset")))
            .findFirst()
            .map(ColumnChunkMetaData::getStatistics)
            .map(Statistics::genericGetMax)
            .map(Long.class::cast)
            .orElse(NO_OFFSET))
        .mapToLong(Long::longValue)
        .max()
        .orElse(NO_OFFSET);
}
 
Example #7
Source File: PrimitiveColumnWriter.java    From presto with Apache License 2.0 6 votes vote down vote up
/**
 * Creates a column writer for one primitive column.
 * <p>
 * All object arguments are null-checked. The maximum definition level and the initial column
 * statistics are taken from {@code columnDescriptor}, and the compressor is looked up from
 * {@code compressionCodecName}.
 *
 * @param pageSizeThreshold stored as given
 * @throws NullPointerException if any object argument is null
 */
public PrimitiveColumnWriter(
        Type type,
        ColumnDescriptor columnDescriptor,
        PrimitiveValueWriter primitiveValueWriter,
        RunLengthBitPackingHybridEncoder definitionLevelEncoder,
        RunLengthBitPackingHybridEncoder repetitionLevelEncoder,
        CompressionCodecName compressionCodecName,
        int pageSizeThreshold)
{
    // Column identity
    this.type = requireNonNull(type, "type is null");
    this.columnDescriptor = requireNonNull(columnDescriptor, "columnDescriptor is null");
    this.maxDefinitionLevel = columnDescriptor.getMaxDefinitionLevel();

    // Level and value encoders
    this.definitionLevelEncoder = requireNonNull(definitionLevelEncoder, "definitionLevelEncoder is null");
    this.repetitionLevelEncoder = requireNonNull(repetitionLevelEncoder, "repetitionLevelEncoder is null");
    this.primitiveValueWriter = requireNonNull(primitiveValueWriter, "primitiveValueWriter is null");
    this.encodings = new HashSet<>();

    // Compression and paging
    this.compressionCodec = requireNonNull(compressionCodecName, "compressionCodecName is null");
    this.compressor = getCompressor(compressionCodecName);
    this.pageSizeThreshold = pageSizeThreshold;

    // Fresh statistics matching the column's primitive type
    this.columnStatistics = Statistics.createStats(columnDescriptor.getPrimitiveType());
}
 
Example #8
Source File: TestParquetMetadataConverter.java    From parquet-mr with Apache License 2.0 6 votes vote down vote up
/**
 * Converts float statistics with the given helper and checks that min, max and null count
 * survive the conversion.
 *
 * @param helper performs the conversion to the format-level statistics
 */
private void testFloatStats(StatsHelper helper) {
  // make fake stats and verify the size check
  final float expectedMin = Float.MIN_VALUE;
  final float expectedMax = Float.MAX_VALUE;

  FloatStatistics floatStats = new FloatStatistics();
  floatStats.incrementNumNulls(3004);
  floatStats.updateStats(expectedMin);
  floatStats.updateStats(expectedMax);

  org.apache.parquet.format.Statistics converted = helper.toParquetStatistics(floatStats);

  // Min and max come back as bytes holding the float's int bit pattern.
  float decodedMin = Float.intBitsToFloat(BytesUtils.bytesToInt(converted.getMin()));
  Assert.assertEquals("Min should match", expectedMin, decodedMin, 0.000001);

  float decodedMax = Float.intBitsToFloat(BytesUtils.bytesToInt(converted.getMax()));
  Assert.assertEquals("Max should match", expectedMax, decodedMax, 0.000001);

  Assert.assertEquals("Num nulls should match", 3004, converted.getNull_count());
}
 
Example #9
Source File: TestParquetMetadataConverter.java    From parquet-mr with Apache License 2.0 6 votes vote down vote up
/**
 * Converts int statistics with the given helper and checks that min, max and null count
 * survive the conversion.
 *
 * @param helper performs the conversion to the format-level statistics
 */
private void testIntegerStats(StatsHelper helper) {
  // make fake stats and verify the size check
  final int expectedMin = Integer.MIN_VALUE;
  final int expectedMax = Integer.MAX_VALUE;

  IntStatistics intStats = new IntStatistics();
  intStats.incrementNumNulls(3004);
  intStats.updateStats(expectedMin);
  intStats.updateStats(expectedMax);

  org.apache.parquet.format.Statistics converted = helper.toParquetStatistics(intStats);

  int decodedMin = BytesUtils.bytesToInt(converted.getMin());
  Assert.assertEquals("Min should match", expectedMin, decodedMin);

  int decodedMax = BytesUtils.bytesToInt(converted.getMax());
  Assert.assertEquals("Max should match", expectedMax, decodedMax);

  Assert.assertEquals("Num nulls should match", 3004, converted.getNull_count());
}
 
Example #10
Source File: ColumnChunkMetaData.java    From parquet-mr with Apache License 2.0 6 votes vote down vote up
/**
 * Creates column chunk metadata whose offsets, counts and sizes are kept as {@code long} values.
 *
 * @param path column identifier
 * @param type type of the column
 * @param codec compression codec, used to look up the shared column chunk properties
 * @param encodingStats encoding statistics, passed to the superclass constructor
 * @param encodings encodings, used to look up the shared column chunk properties
 * @param statistics statistics stored for the column chunk
 * @param firstDataPageOffset stored as given
 * @param dictionaryPageOffset stored as given
 * @param valueCount stored as given
 * @param totalSize stored as given
 * @param totalUncompressedSize stored as given
 */
LongColumnChunkMetaData(
    ColumnPath path,
    PrimitiveType type,
    CompressionCodecName codec,
    EncodingStats encodingStats,
    Set<Encoding> encodings,
    Statistics statistics,
    long firstDataPageOffset,
    long dictionaryPageOffset,
    long valueCount,
    long totalSize,
    long totalUncompressedSize) {
  super(encodingStats, ColumnChunkProperties.get(path, type, codec, encodings));
  this.statistics = statistics;
  // Offsets
  this.firstDataPageOffset = firstDataPageOffset;
  this.dictionaryPageOffset = dictionaryPageOffset;
  // Counts and sizes
  this.valueCount = valueCount;
  this.totalSize = totalSize;
  this.totalUncompressedSize = totalUncompressedSize;
}
 
Example #11
Source File: ParquetMetricsRowGroupFilter.java    From iceberg with Apache License 2.0 6 votes vote down vote up
/**
 * Decides whether any row can satisfy {@code ref <= lit} based on the column statistics.
 *
 * @return {@code ROWS_CANNOT_MATCH} when no value count is known for the column, when the
 *         statistics hold no non-null value, or when the column minimum is above the literal;
 *         {@code ROWS_MIGHT_MATCH} otherwise, including when statistics are missing or empty
 */
@Override
public <T> Boolean ltEq(BoundReference<T> ref, Literal<T> lit) {
  final Integer fieldId = ref.fieldId();

  if (valueCounts.get(fieldId) == null) {
    // the column is not present and is all nulls
    return ROWS_CANNOT_MATCH;
  }

  final Statistics<?> columnStats = stats.get(fieldId);
  if (columnStats == null || columnStats.isEmpty()) {
    // Without usable statistics nothing can be ruled out.
    return ROWS_MIGHT_MATCH;
  }
  if (!columnStats.hasNonNullValue()) {
    return ROWS_CANNOT_MATCH;
  }

  final T lowerBound = min(columnStats, fieldId);
  if (lit.comparator().compare(lowerBound, lit.value()) > 0) {
    return ROWS_CANNOT_MATCH;
  }
  return ROWS_MIGHT_MATCH;
}
 
Example #12
Source File: ColumnChunkMetaData.java    From parquet-mr with Apache License 2.0 6 votes vote down vote up
/**
 * Creates column chunk metadata whose offsets, counts and sizes are kept in narrowed form:
 * every {@code long} argument goes through {@code positiveLongToInt} before being stored.
 *
 * @param path column identifier
 * @param type type of the column
 * @param codec compression codec, used to look up the shared column chunk properties
 * @param encodingStats encoding statistics, passed to the superclass constructor
 * @param encodings encodings, used to look up the shared column chunk properties
 * @param statistics statistics stored for the column chunk
 * @param firstDataPage converted with {@code positiveLongToInt}
 * @param dictionaryPageOffset converted with {@code positiveLongToInt}
 * @param valueCount converted with {@code positiveLongToInt}
 * @param totalSize converted with {@code positiveLongToInt}
 * @param totalUncompressedSize converted with {@code positiveLongToInt}
 */
IntColumnChunkMetaData(
    ColumnPath path,
    PrimitiveType type,
    CompressionCodecName codec,
    EncodingStats encodingStats,
    Set<Encoding> encodings,
    Statistics statistics,
    long firstDataPage,
    long dictionaryPageOffset,
    long valueCount,
    long totalSize,
    long totalUncompressedSize) {
  super(encodingStats, ColumnChunkProperties.get(path, type, codec, encodings));
  // Offsets
  this.firstDataPage = positiveLongToInt(firstDataPage);
  this.dictionaryPageOffset = positiveLongToInt(dictionaryPageOffset);
  // Counts and sizes
  this.valueCount = positiveLongToInt(valueCount);
  this.totalSize = positiveLongToInt(totalSize);
  this.totalUncompressedSize = positiveLongToInt(totalUncompressedSize);

  this.statistics = statistics;
}
 
Example #13
Source File: ColumnChunkMetaData.java    From parquet-mr with Apache License 2.0 6 votes vote down vote up
/**
 * Legacy factory kept for callers of the ten-argument signature.
 * <p>
 * Delegates to the eleven-argument {@code get} overload, passing {@code null} in the
 * additional fourth position and every other argument unchanged.
 */
@Deprecated
public static ColumnChunkMetaData get(
    ColumnPath path,
    PrimitiveTypeName type,
    CompressionCodecName codec,
    Set<Encoding> encodings,
    Statistics statistics,
    long firstDataPage,
    long dictionaryPageOffset,
    long valueCount,
    long totalSize,
    long totalUncompressedSize) {
  return get(
      path,
      type,
      codec,
      null, // nothing to supply for the extra argument from this signature
      encodings,
      statistics,
      firstDataPage,
      dictionaryPageOffset,
      valueCount,
      totalSize,
      totalUncompressedSize);
}
 
Example #14
Source File: TestParquetMetadataConverter.java    From parquet-mr with Apache License 2.0 6 votes vote down vote up
/**
 * Converts double statistics with the given helper and checks that min, max and null count
 * survive the conversion.
 *
 * @param helper performs the conversion to the format-level statistics
 */
private void testDoubleStats(StatsHelper helper) {
  // make fake stats and verify the size check
  final double expectedMin = Double.MIN_VALUE;
  final double expectedMax = Double.MAX_VALUE;

  DoubleStatistics doubleStats = new DoubleStatistics();
  doubleStats.incrementNumNulls(3004);
  doubleStats.updateStats(expectedMin);
  doubleStats.updateStats(expectedMax);

  org.apache.parquet.format.Statistics converted = helper.toParquetStatistics(doubleStats);

  // Min and max come back as bytes holding the double's long bit pattern.
  double decodedMin = Double.longBitsToDouble(BytesUtils.bytesToLong(converted.getMin()));
  Assert.assertEquals("Min should match", expectedMin, decodedMin, 0.000001);

  double decodedMax = Double.longBitsToDouble(BytesUtils.bytesToLong(converted.getMax()));
  Assert.assertEquals("Max should match", expectedMax, decodedMax, 0.000001);

  Assert.assertEquals("Num nulls should match", 3004, converted.getNull_count());
}
 
Example #15
Source File: DataPageV2.java    From parquet-mr with Apache License 2.0 6 votes vote down vote up
/**
 * Creates a V2 data page.
 * <p>
 * The first superclass argument is the sum of the sizes of the repetition levels, the
 * definition levels and the data, narrowed with {@code Math.toIntExact}.
 *
 * @throws ArithmeticException if that summed size does not fit in an {@code int}
 */
public DataPageV2(
    int rowCount,
    int nullCount,
    int valueCount,
    BytesInput repetitionLevels,
    BytesInput definitionLevels,
    Encoding dataEncoding,
    BytesInput data,
    int uncompressedSize,
    Statistics<?> statistics,
    boolean isCompressed) {
  super(
      Math.toIntExact(repetitionLevels.size() + definitionLevels.size() + data.size()),
      uncompressedSize,
      valueCount);
  // Counts
  this.rowCount = rowCount;
  this.nullCount = nullCount;
  // Page content
  this.repetitionLevels = repetitionLevels;
  this.definitionLevels = definitionLevels;
  this.data = data;
  this.dataEncoding = dataEncoding;
  // Metadata
  this.statistics = statistics;
  this.isCompressed = isCompressed;
}
 
Example #16
Source File: DataPageV2.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Builds a page flagged as compressed; every argument is handed to the constructor unchanged.
 *
 * @param rowCount count of rows
 * @param nullCount count of nulls
 * @param valueCount count of values
 * @param repetitionLevels RLE encoded repetition levels
 * @param definitionLevels RLE encoded definition levels
 * @param dataEncoding encoding for the data
 * @param data data encoded with dataEncoding and compressed
 * @param uncompressedSize total size uncompressed (rl + dl + data)
 * @param statistics optional statistics for this page
 * @return a compressed page
 */
public static DataPageV2 compressed(
    int rowCount, int nullCount, int valueCount,
    BytesInput repetitionLevels, BytesInput definitionLevels,
    Encoding dataEncoding, BytesInput data,
    int uncompressedSize,
    Statistics<?> statistics) {
  final boolean isCompressed = true;
  return new DataPageV2(
      rowCount,
      nullCount,
      valueCount,
      repetitionLevels,
      definitionLevels,
      dataEncoding,
      data,
      uncompressedSize,
      statistics,
      isCompressed);
}
 
Example #17
Source File: DataPageV2.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Builds a page flagged as not compressed. The uncompressed size handed to the constructor
 * is the summed size of the repetition levels, the definition levels and the data.
 *
 * @param rowCount count of rows
 * @param nullCount count of nulls
 * @param valueCount count of values
 * @param firstRowIndex the index of the first row in this page
 * @param repetitionLevels RLE encoded repetition levels
 * @param definitionLevels RLE encoded definition levels
 * @param dataEncoding encoding for the data
 * @param data data encoded with dataEncoding
 * @param statistics optional statistics for this page
 * @return an uncompressed page
 * @throws ArithmeticException if the summed size does not fit in an {@code int}
 */
public static DataPageV2 uncompressed(
    int rowCount, int nullCount, int valueCount, long firstRowIndex,
    BytesInput repetitionLevels, BytesInput definitionLevels,
    Encoding dataEncoding, BytesInput data,
    Statistics<?> statistics) {
  final int uncompressedSize =
      Math.toIntExact(repetitionLevels.size() + definitionLevels.size() + data.size());
  final boolean isCompressed = false;
  return new DataPageV2(
      rowCount,
      nullCount,
      valueCount,
      firstRowIndex,
      repetitionLevels,
      definitionLevels,
      dataEncoding,
      data,
      uncompressedSize,
      statistics,
      isCompressed);
}
 
Example #18
Source File: ParquetFileWriter.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Writes a single page and records its on-file size in the offset index.
 * <p>
 * The recorded size is the distance the output position advanced while the page was written.
 *
 * @param valueCount count of values
 * @param uncompressedPageSize the size of the data once uncompressed
 * @param bytes the compressed data for the page without header
 * @param statistics the statistics of the page
 * @param rowCount the number of rows in the page
 * @param rlEncoding encoding of the repetition level
 * @param dlEncoding encoding of the definition level
 * @param valuesEncoding encoding of values
 * @throws IOException if any I/O error occurs during writing the file
 * @throws ArithmeticException if the number of bytes written for the page does not fit in an {@code int}
 */
public void writeDataPage(
    int valueCount, int uncompressedPageSize,
    BytesInput bytes,
    Statistics statistics,
    long rowCount,
    Encoding rlEncoding,
    Encoding dlEncoding,
    Encoding valuesEncoding) throws IOException {
  final long beforeHeader = out.getPos();
  innerWriteDataPage(valueCount, uncompressedPageSize, bytes, statistics, rlEncoding, dlEncoding, valuesEncoding);

  // Narrow with an overflow check: a plain (int) cast would silently record a wrapped,
  // possibly negative, size in the offset index when more than Integer.MAX_VALUE bytes were written.
  final long writtenSize = out.getPos() - beforeHeader;
  offsetIndexBuilder.add(Math.toIntExact(writtenSize), rowCount);
}
 
Example #19
Source File: CheckParquet251Command.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
public StatsValidator(DataPage page) {
  Statistics<T> stats = getStatisticsFromPageHeader(page);
  this.comparator = stats.comparator();
  this.hasNonNull = stats.hasNonNullValue();
  if (hasNonNull) {
    this.min = stats.genericGetMin();
    this.max = stats.genericGetMax();
  } else {
    this.min = null;
    this.max = null;
  }
}
 
Example #20
Source File: PrintFooter.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Adds one column chunk's figures to the running totals kept per column descriptor,
 * creating and registering the {@code ColStats} entry the first time a descriptor is seen.
 */
private static void add(ColumnDescriptor desc, long valueCount, long size, long uncSize, Collection<Encoding> encodings, Statistics colValuesStats) {
  final ColStats existing = stats.get(desc);
  final ColStats target;
  if (existing != null) {
    target = existing;
  } else {
    // First chunk seen for this column: register a fresh accumulator.
    target = new ColStats();
    stats.put(desc, target);
  }
  target.add(valueCount, size, uncSize, encodings, colValuesStats);
}
 
Example #21
Source File: ColumnIndexBuilder.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Selects the pages that may hold rows for a negated user-defined predicate.
 * <p>
 * When nulls are accepted but no null counts are available, every page is returned.
 * Otherwise each page is tested in turn: a null page is kept only when nulls are accepted,
 * any other page is kept when nulls are accepted and it has a positive null count, or when
 * {@code inverseCanDrop} on the page's statistics returns {@code false}.
 *
 * @param udp the negated user-defined predicate
 * @return an iterator over the indexes of the pages that are kept
 */
@Override
public <T extends Comparable<T>, U extends UserDefinedPredicate<T>> PrimitiveIterator.OfInt visit(
    LogicalNotUserDefined<T, U> udp) {
  final UserDefinedPredicate<T> inversePredicate = udp.getUserDefined().getUserDefinedPredicate();
  final boolean acceptNulls = !inversePredicate.acceptsNullValue();

  if (acceptNulls && nullCounts == null) {
    // Nulls match so if we don't have null related statistics we have to return all pages
    return IndexIterator.all(getPageCount());
  }

  final IntPredicate pageFilter = new IntPredicate() {
    // Advanced once for every tested page that is not a null page, before it is used.
    private int arrayIndex = -1;

    @Override
    public boolean test(int pageIndex) {
      if (isNullPage(pageIndex)) {
        return acceptNulls;
      }
      // Must advance for every non-null page, even when the page is accepted early below.
      ++arrayIndex;
      if (acceptNulls && nullCounts[pageIndex] > 0) {
        return true;
      }
      org.apache.parquet.filter2.predicate.Statistics<T> pageStats = createStats(arrayIndex);
      return !inversePredicate.inverseCanDrop(pageStats);
    }
  };
  return IndexIterator.filter(getPageCount(), pageFilter);
}
 
Example #22
Source File: TestUtils.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Asserts that two statistics objects carry the same values.
 * <p>
 * Passes immediately when both references are the same object (which includes both being
 * {@code null}) and fails when exactly one of them is {@code null}. Otherwise {@code actual}
 * must be an instance of the class of {@code expected}, and the max bytes, min bytes and
 * null counts must be equal.
 *
 * @param message reported with every failing assertion
 * @param expected the reference statistics
 * @param actual the statistics under test
 */
public static void assertStatsValuesEqual(String message, Statistics<?> expected, Statistics<?> actual) {
  if (expected == actual) {
    return;
  }
  if (expected == null || actual == null) {
    // Exactly one side is null at this point, so this assertion always fails;
    // it is used to produce the standard expected/actual report together with the message.
    Assert.assertEquals(message, expected, actual);
  }
  Assert.assertThat(message, actual, CoreMatchers.instanceOf(expected.getClass()));
  Assert.assertArrayEquals(message, expected.getMaxBytes(), actual.getMaxBytes());
  Assert.assertArrayEquals(message, expected.getMinBytes(), actual.getMinBytes());
  Assert.assertEquals(message, expected.getNumNulls(), actual.getNumNulls());
}
 
Example #23
Source File: ParquetMetricsRowGroupFilter.java    From iceberg with Apache License 2.0 5 votes vote down vote up
/**
 * Decides whether any row can satisfy {@code ref <= lit} based on the column statistics.
 * <p>
 * Fails the precondition check when the referenced field is not found in {@code struct}.
 *
 * @return {@code ROWS_CANNOT_MATCH} when no value count is known for the column, when the
 *         statistics hold no non-null value, or when the column minimum is above the literal;
 *         {@code ROWS_MIGHT_MATCH} otherwise, including when statistics are missing or empty
 */
@Override
public <T> Boolean ltEq(BoundReference<T> ref, Literal<T> lit) {
  final Integer fieldId = ref.fieldId();
  Preconditions.checkNotNull(
      struct.field(fieldId), "Cannot filter by nested column: %s", schema.findField(fieldId));

  if (valueCounts.get(fieldId) == null) {
    // the column is not present and is all nulls
    return ROWS_CANNOT_MATCH;
  }

  final Statistics<?> columnStats = stats.get(fieldId);
  if (columnStats == null || columnStats.isEmpty()) {
    // Without usable statistics nothing can be ruled out.
    return ROWS_MIGHT_MATCH;
  }
  if (!columnStats.hasNonNullValue()) {
    return ROWS_CANNOT_MATCH;
  }

  final T lowerBound = min(columnStats, fieldId);
  if (lit.comparator().compare(lowerBound, lit.value()) > 0) {
    return ROWS_CANNOT_MATCH;
  }
  return ROWS_MIGHT_MATCH;
}
 
Example #24
Source File: TestParquetMetadataConverter.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Builds statistics for {@code type} from two {@link BigInteger} bounds converted to fixed
 * binaries, and asserts that the statistics report exactly those binaries as min and max.
 *
 * @param type the primitive type the statistics are created for
 * @param min the lower bound
 * @param max the upper bound
 * @return the populated statistics
 */
private static Statistics<?> createStatsTyped(PrimitiveType type, BigInteger min, BigInteger max) {
  final Statistics<?> typedStats = Statistics.createStats(type);
  final Binary lowerBinary = FixedBinaryTestUtils.getFixedBinary(type, min);
  final Binary upperBinary = FixedBinaryTestUtils.getFixedBinary(type, max);

  // The upper bound is fed first, then the lower bound.
  typedStats.updateStats(upperBinary);
  typedStats.updateStats(lowerBinary);

  assertEquals(lowerBinary, typedStats.genericGetMin());
  assertEquals(upperBinary, typedStats.genericGetMax());
  return typedStats;
}
 
Example #25
Source File: CheckParquet251Command.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Replays every value of a page through a validating converter and checks that the number
 * of nulls encountered equals the null count in the page statistics.
 *
 * @param page the data page to validate
 * @param dict the dictionary page handed to the single-page reader together with {@code page}
 * @param desc descriptor of the column the page belongs to
 * @throws BadStatsException if the counted nulls differ from the statistics' null count
 */
private void validateStatsForPage(DataPage page, DictionaryPage dict,
                                  ColumnDescriptor desc) {
  SingletonPageReader pageReader = new SingletonPageReader(dict, page);
  PrimitiveConverter validatingConverter = getValidatingConverter(page, desc.getType());
  Statistics stats = getStatisticsFromPageHeader(page);

  long nullsSeen = 0;

  ColumnReader columnReader = COL_READER_CTOR.newInstance(desc, pageReader, validatingConverter, null);
  for (int i = 0; i < pageReader.getTotalValueCount(); i++) {
    // A definition level below the column's maximum is counted as a null.
    if (columnReader.getCurrentDefinitionLevel() < desc.getMaxDefinitionLevel()) {
      nullsSeen++;
    } else {
      columnReader.writeCurrentValueToConverter();
    }
    columnReader.consume();
  }

  if (nullsSeen != stats.getNumNulls()) {
    throw new BadStatsException("Number of nulls doesn't match.");
  }

  String summary = String.format(
      "Validated stats min=%s max=%s nulls=%d for page=%s col=%s",
      stats.minAsString(),
      stats.maxAsString(),
      stats.getNumNulls(),
      page,
      Arrays.toString(desc.getPath()));
  console.debug(summary);
}
 
Example #26
Source File: TestParquetMetadataConverter.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Dispatches to the {@code createStatsTyped} overload matching the runtime class of {@code min}.
 * <p>
 * Integer, Long and BigInteger are handled; any other class fails the test with "Not implemented".
 *
 * @param type the primitive type the statistics are created for
 * @param min the lower bound; its class selects the overload
 * @param max the upper bound, cast to the same class as {@code min}
 * @return the statistics built by the selected overload
 */
private static <T> Statistics<?> createStats(PrimitiveType type, T min, T max) {
  final Class<?> valueClass = min.getClass();
  if (valueClass == Integer.class) {
    return createStatsTyped(type, (Integer) min, (Integer) max);
  }
  if (valueClass == Long.class) {
    return createStatsTyped(type, (Long) min, (Long) max);
  }
  if (valueClass == BigInteger.class) {
    return createStatsTyped(type, (BigInteger) min, (BigInteger) max);
  }
  fail("Not implemented");
  return null;
}
 
Example #27
Source File: ParquetMetricsRowGroupFilter.java    From iceberg with Apache License 2.0 5 votes vote down vote up
/**
 * Decides whether any row can satisfy {@code ref == lit} based on the column statistics.
 * <p>
 * Fails the precondition check when the referenced field is not found in {@code struct}.
 *
 * @return {@code ROWS_CANNOT_MATCH} when no value count is known for the column, when the
 *         statistics hold no non-null value, or when the literal lies outside the column's
 *         min/max range; {@code ROWS_MIGHT_MATCH} otherwise, including when statistics are
 *         missing or empty
 */
@Override
public <T> Boolean eq(BoundReference<T> ref, Literal<T> lit) {
  final Integer fieldId = ref.fieldId();
  Preconditions.checkNotNull(
      struct.field(fieldId), "Cannot filter by nested column: %s", schema.findField(fieldId));

  if (valueCounts.get(fieldId) == null) {
    // the column is not present and is all nulls
    return ROWS_CANNOT_MATCH;
  }

  final Statistics<?> columnStats = stats.get(fieldId);
  if (columnStats == null || columnStats.isEmpty()) {
    // Without usable statistics nothing can be ruled out.
    return ROWS_MIGHT_MATCH;
  }
  if (!columnStats.hasNonNullValue()) {
    return ROWS_CANNOT_MATCH;
  }

  // The literal is below the smallest value in the column.
  final T lowerBound = min(columnStats, fieldId);
  if (lit.comparator().compare(lowerBound, lit.value()) > 0) {
    return ROWS_CANNOT_MATCH;
  }

  // The literal is above the largest value in the column.
  final T upperBound = max(columnStats, fieldId);
  if (lit.comparator().compare(upperBound, lit.value()) < 0) {
    return ROWS_CANNOT_MATCH;
  }

  return ROWS_MIGHT_MATCH;
}
 
Example #28
Source File: ParquetMetricsRowGroupFilter.java    From iceberg with Apache License 2.0 5 votes vote down vote up
/**
 * Decides whether any row can satisfy {@code ref >= lit} based on the column statistics.
 * <p>
 * Fails the precondition check when the referenced field is not found in {@code struct}.
 *
 * @return {@code ROWS_CANNOT_MATCH} when no value count is known for the column, when the
 *         statistics hold no non-null value, or when the column maximum is below the literal;
 *         {@code ROWS_MIGHT_MATCH} otherwise, including when statistics are missing or empty
 */
@Override
public <T> Boolean gtEq(BoundReference<T> ref, Literal<T> lit) {
  final Integer fieldId = ref.fieldId();
  Preconditions.checkNotNull(
      struct.field(fieldId), "Cannot filter by nested column: %s", schema.findField(fieldId));

  if (valueCounts.get(fieldId) == null) {
    // the column is not present and is all nulls
    return ROWS_CANNOT_MATCH;
  }

  final Statistics<?> columnStats = stats.get(fieldId);
  if (columnStats == null || columnStats.isEmpty()) {
    // Without usable statistics nothing can be ruled out.
    return ROWS_MIGHT_MATCH;
  }
  if (!columnStats.hasNonNullValue()) {
    return ROWS_CANNOT_MATCH;
  }

  final T upperBound = max(columnStats, fieldId);
  if (lit.comparator().compare(upperBound, lit.value()) < 0) {
    return ROWS_CANNOT_MATCH;
  }
  return ROWS_MIGHT_MATCH;
}
 
Example #29
Source File: ParquetMetricsRowGroupFilter.java    From iceberg with Apache License 2.0 5 votes vote down vote up
/**
 * Decides whether any row can satisfy {@code ref > lit} based on the column statistics.
 * <p>
 * Fails the precondition check when the referenced field is not found in {@code struct}.
 *
 * @return {@code ROWS_CANNOT_MATCH} when no value count is known for the column, when the
 *         statistics hold no non-null value, or when the column maximum is not above the
 *         literal; {@code ROWS_MIGHT_MATCH} otherwise, including when statistics are missing
 *         or empty
 */
@Override
public <T> Boolean gt(BoundReference<T> ref, Literal<T> lit) {
  final Integer fieldId = ref.fieldId();
  Preconditions.checkNotNull(
      struct.field(fieldId), "Cannot filter by nested column: %s", schema.findField(fieldId));

  if (valueCounts.get(fieldId) == null) {
    // the column is not present and is all nulls
    return ROWS_CANNOT_MATCH;
  }

  final Statistics<?> columnStats = stats.get(fieldId);
  if (columnStats == null || columnStats.isEmpty()) {
    // Without usable statistics nothing can be ruled out.
    return ROWS_MIGHT_MATCH;
  }
  if (!columnStats.hasNonNullValue()) {
    return ROWS_CANNOT_MATCH;
  }

  // Strict comparison: a maximum equal to the literal cannot match either.
  final T upperBound = max(columnStats, fieldId);
  if (lit.comparator().compare(upperBound, lit.value()) <= 0) {
    return ROWS_CANNOT_MATCH;
  }
  return ROWS_MIGHT_MATCH;
}
 
Example #30
Source File: ParquetColumnChunkPageWriteStore.java    From Bats with Apache License 2.0 5 votes vote down vote up
/**
 * Compresses one page, writes its header followed by the compressed bytes to {@code buf},
 * and updates the running totals, merged statistics and encoding collections.
 *
 * @param bytes the uncompressed page content
 * @param valueCount number of values in the page
 * @param statistics page statistics, written to the header and merged into the totals
 * @param rlEncoding encoding of the repetition levels
 * @param dlEncoding encoding of the definition levels
 * @param valuesEncoding encoding of the values
 * @throws ParquetEncodingException if the page is larger than Integer.MAX_VALUE bytes
 *         before or after compression
 * @throws IOException declared by this method; presumably raised while compressing or writing
 */
@Override
public void writePage(BytesInput bytes,
                      int valueCount,
                      Statistics statistics,
                      Encoding rlEncoding,
                      Encoding dlEncoding,
                      Encoding valuesEncoding) throws IOException {
  final long rawSize = bytes.size();
  // Parquet library creates bad metadata if the uncompressed or compressed size of a page exceeds Integer.MAX_VALUE
  if (rawSize > Integer.MAX_VALUE) {
    throw new ParquetEncodingException(
        "Cannot write page larger than Integer.MAX_VALUE bytes: " + rawSize);
  }

  final BytesInput packedBytes = compressor.compress(bytes);
  final long packedSize = packedBytes.size();
  if (packedSize > Integer.MAX_VALUE) {
    throw new ParquetEncodingException(
        "Cannot write compressed page larger than Integer.MAX_VALUE bytes: " + packedSize);
  }

  // Both sizes were checked above, so the narrowing casts cannot lose information.
  parquetMetadataConverter.writeDataPageHeader(
      (int) rawSize, (int) packedSize, valueCount, statistics,
      rlEncoding, dlEncoding, valuesEncoding, buf);

  // Running totals
  this.uncompressedLength += rawSize;
  this.compressedLength += packedSize;
  this.totalValueCount += valueCount;
  this.pageCount++;
  this.totalStatistics.mergeStatistics(statistics);

  // The page body follows the header that was written above.
  packedBytes.writeAllTo(buf);

  rlEncodings.add(rlEncoding);
  dlEncodings.add(dlEncoding);
  dataEncodings.add(valuesEncoding);
}