Java Code Examples for org.apache.parquet.column.statistics.Statistics#isEmpty()

The following examples show how to use org.apache.parquet.column.statistics.Statistics#isEmpty(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: ParquetMetricsRowGroupFilter.java    From iceberg with Apache License 2.0 6 votes vote down vote up
/**
 * Decides whether any row might satisfy an {@code IS NULL} test on the referenced column.
 *
 * @param ref bound reference to the column being tested
 * @return {@code ROWS_CANNOT_MATCH} when non-empty statistics report zero nulls,
 *         otherwise {@code ROWS_MIGHT_MATCH}
 */
@Override
public <T> Boolean isNull(BoundReference<T> ref) {
  // whether the field is required is not checked here because binding evaluates that case
  Integer fieldId = ref.fieldId();

  Long count = valueCounts.get(fieldId);
  if (count == null) {
    // the column is not present and is all nulls
    return ROWS_MIGHT_MATCH;
  }

  Statistics<?> columnStats = stats.get(fieldId);
  boolean haveStats = columnStats != null && !columnStats.isEmpty();
  if (haveStats && columnStats.getNumNulls() == 0) {
    // statistics exist and report no nulls, so every value is non-null
    return ROWS_CANNOT_MATCH;
  }

  return ROWS_MIGHT_MATCH;
}
 
Example 2
Source File: ParquetMetricsRowGroupFilter.java    From iceberg with Apache License 2.0 6 votes vote down vote up
/**
 * Decides whether any row might satisfy {@code column < literal}.
 *
 * @param ref bound reference to the column being tested
 * @param lit literal the column is compared against
 * @return {@code ROWS_CANNOT_MATCH} when the column is absent, holds no non-null value,
 *         or its lower bound is not below the literal; otherwise {@code ROWS_MIGHT_MATCH}
 */
@Override
public <T> Boolean lt(BoundReference<T> ref, Literal<T> lit) {
  Integer fieldId = ref.fieldId();

  Long count = valueCounts.get(fieldId);
  if (count == null) {
    // the column is not present and is all nulls
    return ROWS_CANNOT_MATCH;
  }

  Statistics<?> columnStats = stats.get(fieldId);
  if (columnStats == null || columnStats.isEmpty()) {
    // no usable statistics: nothing can be ruled out
    return ROWS_MIGHT_MATCH;
  }

  if (!columnStats.hasNonNullValue()) {
    return ROWS_CANNOT_MATCH;
  }

  // if the lower bound is already >= the literal, nothing can be below the literal
  T lowerBound = min(columnStats, fieldId);
  if (lit.comparator().compare(lowerBound, lit.value()) >= 0) {
    return ROWS_CANNOT_MATCH;
  }

  return ROWS_MIGHT_MATCH;
}
 
Example 3
Source File: ParquetMetricsRowGroupFilter.java    From iceberg with Apache License 2.0 6 votes vote down vote up
/**
 * Decides whether any row might satisfy {@code column <= literal}.
 *
 * @param ref bound reference to the column being tested
 * @param lit literal the column is compared against
 * @return {@code ROWS_CANNOT_MATCH} when the column is absent, holds no non-null value,
 *         or its lower bound is above the literal; otherwise {@code ROWS_MIGHT_MATCH}
 */
@Override
public <T> Boolean ltEq(BoundReference<T> ref, Literal<T> lit) {
  Integer fieldId = ref.fieldId();

  Long count = valueCounts.get(fieldId);
  if (count == null) {
    // the column is not present and is all nulls
    return ROWS_CANNOT_MATCH;
  }

  Statistics<?> columnStats = stats.get(fieldId);
  if (columnStats == null || columnStats.isEmpty()) {
    // no usable statistics: nothing can be ruled out
    return ROWS_MIGHT_MATCH;
  }

  if (!columnStats.hasNonNullValue()) {
    return ROWS_CANNOT_MATCH;
  }

  // a lower bound strictly above the literal rules out every value
  T lowerBound = min(columnStats, fieldId);
  if (lit.comparator().compare(lowerBound, lit.value()) > 0) {
    return ROWS_CANNOT_MATCH;
  }

  return ROWS_MIGHT_MATCH;
}
 
Example 4
Source File: ParquetMetricsRowGroupFilter.java    From iceberg with Apache License 2.0 6 votes vote down vote up
/**
 * Decides whether any row might satisfy {@code column > literal}.
 *
 * @param ref bound reference to the column being tested
 * @param lit literal the column is compared against
 * @return {@code ROWS_CANNOT_MATCH} when the column is absent, holds no non-null value,
 *         or its upper bound is not above the literal; otherwise {@code ROWS_MIGHT_MATCH}
 */
@Override
public <T> Boolean gt(BoundReference<T> ref, Literal<T> lit) {
  Integer fieldId = ref.fieldId();

  Long count = valueCounts.get(fieldId);
  if (count == null) {
    // the column is not present and is all nulls
    return ROWS_CANNOT_MATCH;
  }

  Statistics<?> columnStats = stats.get(fieldId);
  if (columnStats == null || columnStats.isEmpty()) {
    // no usable statistics: nothing can be ruled out
    return ROWS_MIGHT_MATCH;
  }

  if (!columnStats.hasNonNullValue()) {
    return ROWS_CANNOT_MATCH;
  }

  // if the upper bound is already <= the literal, nothing can exceed the literal
  T upperBound = max(columnStats, fieldId);
  if (lit.comparator().compare(upperBound, lit.value()) <= 0) {
    return ROWS_CANNOT_MATCH;
  }

  return ROWS_MIGHT_MATCH;
}
 
Example 5
Source File: ParquetMetricsRowGroupFilter.java    From iceberg with Apache License 2.0 6 votes vote down vote up
/**
 * Decides whether any row might satisfy {@code column >= literal}.
 *
 * @param ref bound reference to the column being tested
 * @param lit literal the column is compared against
 * @return {@code ROWS_CANNOT_MATCH} when the column is absent, holds no non-null value,
 *         or its upper bound is below the literal; otherwise {@code ROWS_MIGHT_MATCH}
 */
@Override
public <T> Boolean gtEq(BoundReference<T> ref, Literal<T> lit) {
  Integer fieldId = ref.fieldId();

  Long count = valueCounts.get(fieldId);
  if (count == null) {
    // the column is not present and is all nulls
    return ROWS_CANNOT_MATCH;
  }

  Statistics<?> columnStats = stats.get(fieldId);
  if (columnStats == null || columnStats.isEmpty()) {
    // no usable statistics: nothing can be ruled out
    return ROWS_MIGHT_MATCH;
  }

  if (!columnStats.hasNonNullValue()) {
    return ROWS_CANNOT_MATCH;
  }

  // an upper bound strictly below the literal rules out every value
  T upperBound = max(columnStats, fieldId);
  if (lit.comparator().compare(upperBound, lit.value()) < 0) {
    return ROWS_CANNOT_MATCH;
  }

  return ROWS_MIGHT_MATCH;
}
 
Example 6
Source File: ParquetMetricsRowGroupFilter.java    From iceberg with Apache License 2.0 6 votes vote down vote up
/**
 * Decides whether any row might satisfy an {@code IS NULL} test on the referenced column.
 *
 * <p>The field id must resolve through {@code struct.field(id)}; otherwise the
 * {@code Preconditions.checkNotNull} call rejects it with a "nested column" message.
 *
 * @param ref bound reference to the column being tested
 * @return {@code ROWS_CANNOT_MATCH} when non-empty statistics report zero nulls,
 *         otherwise {@code ROWS_MIGHT_MATCH}
 */
@Override
public <T> Boolean isNull(BoundReference<T> ref) {
  // whether the field is required is not checked here because binding evaluates that case
  Integer fieldId = ref.fieldId();
  Preconditions.checkNotNull(
      struct.field(fieldId), "Cannot filter by nested column: %s", schema.findField(fieldId));

  Long count = valueCounts.get(fieldId);
  if (count == null) {
    // the column is not present and is all nulls
    return ROWS_MIGHT_MATCH;
  }

  Statistics<?> columnStats = stats.get(fieldId);
  boolean haveStats = columnStats != null && !columnStats.isEmpty();
  if (haveStats && columnStats.getNumNulls() == 0) {
    // statistics exist and report no nulls, so every value is non-null
    return ROWS_CANNOT_MATCH;
  }

  return ROWS_MIGHT_MATCH;
}
 
Example 7
Source File: StatisticsFilter.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Evaluates a {@code >=} predicate against the statistics of the predicate's column chunk.
 *
 * @param gtEq the greater-than-or-equal predicate
 * @return {@code BLOCK_CANNOT_MATCH} when the column is missing or the chunk is all nulls,
 *         {@code BLOCK_MIGHT_MATCH} when statistics are empty or carry no min/max,
 *         otherwise whether the max compares below the predicate value
 */
@Override
@SuppressWarnings("unchecked")
public <T extends Comparable<T>> Boolean visit(GtEq<T> gtEq) {
  Column<T> column = gtEq.getColumn();
  ColumnChunkMetaData chunk = getColumnChunk(column.getColumnPath());

  if (chunk == null) {
    // a missing column is always null, and for all x, null is never >= x
    return BLOCK_CANNOT_MATCH;
  }

  Statistics<T> chunkStats = chunk.getStatistics();

  if (chunkStats.isEmpty()) {
    // without statistics no chunk can be dropped
    return BLOCK_MIGHT_MATCH;
  }

  if (isAllNulls(chunk)) {
    // looking for v >= someValue, but this chunk is all nulls, so drop it
    return BLOCK_CANNOT_MATCH;
  }

  if (!chunkStats.hasNonNullValue()) {
    // no min/max recorded, so no chunk can be dropped
    return BLOCK_MIGHT_MATCH;
  }

  // drop if value > max
  return chunkStats.compareMaxToValue(gtEq.getValue()) < 0;
}
 
Example 8
Source File: TupleDomainParquetPredicate.java    From presto with Apache License 2.0 5 votes vote down vote up
/**
 * Checks whether the effective predicate can match data described by the given statistics.
 *
 * @param numberOfRows number of rows covered by the statistics
 * @param statistics per-column statistics; columns without an entry or with empty statistics are skipped
 * @param id data source identifier forwarded to {@code getDomain}
 * @param failOnCorruptedParquetStatistics forwarded to {@code getDomain}
 * @return {@code false} when there are no rows, the predicate is none, or some column's
 *         predicate domain does not overlap the domain derived from its statistics;
 *         {@code true} otherwise
 * @throws ParquetCorruptionException declared by this method; presumably raised by {@code getDomain}
 * @throws IllegalStateException if the effective predicate exposes no domains
 */
@Override
public boolean matches(long numberOfRows, Map<ColumnDescriptor, Statistics<?>> statistics, ParquetDataSourceId id, boolean failOnCorruptedParquetStatistics)
        throws ParquetCorruptionException
{
    // nothing can match when there are no rows or the predicate is none
    if (numberOfRows == 0 || effectivePredicate.isNone()) {
        return false;
    }

    Map<ColumnDescriptor, Domain> domainsByColumn = effectivePredicate.getDomains()
            .orElseThrow(() -> new IllegalStateException("Effective predicate other than none should have domains"));

    for (RichColumnDescriptor column : columns) {
        Domain predicateDomain = domainsByColumn.get(column);
        if (predicateDomain == null) {
            // the predicate does not constrain this column
            continue;
        }

        Statistics<?> columnStatistics = statistics.get(column);
        boolean hasStatistics = columnStatistics != null && !columnStatistics.isEmpty();
        if (hasStatistics) {
            Domain statisticsDomain = getDomain(
                    predicateDomain.getType(),
                    numberOfRows,
                    columnStatistics,
                    id,
                    column.toString(),
                    failOnCorruptedParquetStatistics);
            if (!predicateDomain.overlaps(statisticsDomain)) {
                return false;
            }
        }
    }
    return true;
}
 
Example 9
Source File: StatisticsFilter.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Evaluates a {@code <} predicate against the statistics of the predicate's column chunk.
 *
 * @param lt the less-than predicate
 * @return {@code BLOCK_CANNOT_MATCH} when the column is missing or the chunk is all nulls,
 *         {@code BLOCK_MIGHT_MATCH} when statistics are empty or carry no min/max,
 *         otherwise whether the min compares at or above the predicate value
 */
@Override
@SuppressWarnings("unchecked")
public <T extends Comparable<T>> Boolean visit(Lt<T> lt) {
  Column<T> column = lt.getColumn();
  ColumnChunkMetaData chunk = getColumnChunk(column.getColumnPath());

  if (chunk == null) {
    // a missing column is always null, and for all x, null is never < x
    return BLOCK_CANNOT_MATCH;
  }

  Statistics<T> chunkStats = chunk.getStatistics();

  if (chunkStats.isEmpty()) {
    // without statistics no chunk can be dropped
    return BLOCK_MIGHT_MATCH;
  }

  if (isAllNulls(chunk)) {
    // looking for v < someValue, but this chunk is all nulls, so drop it
    return BLOCK_CANNOT_MATCH;
  }

  if (!chunkStats.hasNonNullValue()) {
    // no min/max recorded, so no chunk can be dropped
    return BLOCK_MIGHT_MATCH;
  }

  // drop if value <= min
  return chunkStats.compareMinToValue(lt.getValue()) >= 0;
}
 
Example 10
Source File: ParquetMetricsRowGroupFilter.java    From iceberg with Apache License 2.0 5 votes vote down vote up
/**
 * Decides whether any row might satisfy {@code column == literal}.
 *
 * <p>The field id must resolve through {@code struct.field(id)}; otherwise the
 * {@code Preconditions.checkNotNull} call rejects it with a "nested column" message.
 *
 * @param ref bound reference to the column being tested
 * @param lit literal the column is compared against
 * @return {@code ROWS_CANNOT_MATCH} when the column is absent, holds no non-null value,
 *         or the literal falls outside the lower/upper bounds; otherwise {@code ROWS_MIGHT_MATCH}
 */
@Override
public <T> Boolean eq(BoundReference<T> ref, Literal<T> lit) {
  Integer fieldId = ref.fieldId();
  Preconditions.checkNotNull(
      struct.field(fieldId), "Cannot filter by nested column: %s", schema.findField(fieldId));

  Long count = valueCounts.get(fieldId);
  if (count == null) {
    // the column is not present and is all nulls
    return ROWS_CANNOT_MATCH;
  }

  Statistics<?> columnStats = stats.get(fieldId);
  if (columnStats == null || columnStats.isEmpty()) {
    // no usable statistics: nothing can be ruled out
    return ROWS_MIGHT_MATCH;
  }

  if (!columnStats.hasNonNullValue()) {
    return ROWS_CANNOT_MATCH;
  }

  // the literal sits below the lower bound
  T lowerBound = min(columnStats, fieldId);
  if (lit.comparator().compare(lowerBound, lit.value()) > 0) {
    return ROWS_CANNOT_MATCH;
  }

  // the literal sits above the upper bound
  T upperBound = max(columnStats, fieldId);
  if (lit.comparator().compare(upperBound, lit.value()) < 0) {
    return ROWS_CANNOT_MATCH;
  }

  return ROWS_MIGHT_MATCH;
}
 
Example 11
Source File: ParquetMetricsRowGroupFilter.java    From iceberg with Apache License 2.0 5 votes vote down vote up
/**
 * Decides whether any row might satisfy {@code column >= literal}.
 *
 * <p>The field id must resolve through {@code struct.field(id)}; otherwise the
 * {@code Preconditions.checkNotNull} call rejects it with a "nested column" message.
 *
 * @param ref bound reference to the column being tested
 * @param lit literal the column is compared against
 * @return {@code ROWS_CANNOT_MATCH} when the column is absent, holds no non-null value,
 *         or its upper bound is below the literal; otherwise {@code ROWS_MIGHT_MATCH}
 */
@Override
public <T> Boolean gtEq(BoundReference<T> ref, Literal<T> lit) {
  Integer fieldId = ref.fieldId();
  Preconditions.checkNotNull(
      struct.field(fieldId), "Cannot filter by nested column: %s", schema.findField(fieldId));

  Long count = valueCounts.get(fieldId);
  if (count == null) {
    // the column is not present and is all nulls
    return ROWS_CANNOT_MATCH;
  }

  Statistics<?> columnStats = stats.get(fieldId);
  if (columnStats == null || columnStats.isEmpty()) {
    // no usable statistics: nothing can be ruled out
    return ROWS_MIGHT_MATCH;
  }

  if (!columnStats.hasNonNullValue()) {
    return ROWS_CANNOT_MATCH;
  }

  // an upper bound strictly below the literal rules out every value
  T upperBound = max(columnStats, fieldId);
  if (lit.comparator().compare(upperBound, lit.value()) < 0) {
    return ROWS_CANNOT_MATCH;
  }

  return ROWS_MIGHT_MATCH;
}
 
Example 12
Source File: ParquetMetricsRowGroupFilter.java    From iceberg with Apache License 2.0 5 votes vote down vote up
/**
 * Decides whether any row might satisfy {@code column > literal}.
 *
 * <p>The field id must resolve through {@code struct.field(id)}; otherwise the
 * {@code Preconditions.checkNotNull} call rejects it with a "nested column" message.
 *
 * @param ref bound reference to the column being tested
 * @param lit literal the column is compared against
 * @return {@code ROWS_CANNOT_MATCH} when the column is absent, holds no non-null value,
 *         or its upper bound is not above the literal; otherwise {@code ROWS_MIGHT_MATCH}
 */
@Override
public <T> Boolean gt(BoundReference<T> ref, Literal<T> lit) {
  Integer fieldId = ref.fieldId();
  Preconditions.checkNotNull(
      struct.field(fieldId), "Cannot filter by nested column: %s", schema.findField(fieldId));

  Long count = valueCounts.get(fieldId);
  if (count == null) {
    // the column is not present and is all nulls
    return ROWS_CANNOT_MATCH;
  }

  Statistics<?> columnStats = stats.get(fieldId);
  if (columnStats == null || columnStats.isEmpty()) {
    // no usable statistics: nothing can be ruled out
    return ROWS_MIGHT_MATCH;
  }

  if (!columnStats.hasNonNullValue()) {
    return ROWS_CANNOT_MATCH;
  }

  // if the upper bound is already <= the literal, nothing can exceed the literal
  T upperBound = max(columnStats, fieldId);
  if (lit.comparator().compare(upperBound, lit.value()) <= 0) {
    return ROWS_CANNOT_MATCH;
  }

  return ROWS_MIGHT_MATCH;
}
 
Example 13
Source File: ParquetMetricsRowGroupFilter.java    From iceberg with Apache License 2.0 5 votes vote down vote up
/**
 * Decides whether any row might satisfy {@code column <= literal}.
 *
 * <p>The field id must resolve through {@code struct.field(id)}; otherwise the
 * {@code Preconditions.checkNotNull} call rejects it with a "nested column" message.
 *
 * @param ref bound reference to the column being tested
 * @param lit literal the column is compared against
 * @return {@code ROWS_CANNOT_MATCH} when the column is absent, holds no non-null value,
 *         or its lower bound is above the literal; otherwise {@code ROWS_MIGHT_MATCH}
 */
@Override
public <T> Boolean ltEq(BoundReference<T> ref, Literal<T> lit) {
  Integer fieldId = ref.fieldId();
  Preconditions.checkNotNull(
      struct.field(fieldId), "Cannot filter by nested column: %s", schema.findField(fieldId));

  Long count = valueCounts.get(fieldId);
  if (count == null) {
    // the column is not present and is all nulls
    return ROWS_CANNOT_MATCH;
  }

  Statistics<?> columnStats = stats.get(fieldId);
  if (columnStats == null || columnStats.isEmpty()) {
    // no usable statistics: nothing can be ruled out
    return ROWS_MIGHT_MATCH;
  }

  if (!columnStats.hasNonNullValue()) {
    return ROWS_CANNOT_MATCH;
  }

  // a lower bound strictly above the literal rules out every value
  T lowerBound = min(columnStats, fieldId);
  if (lit.comparator().compare(lowerBound, lit.value()) > 0) {
    return ROWS_CANNOT_MATCH;
  }

  return ROWS_MIGHT_MATCH;
}
 
Example 14
Source File: ParquetMetricsRowGroupFilter.java    From iceberg with Apache License 2.0 5 votes vote down vote up
/**
 * Decides whether any row might satisfy {@code column < literal}.
 *
 * <p>The field id must resolve through {@code struct.field(id)}; otherwise the
 * {@code Preconditions.checkNotNull} call rejects it with a "nested column" message.
 *
 * @param ref bound reference to the column being tested
 * @param lit literal the column is compared against
 * @return {@code ROWS_CANNOT_MATCH} when the column is absent, holds no non-null value,
 *         or its lower bound is not below the literal; otherwise {@code ROWS_MIGHT_MATCH}
 */
@Override
public <T> Boolean lt(BoundReference<T> ref, Literal<T> lit) {
  Integer fieldId = ref.fieldId();
  Preconditions.checkNotNull(
      struct.field(fieldId), "Cannot filter by nested column: %s", schema.findField(fieldId));

  Long count = valueCounts.get(fieldId);
  if (count == null) {
    // the column is not present and is all nulls
    return ROWS_CANNOT_MATCH;
  }

  Statistics<?> columnStats = stats.get(fieldId);
  if (columnStats == null || columnStats.isEmpty()) {
    // no usable statistics: nothing can be ruled out
    return ROWS_MIGHT_MATCH;
  }

  if (!columnStats.hasNonNullValue()) {
    return ROWS_CANNOT_MATCH;
  }

  // if the lower bound is already >= the literal, nothing can be below the literal
  T lowerBound = min(columnStats, fieldId);
  if (lit.comparator().compare(lowerBound, lit.value()) >= 0) {
    return ROWS_CANNOT_MATCH;
  }

  return ROWS_MIGHT_MATCH;
}
 
Example 15
Source File: ParquetMetricsRowGroupFilter.java    From iceberg with Apache License 2.0 5 votes vote down vote up
/**
 * Decides whether any row might hold a value starting with the given prefix literal.
 *
 * <p>The min and max statistics are each truncated to at most the prefix length and
 * compared to the prefix using an unsigned byte-wise comparator.
 *
 * @param ref bound reference to the column being tested
 * @param lit literal whose {@code toByteBuffer()} form is used as the prefix
 * @return {@code ROWS_CANNOT_MATCH} when the column is absent, holds no non-null value,
 *         the truncated min sorts after the prefix, or the truncated max sorts before it;
 *         otherwise {@code ROWS_MIGHT_MATCH}
 */
@Override
@SuppressWarnings("unchecked")
public <T> Boolean startsWith(BoundReference<T> ref, Literal<T> lit) {
  int fieldId = ref.fieldId();

  Long count = valueCounts.get(fieldId);
  if (count == null) {
    // the column is not present and is all nulls
    return ROWS_CANNOT_MATCH;
  }

  Statistics<Binary> binaryStats = (Statistics<Binary>) stats.get(fieldId);
  if (binaryStats == null || binaryStats.isEmpty()) {
    // no usable statistics: nothing can be ruled out
    return ROWS_MIGHT_MATCH;
  }

  if (!binaryStats.hasNonNullValue()) {
    return ROWS_CANNOT_MATCH;
  }

  ByteBuffer prefix = lit.toByteBuffer();
  Comparator<ByteBuffer> byteOrder = Comparators.unsignedBytes();

  // cut the min down so that it is no longer (in bytes) than the prefix
  Binary minValue = binaryStats.genericGetMin();
  int minLength = Math.min(prefix.remaining(), minValue.length());
  ByteBuffer truncatedMin = BinaryUtil.truncateBinary(minValue.toByteBuffer(), minLength);
  if (byteOrder.compare(truncatedMin, prefix) > 0) {
    return ROWS_CANNOT_MATCH;
  }

  // cut the max down so that it is no longer (in bytes) than the prefix
  Binary maxValue = binaryStats.genericGetMax();
  int maxLength = Math.min(prefix.remaining(), maxValue.length());
  ByteBuffer truncatedMax = BinaryUtil.truncateBinary(maxValue.toByteBuffer(), maxLength);
  if (byteOrder.compare(truncatedMax, prefix) < 0) {
    return ROWS_CANNOT_MATCH;
  }

  return ROWS_MIGHT_MATCH;
}
 
Example 16
Source File: ParquetMetricsRowGroupFilter.java    From iceberg with Apache License 2.0 5 votes vote down vote up
/**
 * Decides whether any row might hold one of the values in {@code literalSet}.
 *
 * @param ref bound reference to the column being tested
 * @param literalSet candidate values for the column
 * @return {@code ROWS_MIGHT_MATCH} for nested types or when statistics are unusable;
 *         {@code ROWS_CANNOT_MATCH} when the column is absent, holds no non-null value,
 *         or no candidate lies within the lower/upper bounds; otherwise {@code ROWS_MIGHT_MATCH}
 */
@Override
public <T> Boolean in(BoundReference<T> ref, Set<T> literalSet) {
  Integer fieldId = ref.fieldId();

  // When filtering nested types notNull() is implicit filter passed even though complex
  // filters aren't pushed down in Parquet. Leave all nested column type filters to be
  // evaluated post scan.
  if (schema.findType(fieldId) instanceof Type.NestedType) {
    return ROWS_MIGHT_MATCH;
  }

  Long count = valueCounts.get(fieldId);
  if (count == null) {
    // the column is not present and is all nulls
    return ROWS_CANNOT_MATCH;
  }

  Statistics<?> columnStats = stats.get(fieldId);
  if (columnStats == null || columnStats.isEmpty()) {
    // no usable statistics: nothing can be ruled out
    return ROWS_MIGHT_MATCH;
  }

  if (!columnStats.hasNonNullValue()) {
    return ROWS_CANNOT_MATCH;
  }

  // keep only the candidates that are not below the lower bound
  T lowerBound = min(columnStats, fieldId);
  Collection<T> notBelowLower = literalSet.stream()
      .filter(candidate -> ref.comparator().compare(lowerBound, candidate) <= 0)
      .collect(Collectors.toList());
  if (notBelowLower.isEmpty()) {
    // every candidate is less than the lower bound
    return ROWS_CANNOT_MATCH;
  }

  // of those, keep only the candidates that are not above the upper bound
  T upperBound = max(columnStats, fieldId);
  Collection<T> withinBounds = notBelowLower.stream()
      .filter(candidate -> ref.comparator().compare(upperBound, candidate) >= 0)
      .collect(Collectors.toList());
  if (withinBounds.isEmpty()) {
    // every remaining candidate is greater than the upper bound
    return ROWS_CANNOT_MATCH;
  }

  return ROWS_MIGHT_MATCH;
}
 
Example 17
Source File: ParquetMetricsRowGroupFilter.java    From iceberg with Apache License 2.0 5 votes vote down vote up
/**
 * Decides whether any row might satisfy {@code column == literal}.
 *
 * @param ref bound reference to the column being tested
 * @param lit literal the column is compared against
 * @return {@code ROWS_MIGHT_MATCH} for nested types or when statistics are unusable;
 *         {@code ROWS_CANNOT_MATCH} when the column is absent, holds no non-null value,
 *         or the literal falls outside the lower/upper bounds; otherwise {@code ROWS_MIGHT_MATCH}
 */
@Override
public <T> Boolean eq(BoundReference<T> ref, Literal<T> lit) {
  Integer fieldId = ref.fieldId();

  // When filtering nested types notNull() is implicit filter passed even though complex
  // filters aren't pushed down in Parquet. Leave all nested column type filters to be
  // evaluated post scan.
  if (schema.findType(fieldId) instanceof Type.NestedType) {
    return ROWS_MIGHT_MATCH;
  }

  Long count = valueCounts.get(fieldId);
  if (count == null) {
    // the column is not present and is all nulls
    return ROWS_CANNOT_MATCH;
  }

  Statistics<?> columnStats = stats.get(fieldId);
  if (columnStats == null || columnStats.isEmpty()) {
    // no usable statistics: nothing can be ruled out
    return ROWS_MIGHT_MATCH;
  }

  if (!columnStats.hasNonNullValue()) {
    return ROWS_CANNOT_MATCH;
  }

  // the literal sits below the lower bound
  T lowerBound = min(columnStats, fieldId);
  if (lit.comparator().compare(lowerBound, lit.value()) > 0) {
    return ROWS_CANNOT_MATCH;
  }

  // the literal sits above the upper bound
  T upperBound = max(columnStats, fieldId);
  if (lit.comparator().compare(upperBound, lit.value()) < 0) {
    return ROWS_CANNOT_MATCH;
  }

  return ROWS_MIGHT_MATCH;
}
 
Example 18
Source File: StatisticsFilter.java    From parquet-mr with Apache License 2.0 4 votes vote down vote up
/**
 * Evaluates an equality predicate against the statistics of the predicate's column chunk.
 *
 * @param eq the equality predicate; its value may be {@code null}, meaning "equals null"
 * @return whether the chunk can be dropped: for a missing column, {@code BLOCK_CANNOT_MATCH}
 *         for a non-null value and {@code BLOCK_MIGHT_MATCH} for null; {@code BLOCK_MIGHT_MATCH}
 *         when statistics are empty, the null count is unset (null value), or min/max are absent;
 *         otherwise the outcome of the null-count or min/max checks
 */
@Override
@SuppressWarnings("unchecked")
public <T extends Comparable<T>> Boolean visit(Eq<T> eq) {
  Column<T> column = eq.getColumn();
  ColumnChunkMetaData chunk = getColumnChunk(column.getColumnPath());

  T target = eq.getValue();

  if (chunk == null) {
    // the column isn't in this file so all values are null;
    // a non-null target can never equal null
    return target != null ? BLOCK_CANNOT_MATCH : BLOCK_MIGHT_MATCH;
  }

  Statistics<T> chunkStats = chunk.getStatistics();

  if (chunkStats.isEmpty()) {
    // without statistics no chunk can be dropped
    return BLOCK_MIGHT_MATCH;
  }

  if (target == null) {
    if (chunkStats.isNumNullsSet()) {
      // looking for v eq(null): drop if there are no nulls in this chunk
      return !hasNulls(chunk);
    }
    // nothing is known about the nulls in this chunk
    return BLOCK_MIGHT_MATCH;
  }

  if (isAllNulls(chunk)) {
    // looking for v eq(someNonNull) in a column of all nulls, so drop it
    return BLOCK_CANNOT_MATCH;
  }

  if (!chunkStats.hasNonNullValue()) {
    // no min/max recorded, so no chunk can be dropped
    return BLOCK_MIGHT_MATCH;
  }

  // drop if value < min || value > max
  boolean belowMin = chunkStats.compareMinToValue(target) > 0;
  return belowMin || chunkStats.compareMaxToValue(target) < 0;
}
 
Example 19
Source File: StatisticsFilter.java    From parquet-mr with Apache License 2.0 4 votes vote down vote up
/**
 * Evaluates a not-equal predicate against the statistics of the predicate's column chunk.
 *
 * @param notEq the not-equal predicate; its value may be {@code null}, meaning "not equal to null"
 * @return whether the chunk can be dropped: for a missing column, {@code BLOCK_CANNOT_MATCH}
 *         for a null value and {@code BLOCK_MIGHT_MATCH} otherwise; {@code BLOCK_MIGHT_MATCH}
 *         when statistics are empty, the chunk is known to contain nulls (non-null value),
 *         or min/max are absent; otherwise the outcome of the all-nulls or min/max checks
 */
@Override
@SuppressWarnings("unchecked")
public <T extends Comparable<T>> Boolean visit(NotEq<T> notEq) {
  Column<T> column = notEq.getColumn();
  ColumnChunkMetaData chunk = getColumnChunk(column.getColumnPath());

  T target = notEq.getValue();

  if (chunk == null) {
    // null is always equal to null, so notEq(null) cannot match a missing column
    return target == null ? BLOCK_CANNOT_MATCH : BLOCK_MIGHT_MATCH;
  }

  Statistics<T> chunkStats = chunk.getStatistics();

  if (chunkStats.isEmpty()) {
    // without statistics no chunk can be dropped
    return BLOCK_MIGHT_MATCH;
  }

  if (target == null) {
    // looking for v notEq(null): a column of all nulls can be dropped
    return isAllNulls(chunk);
  }

  if (chunkStats.isNumNullsSet() && hasNulls(chunk)) {
    // looking for v notEq(someNonNull), but this chunk contains nulls, so keep it
    return BLOCK_MIGHT_MATCH;
  }

  if (!chunkStats.hasNonNullValue()) {
    // no min/max recorded, so no chunk can be dropped
    return BLOCK_MIGHT_MATCH;
  }

  // drop if this is a column where min = max = value
  boolean minEqualsTarget = chunkStats.compareMinToValue(target) == 0;
  return minEqualsTarget && chunkStats.compareMaxToValue(target) == 0;
}
 
Example 20
Source File: StatisticsFilter.java    From parquet-mr with Apache License 2.0 4 votes vote down vote up
/**
 * Evaluates a user-defined predicate, optionally inverted, against the statistics of the
 * predicate's column chunk.
 *
 * @param ud the user-defined predicate wrapper
 * @param inverted when {@code true}, null-acceptance is returned as-is and
 *        {@code inverseCanDrop} is consulted; when {@code false}, null-acceptance is negated
 *        and {@code canDrop} is consulted
 * @return whether the chunk can be dropped; {@code BLOCK_MIGHT_MATCH} when statistics are
 *         empty or carry no min/max
 */
private <T extends Comparable<T>, U extends UserDefinedPredicate<T>> Boolean visit(UserDefined<T, U> ud, boolean inverted) {
  Column<T> column = ud.getColumn();
  ColumnChunkMetaData chunk = getColumnChunk(column.getColumnPath());
  U predicate = ud.getUserDefinedPredicate();

  if (chunk == null) {
    // the column isn't in this file so all values are null;
    // ask the predicate whether it keeps a null value or not
    return inverted ? predicate.acceptsNullValue() : !predicate.acceptsNullValue();
  }

  Statistics<T> chunkStats = chunk.getStatistics();

  if (chunkStats.isEmpty()) {
    // without statistics no chunk can be dropped
    return BLOCK_MIGHT_MATCH;
  }

  if (isAllNulls(chunk)) {
    // ask the predicate whether it keeps a null value or not
    return inverted ? predicate.acceptsNullValue() : !predicate.acceptsNullValue();
  }

  if (!chunkStats.hasNonNullValue()) {
    // no min/max recorded, so no chunk can be dropped
    return BLOCK_MIGHT_MATCH;
  }

  // hand min, max and comparator to the predicate through the filter2 Statistics wrapper
  org.apache.parquet.filter2.predicate.Statistics<T> minMax =
    new org.apache.parquet.filter2.predicate.Statistics<T>(
      chunkStats.genericGetMin(), chunkStats.genericGetMax(), chunkStats.comparator());

  return inverted ? predicate.inverseCanDrop(minMax) : predicate.canDrop(minMax);
}