Java Code Examples for org.apache.parquet.column.statistics.Statistics#genericGetMax()

The following examples show how to use org.apache.parquet.column.statistics.Statistics#genericGetMax(). Each example is taken from an open-source project; the source file and license are noted above it.
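Before the project examples, here is a minimal, self-contained sketch of the typical call pattern: open a file's footer, walk the row groups, and read a column chunk's min/max only after confirming the statistics actually carry a non-null value. The file path and output format are illustrative assumptions, not taken from any project below.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.column.statistics.Statistics;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.metadata.BlockMetaData;
import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData;
import org.apache.parquet.hadoop.util.HadoopInputFile;

public class PrintColumnBounds {
  public static void main(String[] args) throws Exception {
    // "data.parquet" is a placeholder path for this sketch
    Path path = new Path(args.length > 0 ? args[0] : "data.parquet");
    try (ParquetFileReader reader =
        ParquetFileReader.open(HadoopInputFile.fromPath(path, new Configuration()))) {
      for (BlockMetaData rowGroup : reader.getFooter().getBlocks()) {
        for (ColumnChunkMetaData col : rowGroup.getColumns()) {
          Statistics<?> stats = col.getStatistics();
          // genericGetMin()/genericGetMax() are only meaningful when the
          // statistics exist and contain at least one non-null value
          if (stats != null && !stats.isEmpty() && stats.hasNonNullValue()) {
            System.out.printf("%s: min=%s max=%s nulls=%d%n",
                col.getPath(), stats.genericGetMin(), stats.genericGetMax(),
                stats.getNumNulls());
          }
        }
      }
    }
  }
}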
Example 1
Source File: ColumnIndexBuilder.java    From parquet-mr with Apache License 2.0
/**
 * Adds the data from the specified statistics to this builder
 *
 * @param stats
 *          the statistics to be added
 */
public void add(Statistics<?> stats) {
  if (stats.hasNonNullValue()) {
    nullPages.add(false);
    Object min = stats.genericGetMin();
    Object max = stats.genericGetMax();
    addMinMax(min, max);
    pageIndexes.add(nextPageIndex);
    minMaxSize += sizeOf(min);
    minMaxSize += sizeOf(max);
  } else {
    nullPages.add(true);
  }
  nullCounts.add(stats.getNumNulls());
  ++nextPageIndex;
}
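
As a rough usage sketch for the builder above (assuming parquet-mr's ColumnIndexBuilder.getBuilder factory and Statistics.createStats; the INT32 column, the value, and the truncation length are arbitrary), a caller feeds one Statistics object per page and then builds the index:

import org.apache.parquet.column.statistics.Statistics;
import org.apache.parquet.internal.column.columnindex.ColumnIndex;
import org.apache.parquet.internal.column.columnindex.ColumnIndexBuilder;
import org.apache.parquet.schema.PrimitiveType;
import org.apache.parquet.schema.Types;

public class ColumnIndexSketch {
  public static void main(String[] args) {
    PrimitiveType type = Types.required(PrimitiveType.PrimitiveTypeName.INT32).named("x");
    // 64 is an arbitrary truncation length for binary min/max values
    ColumnIndexBuilder builder = ColumnIndexBuilder.getBuilder(type, 64);

    Statistics<?> pageStats = Statistics.createStats(type);
    pageStats.updateStats(42);   // record one value for this page
    builder.add(pageStats);      // the add(...) shown above
    ColumnIndex index = builder.build();
    System.out.println(index.getMinValues());
  }
}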
 
Example 2
Source File: ParquetMetricsRowGroupFilter.java    From iceberg with Apache License 2.0
@Override
@SuppressWarnings("unchecked")
public <T> Boolean startsWith(BoundReference<T> ref, Literal<T> lit) {
  int id = ref.fieldId();

  Long valueCount = valueCounts.get(id);
  if (valueCount == null) {
    // the column is not present and is all nulls
    return ROWS_CANNOT_MATCH;
  }

  Statistics<Binary> colStats = (Statistics<Binary>) stats.get(id);
  if (colStats != null && !colStats.isEmpty()) {
    if (!colStats.hasNonNullValue()) {
      return ROWS_CANNOT_MATCH;
    }

    ByteBuffer prefixAsBytes = lit.toByteBuffer();

    Comparator<ByteBuffer> comparator = Comparators.unsignedBytes();

    Binary lower = colStats.genericGetMin();
    // truncate lower bound so that its length in bytes is not greater than the length of prefix
    int lowerLength = Math.min(prefixAsBytes.remaining(), lower.length());
    int lowerCmp = comparator.compare(BinaryUtil.truncateBinary(lower.toByteBuffer(), lowerLength), prefixAsBytes);
    if (lowerCmp > 0) {
      return ROWS_CANNOT_MATCH;
    }

    Binary upper = colStats.genericGetMax();
    // truncate upper bound so that its length in bytes is not greater than the length of prefix
    int upperLength = Math.min(prefixAsBytes.remaining(), upper.length());
    int upperCmp = comparator.compare(BinaryUtil.truncateBinary(upper.toByteBuffer(), upperLength), prefixAsBytes);
    if (upperCmp < 0) {
      return ROWS_CANNOT_MATCH;
    }
  }

  return ROWS_MIGHT_MATCH;
}
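
The pruning rule above can be checked in isolation: if the lower bound truncated to the prefix length already sorts after the prefix, or the truncated upper bound sorts before it, no value in the range can start with the prefix. A standalone illustration in plain Java, with the unsigned byte comparison written out (all names here are invented for the sketch):

import java.nio.charset.StandardCharsets;
import java.util.Arrays;

public class PrefixPruneSketch {
  // unsigned lexicographic comparison, like Comparators.unsignedBytes()
  static int compareUnsigned(byte[] a, byte[] b) {
    int n = Math.min(a.length, b.length);
    for (int i = 0; i < n; i++) {
      int cmp = Integer.compare(a[i] & 0xFF, b[i] & 0xFF);
      if (cmp != 0) {
        return cmp;
      }
    }
    return Integer.compare(a.length, b.length);
  }

  static boolean mightContainPrefix(byte[] lower, byte[] upper, byte[] prefix) {
    // truncate both bounds so their length does not exceed the prefix length
    byte[] lo = Arrays.copyOf(lower, Math.min(lower.length, prefix.length));
    byte[] hi = Arrays.copyOf(upper, Math.min(upper.length, prefix.length));
    return compareUnsigned(lo, prefix) <= 0 && compareUnsigned(hi, prefix) >= 0;
  }

  public static void main(String[] args) {
    byte[] lower = "apple".getBytes(StandardCharsets.UTF_8);
    byte[] upper = "cherry".getBytes(StandardCharsets.UTF_8);
    byte[] prefix = "ban".getBytes(StandardCharsets.UTF_8);
    System.out.println(mightContainPrefix(lower, upper, prefix)); // true
  }
}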
 
Example 3
Source File: TestStatistics.java    From parquet-mr with Apache License 2.0
public StatsValidator(DataPage page) {
  Statistics<T> stats = getStatisticsFromPageHeader(page);
  this.comparator = stats.comparator();
  this.hasNonNull = stats.hasNonNullValue();
  if (hasNonNull) {
    this.min = stats.genericGetMin();
    this.max = stats.genericGetMax();
  } else {
    this.min = null;
    this.max = null;
  }
}
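
A validator built this way would then check each value against the captured bounds using the statistics' own comparator. A minimal sketch of such a check (this particular method is hypothetical, not copied from the project):

// hypothetical companion check for the constructor above
public void validate(T value) {
  if (hasNonNull) {
    assert comparator.compare(min, value) <= 0 : "value is smaller than min";
    assert comparator.compare(value, max) <= 0 : "value is larger than max";
  }
}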
 
Example 4
Source File: CheckParquet251Command.java    From parquet-mr with Apache License 2.0
public StatsValidator(DataPage page) {
  Statistics<T> stats = getStatisticsFromPageHeader(page);
  this.comparator = stats.comparator();
  this.hasNonNull = stats.hasNonNullValue();
  if (hasNonNull) {
    this.min = stats.genericGetMin();
    this.max = stats.genericGetMax();
  } else {
    this.min = null;
    this.max = null;
  }
}
 
Example 5
Source File: Metadata.java    From dremio-oss with Apache License 2.0
private ParquetFileMetadata getParquetFileMetadata(FileAttributes file, AtomicInteger currentNumSplits, long maxSplits) throws IOException {
  final ParquetMetadata metadata =
    SingletonParquetFooterCache.readFooter(fs, file, ParquetMetadataConverter.NO_FILTER, maxFooterLength);
  final int numSplits = currentNumSplits.addAndGet(metadata.getBlocks().size());
  if (numSplits > maxSplits) {
    throw new TooManySplitsException(
      String.format("Too many splits encountered when processing parquet metadata at file %s, maximum is %d but encountered %d splits thus far.",
        file.getPath(), maxSplits, numSplits));
  }

  final MessageType schema = metadata.getFileMetaData().getSchema();

  Map<SchemaPath, OriginalType> originalTypeMap = Maps.newHashMap();
  for (String[] path : schema.getPaths()) {
    originalTypeMap.put(SchemaPath.getCompoundPath(path), getOriginalType(schema, path, 0));
  }

  List<RowGroupMetadata> rowGroupMetadataList = Lists.newArrayList();

  ArrayList<SchemaPath> ALL_COLS = new ArrayList<>();
  ALL_COLS.add(AbstractRecordReader.STAR_COLUMN);
  boolean autoCorrectCorruptDates = formatConfig.autoCorrectCorruptDates;
  ParquetReaderUtility.DateCorruptionStatus containsCorruptDates = ParquetReaderUtility.detectCorruptDates(metadata, ALL_COLS, autoCorrectCorruptDates);
  if (logger.isDebugEnabled()) {
    logger.debug(containsCorruptDates.toString());
  }
  final Map<ColumnTypeMetadata.Key, ColumnTypeMetadata> columnTypeInfo = Maps.newHashMap();
  int rowGroupIdx = 0;
  for (BlockMetaData rowGroup : metadata.getBlocks()) {
    List<ColumnMetadata> columnMetadataList = Lists.newArrayList();
    long length = 0;
    for (ColumnChunkMetaData col : rowGroup.getColumns()) {
      ColumnMetadata columnMetadata;

      // Statistics might carry only the null counts, with min/max left unset
      // (they might be initialized to zero instead of null), so check that the
      // statistics actually hold a non-null value, or that the column is all nulls.
      Statistics<?> stats = col.getStatistics();
      boolean statsAvailable = stats != null
          && ((!stats.isEmpty() && stats.hasNonNullValue())
              || stats.getNumNulls() == rowGroup.getRowCount());
      String[] columnName = col.getPath().toArray();
      SchemaPath columnSchemaName = SchemaPath.getCompoundPath(columnName);
      ColumnTypeMetadata columnTypeMetadata =
          new ColumnTypeMetadata(columnName, col.getType(), originalTypeMap.get(columnSchemaName));

      columnTypeInfo.put(new ColumnTypeMetadata.Key(columnTypeMetadata.name), columnTypeMetadata);
      if (statsAvailable) {
        // Write stats only when minVal == maxVal; in that case only maxVal is stored.
        Object mxValue = null;
        if (stats.genericGetMax() != null && stats.genericGetMin() != null &&
            stats.genericGetMax().equals(stats.genericGetMin())) {
          mxValue = stats.genericGetMax();
          if (containsCorruptDates == ParquetReaderUtility.DateCorruptionStatus.META_SHOWS_CORRUPTION
              && columnTypeMetadata.originalType == OriginalType.DATE) {
            mxValue = ParquetReaderUtility.autoCorrectCorruptedDate((Integer) mxValue);
          }
        }
        columnMetadata =
            new ColumnMetadata(columnTypeMetadata.name, mxValue, stats.getNumNulls());
      } else {
        // log under trace to avoid a flood of log entries.
        logger.trace("Stats are not available for column {}, rowGroupIdx {}, file {}",
            columnSchemaName, rowGroupIdx, file.getPath());
        columnMetadata = new ColumnMetadata(columnTypeMetadata.name, null, null);
      }
      columnMetadataList.add(columnMetadata);
      length += col.getTotalSize();
    }

    RowGroupMetadata rowGroupMeta =
        new RowGroupMetadata(rowGroup.getStartingPos(), length, rowGroup.getRowCount(),
            getHostAffinity(fs, file, rowGroup.getStartingPos(), length), columnMetadataList);

    rowGroupMetadataList.add(rowGroupMeta);
    rowGroupIdx++;
  }

  return new ParquetFileMetadata(file, file.size(), rowGroupMetadataList, columnTypeInfo);
}
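
The min == max shortcut above is worth isolating: when a column is constant within a row group, a single stored value reconstructs both bounds. A tiny hypothetical helper expressing the same check:

// hypothetical helper: returns the single value a chunk holds, or null
// when the bounds differ or are absent
static Object singleValueOrNull(Statistics<?> stats) {
  Object min = stats.genericGetMin();
  Object max = stats.genericGetMax();
  return (min != null && min.equals(max)) ? max : null;
}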
 
Example 6
Source File: ParquetReaderUtility.java    From dremio-oss with Apache License 2.0
/**
 * Detect corrupt date values by looking at the min/max values in the metadata.
 *
 * This should only be used when a file does not have enough metadata to determine if
 * the data was written with an older version of Drill, or an external tool. Drill
 * versions 1.3 and beyond should have enough metadata to confirm that the data was written
 * by Drill.
 *
 * This method only checks the first Row Group, because Drill has only ever written
 * a single Row Group per file.
 *
 * @param footer parquet footer metadata for the file being checked
 * @param columns the columns to inspect for date values
 * @param autoCorrectCorruptDates user setting to allow enabling/disabling of auto-correction
 *                                of corrupt dates. There are some rare cases (storing dates thousands
 *                                of years into the future, with tools other than Drill writing files)
 *                                that would result in the date values being "corrected" into bad values.
 */
public static DateCorruptionStatus checkForCorruptDateValuesInStatistics(ParquetMetadata footer,
                                                            List<SchemaPath> columns,
                                                            boolean autoCorrectCorruptDates) {
  // Users can turn off date correction when corruption is detected based on date values
  // that are unlikely to appear in common datasets. In that case, report that no correction
  // needs to happen during the file read
  if (!autoCorrectCorruptDates) {
    return DateCorruptionStatus.META_SHOWS_NO_CORRUPTION;
  }
  // Drill-produced files have only ever had a single row group; if this changes in the future
  // it won't matter, as we will know from the Drill version written in the files that the dates are correct
  int rowGroupIndex = 0;
  Map<String, SchemaElement> schemaElements = ParquetReaderUtility.getColNameToSchemaElementMapping(footer);
  findDateColWithStatsLoop: for (SchemaPath schemaPath : columns) {
    List<ColumnDescriptor> parquetColumns = footer.getFileMetaData().getSchema().getColumns();
    for (int i = 0; i < parquetColumns.size(); ++i) {
      ColumnDescriptor column = parquetColumns.get(i);
      // this reader only supports flat data, this is restricted in the ParquetScanBatchCreator
      // creating a NameSegment makes sure we are using the standard code for comparing names,
      // currently it is all case-insensitive
      if (ColumnUtils.isStarQuery(columns) || new PathSegment.NameSegment(column.getPath()[0]).equals(schemaPath.getRootSegment())) {
        int colIndex = -1;
        ConvertedType convertedType = schemaElements.get(column.getPath()[0]).getConverted_type();
        if (convertedType != null && convertedType.equals(ConvertedType.DATE)) {
          List<ColumnChunkMetaData> colChunkList = footer.getBlocks().get(rowGroupIndex).getColumns();
          for (int j = 0; j < colChunkList.size(); j++) {
            if (colChunkList.get(j).getPath().equals(ColumnPath.get(column.getPath()))) {
              colIndex = j;
              break;
            }
          }
        }
        if (colIndex == -1) {
          // column does not appear in this file, skip it
          continue;
        }
        Statistics<?> statistics = footer.getBlocks().get(rowGroupIndex).getColumns().get(colIndex).getStatistics();
        if (statistics.hasNonNullValue()) {
          // read the max only once we know a non-null value exists
          Integer max = (Integer) statistics.genericGetMax();
          if (max > ParquetReaderUtility.DATE_CORRUPTION_THRESHOLD) {
            return DateCorruptionStatus.META_SHOWS_CORRUPTION;
          }
        } else {
          // no statistics, go check the first page
          return DateCorruptionStatus.META_UNCLEAR_TEST_VALUES;
        }
      }
    }
  }
  return DateCorruptionStatus.META_SHOWS_NO_CORRUPTION;
}
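
A caller would typically run this check once per footer and branch on the returned status. A minimal hypothetical call site (footer and columns are assumed to be in scope):

// hypothetical call site for the check above
DateCorruptionStatus status = ParquetReaderUtility.checkForCorruptDateValuesInStatistics(
    footer, columns, /* autoCorrectCorruptDates = */ true);
switch (status) {
  case META_SHOWS_CORRUPTION:
    // shift date values while reading
    break;
  case META_UNCLEAR_TEST_VALUES:
    // statistics were inconclusive; inspect actual page values
    break;
  case META_SHOWS_NO_CORRUPTION:
  default:
    // read dates as-is
    break;
}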
 
Example 7
Source File: StatisticsFilter.java    From parquet-mr with Apache License 2.0
private <T extends Comparable<T>, U extends UserDefinedPredicate<T>> Boolean visit(UserDefined<T, U> ud, boolean inverted) {
  Column<T> filterColumn = ud.getColumn();
  ColumnChunkMetaData columnChunk = getColumnChunk(filterColumn.getColumnPath());
  U udp = ud.getUserDefinedPredicate();

  if (columnChunk == null) {
    // the column isn't in this file, so all values are null.
    // let's run the udp with a null value to see whether it keeps null or not.
    if (inverted) {
      return udp.acceptsNullValue();
    } else {
      return !udp.acceptsNullValue();
    }
  }

  Statistics<T> stats = columnChunk.getStatistics();

  if (stats.isEmpty()) {
    // we have no statistics available, so we cannot drop any chunks
    return BLOCK_MIGHT_MATCH;
  }

  if (isAllNulls(columnChunk)) {
    // let's run the udp with a null value to see whether it keeps null or not.
    if (inverted) {
      return udp.acceptsNullValue();
    } else {
      return !udp.acceptsNullValue();
    }
  }

  if (!stats.hasNonNullValue()) {
    // stats do not contain min/max values, so we cannot drop any chunks
    return BLOCK_MIGHT_MATCH;
  }

  org.apache.parquet.filter2.predicate.Statistics<T> udpStats =
    new org.apache.parquet.filter2.predicate.Statistics<T>(stats.genericGetMin(), stats.genericGetMax(),
      stats.comparator());

  if (inverted) {
    return udp.inverseCanDrop(udpStats);
  } else {
    return udp.canDrop(udpStats);
  }
}
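
For context, a UserDefinedPredicate receives exactly the min/max pair assembled above. A minimal hypothetical predicate that keeps integers greater than 100 and uses those bounds to drop chunks (the class name and threshold are invented for this sketch):

import org.apache.parquet.filter2.predicate.Statistics;
import org.apache.parquet.filter2.predicate.UserDefinedPredicate;

public class GreaterThan100 extends UserDefinedPredicate<Integer> {
  @Override
  public boolean keep(Integer value) {
    return value != null && value > 100;
  }

  @Override
  public boolean canDrop(Statistics<Integer> statistics) {
    // every value in the chunk is <= max; if max <= 100, nothing can match
    return statistics.getComparator().compare(statistics.getMax(), 100) <= 0;
  }

  @Override
  public boolean inverseCanDrop(Statistics<Integer> statistics) {
    // the inverted predicate (value <= 100) can drop only if min > 100
    return statistics.getComparator().compare(statistics.getMin(), 100) > 0;
  }
}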