org.apache.parquet.column.statistics.IntStatistics Java Examples

The following examples show how to use org.apache.parquet.column.statistics.IntStatistics. Each example is drawn from an open-source project; the source file and originating project are noted above it.
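
Before diving into the examples, here is a minimal, self-contained sketch of the core IntStatistics API they all rely on (the class name and main wrapper are ours, purely for illustration, and assume the same parquet-column dependency the examples use):

import org.apache.parquet.column.statistics.IntStatistics;

public class IntStatisticsSketch {
    public static void main(String[] args) {
        IntStatistics stats = new IntStatistics();
        stats.updateStats(7);       // widens min/max to include 7
        stats.updateStats(42);      // widens min/max to include 42
        stats.incrementNumNulls();  // records one null value

        System.out.println(stats.getMin());      // 7
        System.out.println(stats.getMax());      // 42
        System.out.println(stats.getNumNulls()); // 1
        System.out.println(stats.isEmpty());     // false: non-null values were recorded
    }
}
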
Example #1
Source File: TupleDomainParquetPredicate.java    From presto with Apache License 2.0
private static Optional<ParquetIntegerStatistics> toParquetIntegerStatistics(Statistics<?> statistics, ParquetDataSourceId id, String column, boolean failOnCorruptedParquetStatistics)
        throws ParquetCorruptionException
{
    if (statistics instanceof LongStatistics) {
        LongStatistics longStatistics = (LongStatistics) statistics;
        if (longStatistics.genericGetMin() > longStatistics.genericGetMax()) {
            failWithCorruptionException(failOnCorruptedParquetStatistics, column, id, longStatistics);
            return Optional.empty();
        }
        return Optional.of(new ParquetIntegerStatistics(longStatistics.genericGetMin(), longStatistics.genericGetMax()));
    }

    if (statistics instanceof IntStatistics) {
        IntStatistics intStatistics = (IntStatistics) statistics;
        if (intStatistics.genericGetMin() > intStatistics.genericGetMax()) {
            failWithCorruptionException(failOnCorruptedParquetStatistics, column, id, intStatistics);
            return Optional.empty();
        }
        return Optional.of(new ParquetIntegerStatistics((long) intStatistics.getMin(), (long) intStatistics.getMax()));
    }

    throw new IllegalArgumentException("Cannot convert statistics of type " + statistics.getClass().getName());
}
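
The min-greater-than-max guard above is how this predicate flags statistics written by buggy writers. A standalone sketch of just that condition, with invented values:

import org.apache.parquet.column.statistics.IntStatistics;

public class CorruptStatsSketch {
    public static void main(String[] args) {
        IntStatistics corrupt = new IntStatistics();
        corrupt.setMinMax(100, 10); // deliberately inverted: min > max

        // Mirrors the guard in toParquetIntegerStatistics above
        boolean looksCorrupt = corrupt.genericGetMin() > corrupt.genericGetMax();
        System.out.println(looksCorrupt); // true
    }
}
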
 
Example #2
Source File: TestMetadataReader.java    From presto with Apache License 2.0
@Test(dataProvider = "allCreatedBy")
public void testReadStatsInt32(Optional<String> fileCreatedBy)
{
    // org.apache.parquet.format.Statistics: the Thrift struct read from the file footer
    Statistics statistics = new Statistics();
    statistics.setNull_count(13);
    statistics.setMin(fromHex("F6FFFFFF")); // -10 as little-endian INT32
    statistics.setMax(fromHex("3AA40000")); // 42042 as little-endian INT32
    assertThat(MetadataReader.readStats(fileCreatedBy, Optional.of(statistics), new PrimitiveType(OPTIONAL, INT32, "Test column")))
            .isInstanceOfSatisfying(IntStatistics.class, columnStatistics -> {
                assertEquals(columnStatistics.getNumNulls(), 13);
                assertEquals(columnStatistics.getMin(), -10);
                assertEquals(columnStatistics.getMax(), 42042);
                assertEquals(columnStatistics.genericGetMin(), (Integer) (int) -10); // boxed so assertEquals compares objects
                assertEquals(columnStatistics.genericGetMax(), (Integer) 42042);
            });
}
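
The hex literals in this test follow Parquet's PLAIN encoding for INT32 min/max, which is 4-byte little-endian. A standalone sketch of the decoding (the class and helper names are ours):

import java.nio.ByteBuffer;
import java.nio.ByteOrder;

public class LittleEndianInt32Sketch {
    static int decodeInt32(byte[] bytes) {
        // Parquet stores INT32 statistics as 4 little-endian bytes
        return ByteBuffer.wrap(bytes).order(ByteOrder.LITTLE_ENDIAN).getInt();
    }

    public static void main(String[] args) {
        System.out.println(decodeInt32(new byte[] {(byte) 0xF6, (byte) 0xFF, (byte) 0xFF, (byte) 0xFF})); // -10
        System.out.println(decodeInt32(new byte[] {(byte) 0x3A, (byte) 0xA4, 0x00, 0x00}));               // 42042
    }
}
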
 
Example #3
Source File: TestStatisticsFilter.java    From parquet-mr with Apache License 2.0
@Test
public void testEqNull() {
  IntStatistics statsNoNulls = new IntStatistics();
  statsNoNulls.setMinMax(10, 100);
  statsNoNulls.setNumNulls(0);

  IntStatistics statsSomeNulls = new IntStatistics();
  statsSomeNulls.setMinMax(10, 100);
  statsSomeNulls.setNumNulls(3);

  assertTrue(canDrop(eq(intColumn, null), Arrays.asList(
      getIntColumnMeta(statsNoNulls, 177L),
      getDoubleColumnMeta(doubleStats, 177L))));

  assertFalse(canDrop(eq(intColumn, null), Arrays.asList(
      getIntColumnMeta(statsSomeNulls, 177L),
      getDoubleColumnMeta(doubleStats, 177L))));

  assertFalse(canDrop(eq(missingColumn, null), columnMetas));

  assertFalse(canDrop(eq(intColumn, null), missingMinMaxColumnMetas));
  assertFalse(canDrop(eq(doubleColumn, null), missingMinMaxColumnMetas));
}
 
Example #4
Source File: TestStatisticsFilter.java    From parquet-mr with Apache License 2.0
@Test
public void testClearExceptionForNots() {
  List<ColumnChunkMetaData> columnMetas = Arrays.asList(
      getDoubleColumnMeta(new DoubleStatistics(), 0L),
      getIntColumnMeta(new IntStatistics(), 0L));

  FilterPredicate pred = and(not(eq(doubleColumn, 12.0)), eq(intColumn, 17));

  try {
    canDrop(pred, columnMetas);
    fail("This should throw");
  } catch (IllegalArgumentException e) {
    assertEquals("This predicate contains a not! Did you forget to run this predicate through LogicalInverseRewriter?"
        + " not(eq(double.column, 12.0))", e.getMessage());
  }
}
 
Example #5
Source File: TestParquetMetadataConverter.java    From parquet-mr with Apache License 2.0
private void testIntegerStats(StatsHelper helper) {
  // make fake stats and verify the size check
  IntStatistics stats = new IntStatistics();
  stats.incrementNumNulls(3004);
  int min = Integer.MIN_VALUE;
  int max = Integer.MAX_VALUE;
  stats.updateStats(min);
  stats.updateStats(max);

  org.apache.parquet.format.Statistics formatStats = helper.toParquetStatistics(stats);

  Assert.assertEquals("Min should match",
      min, BytesUtils.bytesToInt(formatStats.getMin()));
  Assert.assertEquals("Max should match",
      max, BytesUtils.bytesToInt(formatStats.getMax()));
  Assert.assertEquals("Num nulls should match",
      3004, formatStats.getNull_count());
}
 
Example #6
Source File: TestMetadataReader.java    From presto with Apache License 2.0
@Test(dataProvider = "allCreatedBy")
public void testReadNullStats(Optional<String> fileCreatedBy)
{
    // integer
    assertThat(MetadataReader.readStats(fileCreatedBy, Optional.empty(), new PrimitiveType(OPTIONAL, INT32, "Test column")))
            .isInstanceOfSatisfying(
                    IntStatistics.class,
                    columnStatistics -> assertTrue(columnStatistics.isEmpty()));

    // bigint
    assertThat(MetadataReader.readStats(fileCreatedBy, Optional.empty(), new PrimitiveType(OPTIONAL, INT64, "Test column")))
            .isInstanceOfSatisfying(
                    LongStatistics.class,
                    columnStatistics -> assertTrue(columnStatistics.isEmpty()));

    // varchar
    assertThat(MetadataReader.readStats(fileCreatedBy, Optional.empty(), new PrimitiveType(OPTIONAL, BINARY, "Test column", OriginalType.UTF8)))
            .isInstanceOfSatisfying(
                    BinaryStatistics.class,
                    columnStatistics -> assertTrue(columnStatistics.isEmpty()));

    // varbinary
    assertThat(MetadataReader.readStats(fileCreatedBy, Optional.empty(), new PrimitiveType(OPTIONAL, BINARY, "Test column")))
            .isInstanceOfSatisfying(
                    BinaryStatistics.class,
                    columnStatistics -> assertTrue(columnStatistics.isEmpty()));
}
 
Example #7
Source File: TestInputFormat.java    From parquet-mr with Apache License 2.0
public static BlockMetaData makeBlockFromStats(IntStatistics stats, long valueCount) {
  BlockMetaData blockMetaData = new BlockMetaData();

  ColumnChunkMetaData column = ColumnChunkMetaData.get(ColumnPath.get("foo"),
      PrimitiveTypeName.INT32,
      CompressionCodecName.GZIP,
      new HashSet<Encoding>(Arrays.asList(Encoding.PLAIN)),
      stats,
      100L, 100L, valueCount, 100L, 100L);
  blockMetaData.addColumn(column);
  blockMetaData.setTotalByteSize(200L);
  blockMetaData.setRowCount(valueCount);
  return blockMetaData;
}
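
This helper is reused by other tests in the suite, including the row-group filter test in Example #13 below. A hypothetical direct call, with invented values:

IntStatistics stats = new IntStatistics();
stats.setMinMax(10, 100);
stats.setNumNulls(4);
BlockMetaData block = makeBlockFromStats(stats, 301L); // a row group with 301 values
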
 
Example #8
Source File: TestStatisticsFilter.java    From parquet-mr with Apache License 2.0
@Test
public void testNotEqNonNull() {
  assertFalse(canDrop(notEq(intColumn, 9), columnMetas));
  assertFalse(canDrop(notEq(intColumn, 10), columnMetas));
  assertFalse(canDrop(notEq(intColumn, 100), columnMetas));
  assertFalse(canDrop(notEq(intColumn, 101), columnMetas));

  IntStatistics allSevens = new IntStatistics();
  allSevens.setMinMax(7, 7);
  assertTrue(canDrop(notEq(intColumn, 7), Arrays.asList(
      getIntColumnMeta(allSevens, 177L),
      getDoubleColumnMeta(doubleStats, 177L))));

  allSevens.setNumNulls(100L);
  assertFalse(canDrop(notEq(intColumn, 7), Arrays.asList(
      getIntColumnMeta(allSevens, 177L),
      getDoubleColumnMeta(doubleStats, 177L))));

  allSevens.setNumNulls(177L);
  assertFalse(canDrop(notEq(intColumn, 7), Arrays.asList(
      getIntColumnMeta(allSevens, 177L),
      getDoubleColumnMeta(doubleStats, 177L))));

  assertFalse(canDrop(notEq(missingColumn, fromString("any")), columnMetas));

  assertFalse(canDrop(notEq(intColumn, 50), missingMinMaxColumnMetas));
  assertFalse(canDrop(notEq(doubleColumn, 50.0), missingMinMaxColumnMetas));
}
 
Example #9
Source File: TestStatisticsFilter.java    From parquet-mr with Apache License 2.0
@Test
public void testNotEqNull() {
  IntStatistics statsNoNulls = new IntStatistics();
  statsNoNulls.setMinMax(10, 100);
  statsNoNulls.setNumNulls(0);

  IntStatistics statsSomeNulls = new IntStatistics();
  statsSomeNulls.setMinMax(10, 100);
  statsSomeNulls.setNumNulls(3);

  IntStatistics statsAllNulls = new IntStatistics();
  statsAllNulls.setMinMax(0, 0);
  statsAllNulls.setNumNulls(177);

  assertFalse(canDrop(notEq(intColumn, null), Arrays.asList(
      getIntColumnMeta(statsNoNulls, 177L),
      getDoubleColumnMeta(doubleStats, 177L))));

  assertFalse(canDrop(notEq(intColumn, null), Arrays.asList(
      getIntColumnMeta(statsSomeNulls, 177L),
      getDoubleColumnMeta(doubleStats, 177L))));

  assertTrue(canDrop(notEq(intColumn, null), Arrays.asList(
      getIntColumnMeta(statsAllNulls, 177L),
      getDoubleColumnMeta(doubleStats, 177L))));

  assertTrue(canDrop(notEq(missingColumn, null), columnMetas));

  assertFalse(canDrop(notEq(intColumn, null), missingMinMaxColumnMetas));
  assertFalse(canDrop(notEq(doubleColumn, null), missingMinMaxColumnMetas));
}
 
Example #10
Source File: ParquetReaderUtility.java    From Bats with Apache License 2.0
/**
 * Detect corrupt date values by looking at the min/max values in the metadata.
 *
 * This should only be used when a file does not have enough metadata to determine if
 * the data was written with an external tool or an older version of Drill
 * ({@link org.apache.drill.exec.store.parquet.ParquetRecordWriter#WRITER_VERSION_PROPERTY} <
 * {@link org.apache.drill.exec.store.parquet.ParquetReaderUtility#DRILL_WRITER_VERSION_STD_DATE_FORMAT})
 *
 * This method only checks the first Row Group, because Drill has only ever written
 * a single Row Group per file.
 *
 * @param footer parquet footer
 * @param columns list of columns schema path
 * @param autoCorrectCorruptDates user setting to allow enabling/disabling of auto-correction
 *                                of corrupt dates. There are some rare cases (storing dates thousands
 *                                of years into the future, with tools other than Drill writing files)
 *                                that would result in the date values being "corrected" into bad values.
 */
public static DateCorruptionStatus checkForCorruptDateValuesInStatistics(ParquetMetadata footer,
                                                            List<SchemaPath> columns,
                                                            boolean autoCorrectCorruptDates) {
  // Users can turn off date correction in cases where we detect corruption based on date values
  // that are unlikely to appear in common datasets. In that case, report that no correction needs
  // to happen during the file read
  if (!autoCorrectCorruptDates) {
    return DateCorruptionStatus.META_SHOWS_NO_CORRUPTION;
  }
  // Drill-produced files have only ever had a single row group; if this changes in the future it won't matter,
  // as we will know from the Drill version written in the files that the dates are correct
  int rowGroupIndex = 0;
  Map<String, SchemaElement> schemaElements = ParquetReaderUtility.getColNameToSchemaElementMapping(footer);
  findDateColWithStatsLoop: for (SchemaPath schemaPath : columns) {
    List<ColumnDescriptor> parquetColumns = footer.getFileMetaData().getSchema().getColumns();
    for (int i = 0; i < parquetColumns.size(); ++i) {
      ColumnDescriptor column = parquetColumns.get(i);
      // this reader only supports flat data; this is restricted in the ParquetScanBatchCreator.
      // Creating a NameSegment makes sure we are using the standard code for comparing names;
      // currently it is all case-insensitive
      if (Utilities.isStarQuery(columns)
          || getFullColumnPath(column).equalsIgnoreCase(schemaPath.getUnIndexed().toString())) {
        int colIndex = -1;
        ConvertedType convertedType = schemaElements.get(getFullColumnPath(column)).getConverted_type();
        if (convertedType != null && convertedType.equals(ConvertedType.DATE)) {
          List<ColumnChunkMetaData> colChunkList = footer.getBlocks().get(rowGroupIndex).getColumns();
          for (int j = 0; j < colChunkList.size(); j++) {
            if (colChunkList.get(j).getPath().equals(ColumnPath.get(column.getPath()))) {
              colIndex = j;
              break;
            }
          }
        }
        if (colIndex == -1) {
          // column does not appear in this file, skip it
          continue;
        }
        IntStatistics statistics = (IntStatistics) footer.getBlocks().get(rowGroupIndex).getColumns().get(colIndex).getStatistics();
        return (statistics.hasNonNullValue() && statistics.compareMaxToValue(ParquetReaderUtility.DATE_CORRUPTION_THRESHOLD) > 0) ?
            DateCorruptionStatus.META_SHOWS_CORRUPTION : DateCorruptionStatus.META_UNCLEAR_TEST_VALUES;
      }
    }
  }
  return DateCorruptionStatus.META_SHOWS_NO_CORRUPTION;
}
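
The decisive check is the final ternary: if the column has any non-null value and its max exceeds Drill's date-corruption threshold, the metadata is flagged as corrupt. A minimal sketch of that predicate in isolation, with an invented threshold constant (Drill's real one is ParquetReaderUtility.DATE_CORRUPTION_THRESHOLD):

import org.apache.parquet.column.statistics.IntStatistics;

public class DateThresholdSketch {
    // Invented stand-in; not Drill's actual threshold value
    static final int THRESHOLD = 1_000_000;

    static boolean maxLooksCorrupt(IntStatistics statistics) {
        // Same shape as the check in checkForCorruptDateValuesInStatistics above
        return statistics.hasNonNullValue() && statistics.compareMaxToValue(THRESHOLD) > 0;
    }

    public static void main(String[] args) {
        IntStatistics stats = new IntStatistics();
        stats.setMinMax(0, 2_000_000); // max above the invented threshold
        System.out.println(maxLooksCorrupt(stats)); // true
    }
}
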
 
Example #11
Source File: TestTupleDomainParquetPredicate.java    From presto with Apache License 2.0
private static IntStatistics intColumnStats(int minimum, int maximum)
{
    IntStatistics statistics = new IntStatistics();
    statistics.setMinMax(minimum, maximum);
    return statistics;
}
 
Example #12
Source File: TestStatisticsFilter.java    From parquet-mr with Apache License 2.0
@Test
public void testUdp() {
  FilterPredicate pred = userDefined(intColumn, SevensAndEightsUdp.class);
  FilterPredicate invPred = LogicalInverseRewriter.rewrite(not(userDefined(intColumn, SevensAndEightsUdp.class)));

  FilterPredicate udpDropMissingColumn = userDefined(missingColumn2, DropNullUdp.class);
  FilterPredicate invUdpDropMissingColumn = LogicalInverseRewriter.rewrite(not(userDefined(missingColumn2, DropNullUdp.class)));

  FilterPredicate udpKeepMissingColumn = userDefined(missingColumn2, SevensAndEightsUdp.class);
  FilterPredicate invUdpKeepMissingColumn = LogicalInverseRewriter.rewrite(not(userDefined(missingColumn2, SevensAndEightsUdp.class)));

  FilterPredicate allPositivePred = userDefined(doubleColumn, AllPositiveUdp.class);

  IntStatistics seven = new IntStatistics();
  seven.setMinMax(7, 7);

  IntStatistics eight = new IntStatistics();
  eight.setMinMax(8, 8);

  IntStatistics neither = new IntStatistics();
  neither.setMinMax(1, 2);

  assertTrue(canDrop(pred, Arrays.asList(
      getIntColumnMeta(seven, 177L),
      getDoubleColumnMeta(doubleStats, 177L))));

  assertFalse(canDrop(pred, Arrays.asList(
      getIntColumnMeta(eight, 177L),
      getDoubleColumnMeta(doubleStats, 177L))));

  assertFalse(canDrop(pred, Arrays.asList(
      getIntColumnMeta(neither, 177L),
      getDoubleColumnMeta(doubleStats, 177L))));

  assertFalse(canDrop(invPred, Arrays.asList(
      getIntColumnMeta(seven, 177L),
      getDoubleColumnMeta(doubleStats, 177L))));

  assertTrue(canDrop(invPred, Arrays.asList(
      getIntColumnMeta(eight, 177L),
      getDoubleColumnMeta(doubleStats, 177L))));

  assertFalse(canDrop(invPred, Arrays.asList(
      getIntColumnMeta(neither, 177L),
      getDoubleColumnMeta(doubleStats, 177L))));

  // udpDropMissingColumn drops null column.
  assertTrue(canDrop(udpDropMissingColumn, Arrays.asList(
      getIntColumnMeta(seven, 177L),
      getDoubleColumnMeta(doubleStats, 177L))));

  assertTrue(canDrop(udpDropMissingColumn, Arrays.asList(
      getIntColumnMeta(eight, 177L),
      getDoubleColumnMeta(doubleStats, 177L))));

  assertTrue(canDrop(udpDropMissingColumn, Arrays.asList(
      getIntColumnMeta(neither, 177L),
      getDoubleColumnMeta(doubleStats, 177L))));

  // invUdpDropMissingColumn (i.e., not(udpDropMissingColumn)) keeps null column.
  assertFalse(canDrop(invUdpDropMissingColumn, Arrays.asList(
      getIntColumnMeta(seven, 177L),
      getDoubleColumnMeta(doubleStats, 177L))));

  assertFalse(canDrop(invUdpDropMissingColumn, Arrays.asList(
      getIntColumnMeta(eight, 177L),
      getDoubleColumnMeta(doubleStats, 177L))));

  assertFalse(canDrop(invUdpDropMissingColumn, Arrays.asList(
      getIntColumnMeta(neither, 177L),
      getDoubleColumnMeta(doubleStats, 177L))));

  // udpKeepMissingColumn keeps null column.
  assertFalse(canDrop(udpKeepMissingColumn, Arrays.asList(
      getIntColumnMeta(seven, 177L),
      getDoubleColumnMeta(doubleStats, 177L))));

  assertFalse(canDrop(udpKeepMissingColumn, Arrays.asList(
      getIntColumnMeta(eight, 177L),
      getDoubleColumnMeta(doubleStats, 177L))));

  assertFalse(canDrop(udpKeepMissingColumn, Arrays.asList(
      getIntColumnMeta(neither, 177L),
      getDoubleColumnMeta(doubleStats, 177L))));

  // invUdpKeepMissingColumn (i.e., not(udpKeepMissingColumn)) drops null column.
  assertTrue(canDrop(invUdpKeepMissingColumn, Arrays.asList(
      getIntColumnMeta(seven, 177L),
      getDoubleColumnMeta(doubleStats, 177L))));

  assertTrue(canDrop(invUdpKeepMissingColumn, Arrays.asList(
      getIntColumnMeta(eight, 177L),
      getDoubleColumnMeta(doubleStats, 177L))));

  assertTrue(canDrop(invUdpKeepMissingColumn, Arrays.asList(
      getIntColumnMeta(neither, 177L),
      getDoubleColumnMeta(doubleStats, 177L))));

  assertFalse(canDrop(allPositivePred, missingMinMaxColumnMetas));
}
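
The UDPs referenced above (SevensAndEightsUdp, DropNullUdp, AllPositiveUdp) are helper classes defined elsewhere in TestStatisticsFilter. For orientation, here is a minimal sketch of what such a UserDefinedPredicate can look like; the class name and keep/drop logic are ours, not the actual test helpers:

import org.apache.parquet.filter2.predicate.Statistics;
import org.apache.parquet.filter2.predicate.UserDefinedPredicate;

public class SevensOnlyUdp extends UserDefinedPredicate<Integer> {
  @Override
  public boolean keep(Integer value) {
    // Nulls are passed to keep() as well; reject them here
    return value != null && value == 7;
  }

  @Override
  public boolean canDrop(Statistics<Integer> statistics) {
    // Drop the chunk if 7 cannot fall inside [min, max]
    return statistics.getMin() > 7 || statistics.getMax() < 7;
  }

  @Override
  public boolean inverseCanDrop(Statistics<Integer> statistics) {
    // Conservative sketch: never drop for not(this), since nulls make the inverse tricky
    return false;
  }
}
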
 
Example #13
Source File: TestRowGroupFilter.java    From parquet-mr with Apache License 2.0
@Test
public void testApplyRowGroupFilters() {

  List<BlockMetaData> blocks = new ArrayList<BlockMetaData>();

  IntStatistics stats1 = new IntStatistics();
  stats1.setMinMax(10, 100);
  stats1.setNumNulls(4);
  BlockMetaData b1 = makeBlockFromStats(stats1, 301);
  blocks.add(b1);

  IntStatistics stats2 = new IntStatistics();
  stats2.setMinMax(8, 102);
  stats2.setNumNulls(0);
  BlockMetaData b2 = makeBlockFromStats(stats2, 302);
  blocks.add(b2);

  IntStatistics stats3 = new IntStatistics();
  stats3.setMinMax(100, 102);
  stats3.setNumNulls(12);
  BlockMetaData b3 = makeBlockFromStats(stats3, 303);
  blocks.add(b3);

  IntStatistics stats4 = new IntStatistics();
  stats4.setMinMax(0, 0);
  stats4.setNumNulls(304);
  BlockMetaData b4 = makeBlockFromStats(stats4, 304);
  blocks.add(b4);

  IntStatistics stats5 = new IntStatistics();
  stats5.setMinMax(50, 50);
  stats5.setNumNulls(7);
  BlockMetaData b5 = makeBlockFromStats(stats5, 305);
  blocks.add(b5);

  IntStatistics stats6 = new IntStatistics();
  stats6.setMinMax(0, 0);
  stats6.setNumNulls(12);
  BlockMetaData b6 = makeBlockFromStats(stats6, 306);
  blocks.add(b6);

  MessageType schema = MessageTypeParser.parseMessageType("message Document { optional int32 foo; }");
  IntColumn foo = intColumn("foo");

  List<BlockMetaData> filtered = RowGroupFilter.filterRowGroups(FilterCompat.get(eq(foo, 50)), blocks, schema);
  assertEquals(Arrays.asList(b1, b2, b5), filtered);

  filtered = RowGroupFilter.filterRowGroups(FilterCompat.get(notEq(foo, 50)), blocks, schema);
  assertEquals(Arrays.asList(b1, b2, b3, b4, b5, b6), filtered);

  filtered = RowGroupFilter.filterRowGroups(FilterCompat.get(eq(foo, null)), blocks, schema);
  assertEquals(Arrays.asList(b1, b3, b4, b5, b6), filtered);

  filtered = RowGroupFilter.filterRowGroups(FilterCompat.get(notEq(foo, null)), blocks, schema);
  assertEquals(Arrays.asList(b1, b2, b3, b5, b6), filtered);

  filtered = RowGroupFilter.filterRowGroups(FilterCompat.get(eq(foo, 0)), blocks, schema);
  assertEquals(Arrays.asList(b6), filtered);
}