org.apache.parquet.filter2.predicate.FilterPredicate Java Examples

The following examples show how to use org.apache.parquet.filter2.predicate.FilterPredicate. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: TestBloomFiltering.java    From parquet-mr with Apache License 2.0 6 votes vote down vote up
/**
 * Verifies that reading with {@code actualFilter} yields the records selected by
 * {@code expectedFilter}: first with bloom filtering as the only enabled filter,
 * then with the other filters enabled and bloom filtering disabled.
 *
 * @param expectedFilter in-memory predicate describing the records that must be returned
 * @param actualFilter   Parquet predicate pushed down to the reader
 * @throws IOException if reading the test file fails
 */
private void assertCorrectFiltering(Predicate<PhoneBookWriter.User> expectedFilter, FilterPredicate actualFilter)
  throws IOException {
  // Pass 1: bloom filter on, every other filter off
  List<PhoneBookWriter.User> bloomOnly = readUsers(actualFilter, false, true);
  int readCount = bloomOnly.size();
  int totalCount = DATA.size();

  assertTrue("Bloom filtering should drop some row groups", readCount < totalCount);
  LOGGER.info("{}/{} records read; filtering ratio: {}%", readCount, totalCount,
    100 * readCount / totalCount);
  // Every record the expected filter selects must have been read
  assertContains(DATA.stream().filter(expectedFilter), bloomOnly);
  // Everything that was read must exist in the source data (non-matching records are allowed here)
  assertContains(bloomOnly.stream(), DATA);

  // Pass 2: other filters on, bloom filter off; the result must be exactly the expected records
  List<PhoneBookWriter.User> exact = readUsers(actualFilter, true, false);
  assertEquals(DATA.stream().filter(expectedFilter).collect(Collectors.toList()), exact);
}
 
Example #2
Source File: TestStatisticsFilter.java    From parquet-mr with Apache License 2.0 6 votes vote down vote up
/**
 * A predicate that still contains {@code not(...)} must make {@code canDrop} fail with an
 * {@link IllegalArgumentException} whose message names the offending sub-predicate.
 */
@Test
public void testClearExceptionForNots() {
  List<ColumnChunkMetaData> chunkMetas = Arrays.asList(
      getDoubleColumnMeta(new DoubleStatistics(), 0L),
      getIntColumnMeta(new IntStatistics(), 0L));

  FilterPredicate containsNot = and(not(eq(doubleColumn, 12.0)), eq(intColumn, 17));

  String expectedMessage =
      "This predicate contains a not! Did you forget to run this predicate through LogicalInverseRewriter?"
          + " not(eq(double.column, 12.0))";

  try {
    canDrop(containsNot, chunkMetas);
    fail("This should throw");
  } catch (IllegalArgumentException rejected) {
    assertEquals(expectedMessage, rejected.getMessage());
  }
}
 
Example #3
Source File: ParquetFilters.java    From iceberg with Apache License 2.0 6 votes vote down vote up
/**
 * Builds the Parquet predicate for a single comparison on {@code col}.
 * The null checks are expressed as (in)equality with {@code null}.
 *
 * @param op    the operation to translate
 * @param col   the Parquet column the predicate applies to
 * @param value the literal to compare with (not used for IS_NULL / NOT_NULL)
 * @return the matching {@link FilterApi} predicate
 * @throws UnsupportedOperationException if {@code op} has no translation here
 */
private static
<C extends Comparable<C>, COL extends Operators.Column<C> & Operators.SupportsLtGt>
FilterPredicate pred(Operation op, COL col, C value) {
  final FilterPredicate converted;
  switch (op) {
    case IS_NULL:
      converted = FilterApi.eq(col, null);
      break;
    case NOT_NULL:
      converted = FilterApi.notEq(col, null);
      break;
    case EQ:
      converted = FilterApi.eq(col, value);
      break;
    case NOT_EQ:
      converted = FilterApi.notEq(col, value);
      break;
    case GT:
      converted = FilterApi.gt(col, value);
      break;
    case GT_EQ:
      converted = FilterApi.gtEq(col, value);
      break;
    case LT:
      converted = FilterApi.lt(col, value);
      break;
    case LT_EQ:
      converted = FilterApi.ltEq(col, value);
      break;
    default:
      throw new UnsupportedOperationException("Unsupported predicate operation: " + op);
  }
  return converted;
}
 
Example #4
Source File: ParquetTableSource.java    From flink with Apache License 2.0 6 votes vote down vote up
/**
 * Translates a {@code LessThan} expression into a Parquet {@code lt} predicate.
 *
 * @param exp        the expression being converted; must be a {@code LessThan}
 * @param columnPair the Parquet column (f0) and the literal to compare against (f1)
 * @return the predicate, or {@code null} if the column is not an int, long, double or float column
 */
@Nullable
private FilterPredicate lessThan(Expression exp, Tuple2<Column, Comparable> columnPair) {
	Preconditions.checkArgument(exp instanceof LessThan, "exp has to be LessThan");

	final Column column = columnPair.f0;
	final Comparable literal = columnPair.f1;

	if (column instanceof IntColumn) {
		return FilterApi.lt((IntColumn) column, (Integer) literal);
	}
	if (column instanceof LongColumn) {
		return FilterApi.lt((LongColumn) column, (Long) literal);
	}
	if (column instanceof DoubleColumn) {
		return FilterApi.lt((DoubleColumn) column, (Double) literal);
	}
	if (column instanceof FloatColumn) {
		return FilterApi.lt((FloatColumn) column, (Float) literal);
	}

	// no lt predicate is built for any other column type
	return null;
}
 
Example #5
Source File: TestColumnIndexFiltering.java    From parquet-mr with Apache License 2.0 6 votes vote down vote up
/**
 * Verifies that reading with {@code actualFilter} yields the records selected by
 * {@code expectedFilter}: first with column-index filtering only, then with all filtering enabled.
 *
 * @param expectedFilter in-memory predicate describing the records that must be returned
 * @param actualFilter   Parquet predicate pushed down to the reader
 * @throws IOException if reading the test file fails
 */
private void assertCorrectFiltering(Predicate<User> expectedFilter, FilterPredicate actualFilter)
    throws IOException {
  // Pass 1: column-index based filtering only
  List<User> indexOnly = readUsers(actualFilter, false);
  int readCount = indexOnly.size();
  int totalCount = DATA.size();

  assertTrue("Column-index filtering should drop some pages", readCount < totalCount);
  LOGGER.info("{}/{} records read; filtering ratio: {}%", readCount, totalCount,
      100 * readCount / totalCount);
  // Every record the expected filter selects must have been read
  assertContains(DATA.stream().filter(expectedFilter), indexOnly);
  // Everything that was read must exist in the source data (non-matching records are allowed here)
  assertContains(indexOnly.stream(), DATA);

  // Pass 2: all filtering enabled; the result must be exactly the expected records
  List<User> exact = readUsers(actualFilter, true);
  assertEquals(DATA.stream().filter(expectedFilter).collect(Collectors.toList()), exact);
}
 
Example #6
Source File: TestRecordLevelFilters.java    From parquet-mr with Apache License 2.0 6 votes vote down vote up
/**
 * Filters the phone book with a user-defined predicate passed as an instance
 * ({@code SetInFilter}) and checks that exactly the users with the listed ids are kept.
 */
@Test
public void testUserDefinedByInstance() throws Exception {
  LongColumn idColumn = longColumn("id");

  final HashSet<Long> wantedIds = new HashSet<>();
  wantedIds.add(20L);
  wantedIds.add(27L);
  wantedIds.add(28L);

  FilterPredicate idInSet = userDefined(idColumn, new SetInFilter(wantedIds));
  List<Group> found = PhoneBookWriter.readFile(phonebookFile, FilterCompat.get(idInSet));

  UserFilter expected = new UserFilter() {
    @Override
    public boolean keep(User u) {
      if (u == null) {
        return false;
      }
      return wantedIds.contains(u.getId());
    }
  };
  assertFilter(found, expected);
}
 
Example #7
Source File: ParquetRecordFilterBuilder.java    From pxf with Apache License 2.0 6 votes vote down vote up
/**
 * Selects the {@link FilterApi} factory (equals or not-equals) that corresponds
 * to the given operator.
 *
 * @param operator the operator
 * @param <T>      the type
 * @param <C>      the column type
 * @return the FilterPredicate function
 * @throws UnsupportedOperationException if the operator is neither an equality nor an inequality form
 */
private static <T extends Comparable<T>, C extends Operators.Column<T> & Operators.SupportsEqNotEq> BiFunction<C, T, FilterPredicate> getOperatorWithEqNotEqSupport(Operator operator) {
    final BiFunction<C, T, FilterPredicate> predicateFactory;
    switch (operator) {
        // NOOP is handled like EQUALS: a boolean NOT wraps a NOOP
        //       NOT
        //        |
        //       NOOP
        //        |
        //    ---------
        //   |         |
        //   4        true
        // and that NOOP needs to be replaced with equals
        case NOOP:
        case IS_NULL:
        case EQUALS:
            predicateFactory = FilterApi::eq;
            break;

        case IS_NOT_NULL:
        case NOT_EQUALS:
            predicateFactory = FilterApi::notEq;
            break;

        default:
            throw new UnsupportedOperationException("not supported " + operator);
    }
    return predicateFactory;
}
 
Example #8
Source File: DictionaryFilterTest.java    From parquet-mr with Apache License 2.0 6 votes vote down vote up
/**
 * {@code and(...)} may be dropped as soon as one operand can be proven false by the
 * dictionary; it must be kept when both operands could match.
 */
@Test
public void testAnd() throws Exception {
  BinaryColumn binaryField = binaryColumn("binary_field");

  // upper-case letters are not in the dictionary, so these two can never match
  FilterPredicate absentB = eq(binaryField, Binary.fromString("B"));
  FilterPredicate absentC = eq(binaryField, Binary.fromString("C"));

  // lower-case letters are all in the dictionary, so these two may match
  FilterPredicate presentX = eq(binaryField, Binary.fromString("x"));
  FilterPredicate presentY = eq(binaryField, Binary.fromString("y"));

  boolean dropAbsentAndPresent = canDrop(and(absentB, presentY), ccmd, dictionaries);
  assertTrue("Should drop when either predicate must be false", dropAbsentAndPresent);

  boolean dropPresentAndAbsent = canDrop(and(presentX, absentC), ccmd, dictionaries);
  assertTrue("Should drop when either predicate must be false", dropPresentAndAbsent);

  boolean dropBothAbsent = canDrop(and(absentB, absentC), ccmd, dictionaries);
  assertTrue("Should drop when either predicate must be false", dropBothAbsent);

  boolean dropBothPresent = canDrop(and(presentX, presentY), ccmd, dictionaries);
  assertFalse("Should not drop when either predicate could be true", dropBothPresent);
}
 
Example #9
Source File: ParquetTableSource.java    From flink with Apache License 2.0 6 votes vote down vote up
/**
 * Converts a {@code LessThan} expression to the Parquet {@code lt} predicate for the
 * column/literal pair, dispatching on the concrete column type.
 *
 * @param exp        the expression being converted; must be a {@code LessThan}
 * @param columnPair the Parquet column (f0) and the literal to compare against (f1)
 * @return the predicate, or {@code null} if the column is not an int, long, double or float column
 */
@Nullable
private FilterPredicate lessThan(Expression exp, Tuple2<Column, Comparable> columnPair) {
	Preconditions.checkArgument(exp instanceof LessThan, "exp has to be LessThan");

	final Column target = columnPair.f0;
	final Comparable bound = columnPair.f1;

	if (target instanceof IntColumn) {
		return FilterApi.lt((IntColumn) target, (Integer) bound);
	}
	if (target instanceof LongColumn) {
		return FilterApi.lt((LongColumn) target, (Long) bound);
	}
	if (target instanceof DoubleColumn) {
		return FilterApi.lt((DoubleColumn) target, (Double) bound);
	}
	if (target instanceof FloatColumn) {
		return FilterApi.lt((FloatColumn) target, (Float) bound);
	}

	// any other column type yields no predicate
	return null;
}
 
Example #10
Source File: DictionaryFilterTest.java    From parquet-mr with Apache License 2.0 6 votes vote down vote up
/**
 * Checks dictionary-based dropping for negated user-defined predicates after they have
 * been rewritten by {@code LogicalInverseRewriter}: only the inverse of a UDP that
 * matches every dictionary value may be dropped.
 */
@Test
public void testInverseUdp() throws Exception {
  InInt32UDP matchesNothing = new InInt32UDP(ImmutableSet.of(42));
  InInt32UDP matchesSome = new InInt32UDP(ImmutableSet.of(205));
  Set<Integer> everyValue = ImmutableSet.copyOf(Ints.asList(intValues));
  InInt32UDP matchesAll = new InInt32UDP(everyValue);

  FilterPredicate notMatchesNothing =
    LogicalInverseRewriter.rewrite(not(userDefined(intColumn("int32_field"), matchesNothing)));
  FilterPredicate notMatchesSome =
    LogicalInverseRewriter.rewrite(not(userDefined(intColumn("int32_field"), matchesSome)));
  FilterPredicate notMatchesAll =
    LogicalInverseRewriter.rewrite(not(userDefined(intColumn("int32_field"), matchesAll)));

  boolean dropNotMatchesNothing = canDrop(notMatchesNothing, ccmd, dictionaries);
  assertFalse("Should not drop block for inverse of non-matching UDP", dropNotMatchesNothing);

  boolean dropNotMatchesSome = canDrop(notMatchesSome, ccmd, dictionaries);
  assertFalse("Should not drop block for inverse of UDP with some matches", dropNotMatchesSome);

  boolean dropNotMatchesAll = canDrop(notMatchesAll, ccmd, dictionaries);
  assertTrue("Should drop block for inverse of UDP with all matches", dropNotMatchesAll);
}
 
Example #11
Source File: DictionaryFilterTest.java    From parquet-mr with Apache License 2.0 6 votes vote down vote up
/**
 * {@code or(...)} may only be dropped when both operands can be proven false by the
 * dictionary; a single possibly-true operand keeps the block.
 */
@Test
public void testOr() throws Exception {
  BinaryColumn binaryField = binaryColumn("binary_field");

  // upper-case letters are not in the dictionary, so these two can never match
  FilterPredicate absentB = eq(binaryField, Binary.fromString("B"));
  FilterPredicate absentC = eq(binaryField, Binary.fromString("C"));

  // lower-case letters are all in the dictionary, so these two may match
  FilterPredicate presentX = eq(binaryField, Binary.fromString("x"));
  FilterPredicate presentY = eq(binaryField, Binary.fromString("y"));

  boolean dropAbsentOrPresent = canDrop(or(absentB, presentY), ccmd, dictionaries);
  assertFalse("Should not drop when one predicate could be true", dropAbsentOrPresent);

  boolean dropPresentOrAbsent = canDrop(or(presentX, absentC), ccmd, dictionaries);
  assertFalse("Should not drop when one predicate could be true", dropPresentOrAbsent);

  boolean dropBothAbsent = canDrop(or(absentB, absentC), ccmd, dictionaries);
  assertTrue("Should drop when both predicates must be false", dropBothAbsent);

  boolean dropBothPresent = canDrop(or(presentX, presentY), ccmd, dictionaries);
  assertFalse("Should not drop when one predicate could be true", dropBothPresent);
}
 
Example #12
Source File: ParquetFilters.java    From iceberg with Apache License 2.0 5 votes vote down vote up
/**
 * Combines two converted predicates with OR, simplifying around the
 * {@code AlwaysTrue} / {@code AlwaysFalse} sentinels before delegating to {@link FilterApi}.
 */
@Override
public FilterPredicate or(FilterPredicate left, FilterPredicate right) {
  // true OR x == true
  if (left == AlwaysTrue.INSTANCE || right == AlwaysTrue.INSTANCE) {
    return AlwaysTrue.INSTANCE;
  }
  // false OR x == x
  if (left == AlwaysFalse.INSTANCE) {
    return right;
  }
  if (right == AlwaysFalse.INSTANCE) {
    return left;
  }
  return FilterApi.or(left, right);
}
 
Example #13
Source File: ParquetFilters.java    From iceberg with Apache License 2.0 5 votes vote down vote up
/**
 * Binds an unbound predicate and converts the bound result. Binding may collapse the
 * predicate to a constant expression, which is mapped to the matching sentinel.
 *
 * @throws UnsupportedOperationException if the bound expression is neither a bound predicate
 *         nor one of the always-true / always-false constants
 */
@Override
@SuppressWarnings("unchecked")
public <T> FilterPredicate predicate(UnboundPredicate<T> pred) {
  Expression bound = bind(pred);

  if (bound instanceof BoundPredicate) {
    return predicate((BoundPredicate<?>) bound);
  }
  if (bound == Expressions.alwaysTrue()) {
    return AlwaysTrue.INSTANCE;
  }
  if (bound == Expressions.alwaysFalse()) {
    return AlwaysFalse.INSTANCE;
  }

  throw new UnsupportedOperationException("Cannot convert to Parquet filter: " + pred);
}
 
Example #14
Source File: ParquetTableSource.java    From flink with Apache License 2.0 5 votes vote down vote up
/**
 * Pushes down every expression that can be converted to a Parquet predicate.
 * Converted expressions are AND-ed into one predicate; the passed-in list is rewritten
 * in place so that it contains only the expressions that could not be converted.
 *
 * @param predicates the Flink filter expressions; modified to hold the unsupported ones
 * @return a new ParquetTableSource carrying the combined predicate (null if nothing was converted)
 */
@Override
public TableSource<Row> applyPredicate(List<Expression> predicates) {
	List<Expression> notPushedDown = new ArrayList<>(predicates.size());
	FilterPredicate combined = null;

	for (Expression candidate : predicates) {
		FilterPredicate translated = toParquetPredicate(candidate);
		if (translated == null) {
			// no Parquet equivalent for this expression
			notPushedDown.add(candidate);
			continue;
		}
		// the first converted predicate stands alone, later ones are chained with AND
		combined = (combined == null) ? translated : FilterApi.and(combined, translated);
	}

	// hand the unsupported expressions back through the input list
	predicates.clear();
	predicates.addAll(notPushedDown);

	return new ParquetTableSource(path, parquetSchema, this.parquetConfig, recursiveEnumeration, selectedFields, combined);
}
 
Example #15
Source File: TestRecordLevelFilters.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Negates the class-based {@code StartWithP} user-defined predicate and checks that the
 * users kept are those whose name is null or does not start with "p".
 */
@Test
public void testNameNotStartWithP() throws Exception {
  BinaryColumn nameColumn = binaryColumn("name");
  FilterPredicate notStartingWithP = not(userDefined(nameColumn, StartWithP.class));

  List<Group> found = PhoneBookWriter.readFile(phonebookFile, FilterCompat.get(notStartingWithP));

  UserFilter expected = new UserFilter() {
    @Override
    public boolean keep(User u) {
      if (u.getName() == null) {
        return true;
      }
      return !u.getName().startsWith("p");
    }
  };
  assertFilter(found, expected);
}
 
Example #16
Source File: ParquetLoader.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Builds the Parquet predicate for one pushed-down comparison, choosing the typed
 * Parquet column from the type of the field named by {@code col} in the schema.
 *
 * @param op    the comparison operator
 * @param col   the column being compared
 * @param value the constant operand
 * @return the predicate for the comparison
 * @throws RuntimeException if the field type or the operator is unsupported, or if the
 *         schema lookup raises a {@code FrontendException}
 */
private FilterPredicate buildFilter(OpType op, Column col, Const value) {
  String name = col.getName();
  try {
    FieldSchema field = schema.getField(name);
    switch (field.type) {
      case DataType.BOOLEAN: {
        // boolean columns only accept the equality operators
        Operators.BooleanColumn column = booleanColumn(name);
        switch (op) {
          case OP_EQ:
            return eq(column, getValue(value, column.getColumnType()));
          case OP_NE:
            return notEq(column, getValue(value, column.getColumnType()));
          default:
            throw new RuntimeException(
                "Operation " + op + " not supported for boolean column: " + name);
        }
      }
      case DataType.INTEGER:
        return op(op, intColumn(name), value);
      case DataType.LONG:
        return op(op, longColumn(name), value);
      case DataType.FLOAT:
        return op(op, floatColumn(name), value);
      case DataType.DOUBLE:
        return op(op, doubleColumn(name), value);
      case DataType.CHARARRAY:
        return op(op, binaryColumn(name), value);
      default:
        throw new RuntimeException("Unsupported type " + field.type + " for field: " + name);
    }
  } catch (FrontendException e) {
    throw new RuntimeException("Error processing pushdown for column:" + col, e);
  }
}
 
Example #17
Source File: TestRecordLevelFilters.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * A predicate that matches no record must produce an empty result.
 */
@Test
public void testAllFilter() throws Exception {
  BinaryColumn nameColumn = binaryColumn("name");
  FilterPredicate matchesNothing = eq(nameColumn, Binary.fromString("no matches"));

  List<Group> found = PhoneBookWriter.readFile(phonebookFile, FilterCompat.get(matchesNothing));

  List<Group> nothing = new ArrayList<Group>();
  assertEquals(nothing, found);
}
 
Example #18
Source File: ParquetFilters.java    From iceberg with Apache License 2.0 5 votes vote down vote up
/**
 * Converts an expression to a Parquet filter by visiting it with a
 * {@code ConvertFilterToParquet} visitor.
 *
 * @return {@code FilterCompat.NOOP} when the conversion yields {@code null} or
 *         {@code AlwaysTrue.INSTANCE}; otherwise the wrapped predicate
 */
static FilterCompat.Filter convert(Schema schema, Expression expr, boolean caseSensitive) {
  ConvertFilterToParquet converter = new ConvertFilterToParquet(schema, caseSensitive);
  FilterPredicate pred = ExpressionVisitors.visit(expr, converter);

  // TODO: handle AlwaysFalse.INSTANCE
  if (pred == null || pred == AlwaysTrue.INSTANCE) {
    return FilterCompat.NOOP;
  }

  // FilterCompat will apply LogicalInverseRewriter
  return FilterCompat.get(pred);
}
 
Example #19
Source File: ParquetTableSource.java    From flink with Apache License 2.0 5 votes vote down vote up
/**
 * Translates a {@code LessThanOrEqual} expression into a Parquet {@code ltEq} predicate.
 *
 * @param exp        the expression being converted; must be a {@code LessThanOrEqual}
 * @param columnPair the Parquet column (f0) and the literal to compare against (f1)
 * @return the predicate, or {@code null} if the column is not an int, long, double or float column
 */
@Nullable
private FilterPredicate lessThanOrEqual(Expression exp, Tuple2<Column, Comparable> columnPair) {
	Preconditions.checkArgument(exp instanceof LessThanOrEqual, "exp has to be LessThanOrEqual");

	final Column column = columnPair.f0;
	final Comparable literal = columnPair.f1;

	if (column instanceof IntColumn) {
		return FilterApi.ltEq((IntColumn) column, (Integer) literal);
	}
	if (column instanceof LongColumn) {
		return FilterApi.ltEq((LongColumn) column, (Long) literal);
	}
	if (column instanceof DoubleColumn) {
		return FilterApi.ltEq((DoubleColumn) column, (Double) literal);
	}
	if (column instanceof FloatColumn) {
		return FilterApi.ltEq((FloatColumn) column, (Float) literal);
	}

	// no ltEq predicate is built for any other column type
	return null;
}
 
Example #20
Source File: ParquetTableSource.java    From flink with Apache License 2.0 5 votes vote down vote up
/**
 * Creates a table source over the Parquet data at {@code path}.
 *
 * @param path                 location of the Parquet data; must not be null
 * @param parquetSchema        Parquet schema of the data; must not be null
 * @param configuration        configuration stored as the Parquet config; must not be null
 * @param recursiveEnumeration stored as-is on the source
 * @param selectedFields       indices of the projected fields, or null to keep every field
 * @param predicate            filter to push down, or null for none
 */
private ParquetTableSource(String path, MessageType parquetSchema, Configuration configuration,
								boolean recursiveEnumeration, @Nullable int[] selectedFields, @Nullable FilterPredicate predicate) {
	Preconditions.checkNotNull(path, "Path must not be null.");
	Preconditions.checkNotNull(parquetSchema, "ParquetSchema must not be null.");
	Preconditions.checkNotNull(configuration, "Configuration must not be null");

	this.path = path;
	this.parquetSchema = parquetSchema;
	this.parquetConfig = configuration;
	this.recursiveEnumeration = recursiveEnumeration;
	this.selectedFields = selectedFields;
	this.predicate = predicate;

	// a non-null predicate marks the filter as pushed down
	if (predicate != null) {
		this.isFilterPushedDown = true;
	}

	// row type derived from the full Parquet schema
	RowTypeInfo fullRowType = (RowTypeInfo) ParquetSchemaConverter.fromParquetType(parquetSchema);

	// the produced type is the full row type, narrowed to the selected fields when a projection is set
	this.typeInfo = (selectedFields == null)
		? fullRowType
		: RowTypeInfo.projectFields(fullRowType, selectedFields);

	// the table schema is built from the full row type's field names and types
	this.tableSchema = new TableSchema(
		fullRowType.getFieldNames(),
		fullRowType.getFieldTypes()
	);
}
 
Example #21
Source File: ParquetLoader.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Records the input location on this loader and on the job, then synchronises the
 * schema, required-field list and column-index-access flag between the UDF context
 * and the job configuration. If a filter predicate is stored in the UDF context it is
 * registered with {@code ParquetInputFormat}.
 *
 * The statement order matters: see the inline notes about what must happen before
 * and after {@code initSchema(job)}.
 *
 * @param location the input location
 * @param job      the job whose configuration is updated
 * @throws IOException declared for the helpers called here
 */
private void setInput(String location, Job job) throws IOException {
  this.setLocationHasBeenCalled  = true;
  this.location = location;
  setInputPaths(job, location);

  //This is prior to load because the initial value comes from the constructor
  //not file metadata or pig framework and would get overwritten in initSchema().
  if(UDFContext.getUDFContext().isFrontend()) {
    storeInUDFContext(PARQUET_COLUMN_INDEX_ACCESS, Boolean.toString(columnIndexAccess));
  }

  // Load the current state from the UDF context before initSchema() runs.
  schema = PigSchemaConverter.parsePigSchema(getPropertyFromUDFContext(PARQUET_PIG_SCHEMA));
  requiredFieldList = PigSchemaConverter.deserializeRequiredFieldList(getPropertyFromUDFContext(PARQUET_PIG_REQUIRED_FIELDS));
  columnIndexAccess = Boolean.parseBoolean(getPropertyFromUDFContext(PARQUET_COLUMN_INDEX_ACCESS));

  initSchema(job);

  // On the frontend, write the (possibly updated) schema and field list back to the UDF context.
  if(UDFContext.getUDFContext().isFrontend()) {
    //Setting for task-side loading via initSchema()
    storeInUDFContext(PARQUET_PIG_SCHEMA, pigSchemaToString(schema));
    storeInUDFContext(PARQUET_PIG_REQUIRED_FIELDS, serializeRequiredFieldList(requiredFieldList));
  }

  //Used by task-side loader via TupleReadSupport
  getConfiguration(job).set(PARQUET_PIG_SCHEMA, pigSchemaToString(schema));
  getConfiguration(job).set(PARQUET_PIG_REQUIRED_FIELDS, serializeRequiredFieldList(requiredFieldList));
  getConfiguration(job).set(PARQUET_COLUMN_INDEX_ACCESS, Boolean.toString(columnIndexAccess));

  // Forward a filter predicate stored in the UDF context, if there is one, to the input format.
  FilterPredicate filterPredicate = (FilterPredicate) getFromUDFContext(ParquetInputFormat.FILTER_PREDICATE);
  if(filterPredicate != null) {
    ParquetInputFormat.setFilterPredicate(getConfiguration(job), filterPredicate);
  }
}
 
Example #22
Source File: ParquetTableSource.java    From flink with Apache License 2.0 5 votes vote down vote up
/**
 * Translates a {@code GreaterThan} expression into a Parquet {@code gt} predicate.
 *
 * @param exp        the expression being converted; must be a {@code GreaterThan}
 * @param columnPair the Parquet column (f0) and the literal to compare against (f1)
 * @return the predicate, or {@code null} if the column is not an int, long, double or float column
 */
@Nullable
private FilterPredicate greaterThan(Expression exp, Tuple2<Column, Comparable> columnPair) {
	Preconditions.checkArgument(exp instanceof GreaterThan, "exp has to be GreaterThan");

	final Column column = columnPair.f0;
	final Comparable literal = columnPair.f1;

	if (column instanceof IntColumn) {
		return FilterApi.gt((IntColumn) column, (Integer) literal);
	}
	if (column instanceof LongColumn) {
		return FilterApi.gt((LongColumn) column, (Long) literal);
	}
	if (column instanceof DoubleColumn) {
		return FilterApi.gt((DoubleColumn) column, (Double) literal);
	}
	if (column instanceof FloatColumn) {
		return FilterApi.gt((FloatColumn) column, (Float) literal);
	}

	// no gt predicate is built for any other column type
	return null;
}
 
Example #23
Source File: ParquetTableSource.java    From flink with Apache License 2.0 5 votes vote down vote up
/**
 * Translates a {@code GreaterThanOrEqual} expression into a Parquet {@code gtEq} predicate.
 *
 * @param exp        the expression being converted; must be a {@code GreaterThanOrEqual}
 * @param columnPair the Parquet column (f0) and the literal to compare against (f1)
 * @return the predicate, or {@code null} if the column is not an int, long, double or float column
 */
@Nullable
private FilterPredicate greaterThanOrEqual(Expression exp, Tuple2<Column, Comparable> columnPair) {
	Preconditions.checkArgument(exp instanceof GreaterThanOrEqual, "exp has to be GreaterThanOrEqual");

	final Column column = columnPair.f0;
	final Comparable literal = columnPair.f1;

	if (column instanceof IntColumn) {
		return FilterApi.gtEq((IntColumn) column, (Integer) literal);
	}
	if (column instanceof LongColumn) {
		return FilterApi.gtEq((LongColumn) column, (Long) literal);
	}
	if (column instanceof DoubleColumn) {
		return FilterApi.gtEq((DoubleColumn) column, (Double) literal);
	}
	if (column instanceof FloatColumn) {
		return FilterApi.gtEq((FloatColumn) column, (Float) literal);
	}

	// no gtEq predicate is built for any other column type
	return null;
}
 
Example #24
Source File: TestBloomFiltering.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Reads the users of the test file through a reader configured with the given filter.
 *
 * @param filter            the predicate handed to the reader
 * @param useOtherFiltering switches the dictionary, stats, record and column-index filters together
 * @param useBloomFilter    switches the bloom filter
 * @return the users returned by {@code PhoneBookWriter.readUsers}
 * @throws IOException if reading fails
 */
private List<PhoneBookWriter.User> readUsers(FilterPredicate filter, boolean useOtherFiltering,
                                             boolean useBloomFilter) throws IOException {
  FilterCompat.Filter readFilter = FilterCompat.get(filter);
  return PhoneBookWriter.readUsers(
    ParquetReader.builder(new GroupReadSupport(), file)
      .withFilter(readFilter)
      // every non-bloom filter follows the same switch
      .useDictionaryFilter(useOtherFiltering)
      .useStatsFilter(useOtherFiltering)
      .useRecordFilter(useOtherFiltering)
      .useBloomFilter(useBloomFilter)
      .useColumnIndexFilter(useOtherFiltering));
}
 
Example #25
Source File: RowGroupFilter.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Returns the blocks that none of the enabled filter levels could prove droppable.
 * Levels are tried in the order statistics, dictionary, bloom filter; the first level
 * that reports the block as droppable ends the checks for that block.
 *
 * @param filterPredicateCompat wrapper holding the predicate to evaluate
 * @return the blocks to keep, in their original order
 */
@Override
public List<BlockMetaData> visit(FilterCompat.FilterPredicateCompat filterPredicateCompat) {
  FilterPredicate filterPredicate = filterPredicateCompat.getFilterPredicate();

  // check that the schema of the filter matches the schema of the file
  SchemaCompatibilityValidator.validate(filterPredicate, schema);

  List<BlockMetaData> keptBlocks = new ArrayList<>();

  for (BlockMetaData block : blocks) {
    if (levels.contains(FilterLevel.STATISTICS)
        && StatisticsFilter.canDrop(filterPredicate, block.getColumns())) {
      continue;
    }

    if (levels.contains(FilterLevel.DICTIONARY)
        && DictionaryFilter.canDrop(filterPredicate, block.getColumns(), reader.getDictionaryReader(block))) {
      continue;
    }

    if (levels.contains(FilterLevel.BLOOMFILTER)
        && BloomFilterImpl.canDrop(filterPredicate, block.getColumns(), reader.getBloomFilterDataReader(block))) {
      continue;
    }

    keptBlocks.add(block);
  }

  return keptBlocks;
}
 
Example #26
Source File: ParquetInputFormat.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Reads the filter predicate stored under {@code FILTER_PREDICATE} in the configuration.
 *
 * @param configuration the configuration to read from
 * @return the value produced by {@code SerializationUtil.readObjectFromConfAsBase64}
 * @throws RuntimeException wrapping any {@link IOException} raised while reading
 */
private static FilterPredicate getFilterPredicate(Configuration configuration) {
  final FilterPredicate stored;
  try {
    stored = SerializationUtil.readObjectFromConfAsBase64(FILTER_PREDICATE, configuration);
  } catch (IOException e) {
    throw new RuntimeException(e);
  }
  return stored;
}
 
Example #27
Source File: ParquetInputFormat.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Stores a filter predicate in the configuration: once as its {@code toString()} form
 * under the {@code ".human.readable"} key and once serialized under {@code FILTER_PREDICATE}.
 *
 * @param configuration   the configuration to write to
 * @param filterPredicate the predicate to store
 * @throws IllegalArgumentException if an UnboundRecordFilter has already been provided
 * @throws RuntimeException wrapping any {@link IOException} raised while serializing
 */
public static void setFilterPredicate(Configuration configuration, FilterPredicate filterPredicate) {
  boolean noRecordFilterSet = getUnboundRecordFilter(configuration) == null;
  checkArgument(noRecordFilterSet,
      "You cannot provide a FilterPredicate after providing an UnboundRecordFilter");

  String readableKey = FILTER_PREDICATE + ".human.readable";
  configuration.set(readableKey, filterPredicate.toString());

  try {
    SerializationUtil.writeObjectToConfAsBase64(FILTER_PREDICATE, filterPredicate, configuration);
  } catch (IOException e) {
    throw new RuntimeException(e);
  }
}
 
Example #28
Source File: FilteringBenchmarks.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Reads one record through a reader filtered by equality with a random long and hands
 * it to the blackhole.
 *
 * @param blackhole consumer for the record that was read
 * @param context   supplies the random source and the reader builder
 */
private void benchmark(Blackhole blackhole, BaseContext context) throws Exception {
  long randomKey = context.getRandom().nextLong();
  FilterPredicate equalsRandomKey = FilterApi.eq(BaseContext.COLUMN, randomKey);

  try (ParquetReader<Group> reader =
      context.createReaderBuilder().withFilter(FilterCompat.get(equalsRandomKey)).build()) {
    Group firstRecord = reader.read();
    blackhole.consume(firstRecord);
  }
}
 
Example #29
Source File: FilterCompat.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Wraps a FilterPredicate in a Filter.
 * The predicate is logged, rewritten so that it no longer contains the not() operator,
 * and logged again if the rewrite changed it.
 *
 * @param filterPredicate a filter predicate
 * @return a filter for the given predicate
 */
public static Filter get(FilterPredicate filterPredicate) {
  Objects.requireNonNull(filterPredicate, "filterPredicate cannot be null");
  LOG.info("Filtering using predicate: {}", filterPredicate);

  // push the not() operators out of the predicate
  FilterPredicate withoutNots = LogicalInverseRewriter.rewrite(filterPredicate);

  boolean unchanged = filterPredicate.equals(withoutNots);
  if (!unchanged) {
    LOG.info("Predicate has been collapsed to: {}", withoutNots);
  }

  return new FilterPredicateCompat(withoutNots);
}
 
Example #30
Source File: FilterCompat.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Wraps whichever of the two filter kinds was supplied.
 * <p>
 * At most one of {@code filterPredicate} and {@code unboundRecordFilter} may be non-null;
 * supplying both is rejected.
 * <p>
 * If both are null, the no op filter is returned.
 *
 * @param filterPredicate a filter predicate, or null
 * @param unboundRecordFilter an unbound record filter, or null
 * @return a Filter wrapping either the predicate or the unbound record filter (from the old API)
 */
public static Filter get(FilterPredicate filterPredicate, UnboundRecordFilter unboundRecordFilter) {
  boolean hasPredicate = filterPredicate != null;
  boolean hasRecordFilter = unboundRecordFilter != null;

  checkArgument(!(hasPredicate && hasRecordFilter),
      "Cannot provide both a FilterPredicate and an UnboundRecordFilter");

  if (hasPredicate) {
    return get(filterPredicate);
  } else if (hasRecordFilter) {
    return get(unboundRecordFilter);
  } else {
    return NOOP;
  }
}