org.apache.parquet.filter2.predicate.FilterApi Java Examples

The following examples show how to use org.apache.parquet.filter2.predicate.FilterApi. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: TestInputOutputFormat.java    From parquet-mr with Apache License 2.0 6 votes vote down vote up
/**
 * Uses a filter that drops all records to test handling of tasks (mappers) that need to do no work at all.
 *
 * @throws IOException            if the job fails or the output cannot be read
 * @throws ClassNotFoundException if the job's classes cannot be resolved
 * @throws InterruptedException   if the job is interrupted
 */
@Test
public void testReadWriteTaskSideMDAggressiveFilter() throws IOException, ClassNotFoundException, InterruptedException {
  Configuration conf = new Configuration();

  // this filter predicate should trigger row group filtering that drops all row-groups
  ParquetInputFormat.setFilterPredicate(conf, FilterApi.eq(FilterApi.intColumn("line"), -1000));
  final String fpString = conf.get(ParquetInputFormat.FILTER_PREDICATE);

  // build the extra configuration explicitly instead of via double-brace
  // initialization, which creates a needless anonymous subclass that captures
  // the enclosing test instance
  HashMap<String, String> extraConf = new HashMap<>();
  extraConf.put("parquet.task.side.metadata", "true");
  extraConf.put(ParquetInputFormat.FILTER_PREDICATE, fpString);
  runMapReduceJob(CompressionCodecName.UNCOMPRESSED, extraConf);

  // the predicate matches no rows, so the mapper output file must be empty
  File file = new File(outputPath.toString(), "part-m-00000");
  List<String> lines = Files.readAllLines(file.toPath(), StandardCharsets.UTF_8);
  assertTrue(lines.isEmpty());
}
 
Example #2
Source File: ParquetTableSource.java    From flink with Apache License 2.0 6 votes vote down vote up
@Nullable
private FilterPredicate lessThan(Expression exp, Tuple2<Column, Comparable> columnPair) {
	Preconditions.checkArgument(exp instanceof LessThan, "exp has to be LessThan");

	if (columnPair.f0 instanceof IntColumn) {
		return FilterApi.lt((IntColumn) columnPair.f0, (Integer) columnPair.f1);
	} else if (columnPair.f0 instanceof LongColumn) {
		return FilterApi.lt((LongColumn) columnPair.f0, (Long) columnPair.f1);
	} else if (columnPair.f0 instanceof DoubleColumn) {
		return FilterApi.lt((DoubleColumn) columnPair.f0, (Double) columnPair.f1);
	} else if (columnPair.f0 instanceof FloatColumn) {
		return FilterApi.lt((FloatColumn) columnPair.f0, (Float) columnPair.f1);
	}

	return null;
}
 
Example #3
Source File: ParquetTableSource.java    From flink with Apache License 2.0 6 votes vote down vote up
/**
 * Translates a {@code LessThan} expression to a Parquet {@code lt} predicate
 * for the extracted column/literal pair, or {@code null} for unsupported columns.
 */
@Nullable
private FilterPredicate lessThan(Expression exp, Tuple2<Column, Comparable> columnPair) {
	Preconditions.checkArgument(exp instanceof LessThan, "exp has to be LessThan");

	Column col = columnPair.f0;
	Comparable value = columnPair.f1;

	if (col instanceof IntColumn) {
		return FilterApi.lt((IntColumn) col, (Integer) value);
	}
	if (col instanceof LongColumn) {
		return FilterApi.lt((LongColumn) col, (Long) value);
	}
	if (col instanceof DoubleColumn) {
		return FilterApi.lt((DoubleColumn) col, (Double) value);
	}
	if (col instanceof FloatColumn) {
		return FilterApi.lt((FloatColumn) col, (Float) value);
	}

	// no push-down possible for this column type
	return null;
}
 
Example #4
Source File: ParquetFilters.java    From iceberg with Apache License 2.0 6 votes vote down vote up
/**
 * Builds a Parquet FilterPredicate for the given operation on a column that
 * supports both eq/notEq and lt/gt style predicates.
 */
private static
<C extends Comparable<C>, COL extends Operators.Column<C> & Operators.SupportsLtGt>
FilterPredicate pred(Operation op, COL col, C value) {
  if (op == Operation.IS_NULL) {
    return FilterApi.eq(col, null);
  }
  if (op == Operation.NOT_NULL) {
    return FilterApi.notEq(col, null);
  }
  if (op == Operation.EQ) {
    return FilterApi.eq(col, value);
  }
  if (op == Operation.NOT_EQ) {
    return FilterApi.notEq(col, value);
  }
  if (op == Operation.GT) {
    return FilterApi.gt(col, value);
  }
  if (op == Operation.GT_EQ) {
    return FilterApi.gtEq(col, value);
  }
  if (op == Operation.LT) {
    return FilterApi.lt(col, value);
  }
  if (op == Operation.LT_EQ) {
    return FilterApi.ltEq(col, value);
  }
  throw new UnsupportedOperationException("Unsupported predicate operation: " + op);
}
 
Example #5
Source File: ParquetFilters.java    From iceberg with Apache License 2.0 6 votes vote down vote up
/**
 * Maps each supported comparison/nullability operation onto the matching
 * FilterApi factory; throws for anything else.
 */
@SuppressWarnings("checkstyle:MethodTypeParameterName")
private static <C extends Comparable<C>, COL extends Operators.Column<C> & Operators.SupportsLtGt>
    FilterPredicate pred(Operation op, COL col, C value) {
  switch (op) {
    case EQ:       return FilterApi.eq(col, value);
    case NOT_EQ:   return FilterApi.notEq(col, value);
    case LT:       return FilterApi.lt(col, value);
    case LT_EQ:    return FilterApi.ltEq(col, value);
    case GT:       return FilterApi.gt(col, value);
    case GT_EQ:    return FilterApi.gtEq(col, value);
    case IS_NULL:  return FilterApi.eq(col, null);
    case NOT_NULL: return FilterApi.notEq(col, null);
    default:
      throw new UnsupportedOperationException("Unsupported predicate operation: " + op);
  }
}
 
Example #6
Source File: ParquetRecordFilterBuilder.java    From pxf with Apache License 2.0 6 votes vote down vote up
/**
 * Returns the FilterPredicate function that supports equals and not equals
 * for the given operator.
 *
 * @param operator the operator
 * @param <T>      the type
 * @param <C>      the column type
 * @return the FilterPredicate function
 */
private static <T extends Comparable<T>, C extends Operators.Column<T> & Operators.SupportsEqNotEq> BiFunction<C, T, FilterPredicate> getOperatorWithEqNotEqSupport(Operator operator) {
    // NOOP maps to equals because a NOT boolean wraps a NOOP:
    //       NOT
    //        |
    //       NOOP
    //        |
    //    ---------
    //   |         |
    //   4        true
    // and that NOOP needs to be replaced with equals
    if (operator == Operator.IS_NULL || operator == Operator.EQUALS || operator == Operator.NOOP) {
        return FilterApi::eq;
    }
    if (operator == Operator.IS_NOT_NULL || operator == Operator.NOT_EQUALS) {
        return FilterApi::notEq;
    }
    throw new UnsupportedOperationException("not supported " + operator);
}
 
Example #7
Source File: ParquetRecordFilterBuilder.java    From pxf with Apache License 2.0 6 votes vote down vote up
/**
 * Returns the FilterPredicate function that supports less than /
 * greater than for the given operator.
 *
 * @param operator the operator
 * @param <T>      the type
 * @param <C>      the column type
 * @return the FilterPredicate function
 */
private static <T extends Comparable<T>, C extends Operators.Column<T> & Operators.SupportsLtGt> BiFunction<C, T, FilterPredicate> getOperatorWithLtGtSupport(Operator operator) {
    if (operator == Operator.LESS_THAN) {
        return FilterApi::lt;
    }
    if (operator == Operator.GREATER_THAN) {
        return FilterApi::gt;
    }
    if (operator == Operator.LESS_THAN_OR_EQUAL) {
        return FilterApi::ltEq;
    }
    if (operator == Operator.GREATER_THAN_OR_EQUAL) {
        return FilterApi::gtEq;
    }
    // everything else may still be an eq/notEq style operator
    return getOperatorWithEqNotEqSupport(operator);
}
 
Example #8
Source File: ParquetTableSource.java    From flink with Apache License 2.0 5 votes vote down vote up
/**
 * Converts a Flink {@code GreaterThan} expression into a Parquet greater-than
 * predicate, or {@code null} for column types that cannot be pushed down.
 */
@Nullable
private FilterPredicate greaterThan(Expression exp, Tuple2<Column, Comparable> columnPair) {
	Preconditions.checkArgument(exp instanceof GreaterThan, "exp has to be GreaterThan");

	Column column = columnPair.f0;
	Comparable literal = columnPair.f1;

	if (column instanceof IntColumn) {
		return FilterApi.gt((IntColumn) column, (Integer) literal);
	}
	if (column instanceof LongColumn) {
		return FilterApi.gt((LongColumn) column, (Long) literal);
	}
	if (column instanceof DoubleColumn) {
		return FilterApi.gt((DoubleColumn) column, (Double) literal);
	}
	if (column instanceof FloatColumn) {
		return FilterApi.gt((FloatColumn) column, (Float) literal);
	}

	// column type not supported for greater-than push-down
	return null;
}
 
Example #9
Source File: FilteringBenchmarks.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Reads the first record through a random equality filter so the JIT cannot
 * constant-fold the predicate; the result is sunk into the blackhole.
 */
private void benchmark(Blackhole blackhole, BaseContext context) throws Exception {
  long probeValue = context.getRandom().nextLong();
  FilterCompat.Filter rowFilter = FilterCompat.get(FilterApi.eq(BaseContext.COLUMN, probeValue));
  try (ParquetReader<Group> reader = context.createReaderBuilder().withFilter(rowFilter).build()) {
    blackhole.consume(reader.read());
  }
}
 
Example #10
Source File: ParquetTableSource.java    From flink with Apache License 2.0 5 votes vote down vote up
/**
 * Converts a Flink {@code LessThanOrEqual} expression into a Parquet
 * {@code ltEq} predicate, or {@code null} for unsupported column types.
 */
@Nullable
private FilterPredicate lessThanOrEqual(Expression exp, Tuple2<Column, Comparable> columnPair) {
	Preconditions.checkArgument(exp instanceof LessThanOrEqual, "exp has to be LessThanOrEqual");

	Column column = columnPair.f0;
	Comparable literal = columnPair.f1;

	if (column instanceof IntColumn) {
		return FilterApi.ltEq((IntColumn) column, (Integer) literal);
	}
	if (column instanceof LongColumn) {
		return FilterApi.ltEq((LongColumn) column, (Long) literal);
	}
	if (column instanceof DoubleColumn) {
		return FilterApi.ltEq((DoubleColumn) column, (Double) literal);
	}
	if (column instanceof FloatColumn) {
		return FilterApi.ltEq((FloatColumn) column, (Float) literal);
	}

	// column type not supported for push-down
	return null;
}
 
Example #11
Source File: ParquetTableSource.java    From flink with Apache License 2.0 5 votes vote down vote up
/**
 * Converts a Flink {@code GreaterThanOrEqual} expression into a Parquet
 * {@code gtEq} predicate, or {@code null} for unsupported column types.
 */
@Nullable
private FilterPredicate greaterThanOrEqual(Expression exp, Tuple2<Column, Comparable> columnPair) {
	Preconditions.checkArgument(exp instanceof GreaterThanOrEqual, "exp has to be GreaterThanOrEqual");

	Column column = columnPair.f0;
	Comparable literal = columnPair.f1;

	if (column instanceof IntColumn) {
		return FilterApi.gtEq((IntColumn) column, (Integer) literal);
	}
	if (column instanceof LongColumn) {
		return FilterApi.gtEq((LongColumn) column, (Long) literal);
	}
	if (column instanceof DoubleColumn) {
		return FilterApi.gtEq((DoubleColumn) column, (Double) literal);
	}
	if (column instanceof FloatColumn) {
		return FilterApi.gtEq((FloatColumn) column, (Float) literal);
	}

	// column type not supported for push-down
	return null;
}
 
Example #12
Source File: ParquetTableSource.java    From flink with Apache License 2.0 5 votes vote down vote up
/**
 * Translates a {@code GreaterThan} expression to a Parquet {@code gt}
 * predicate for the extracted column/literal pair; unsupported column types
 * yield {@code null}.
 */
@Nullable
private FilterPredicate greaterThan(Expression exp, Tuple2<Column, Comparable> columnPair) {
	Preconditions.checkArgument(exp instanceof GreaterThan, "exp has to be GreaterThan");

	Column col = columnPair.f0;
	Comparable value = columnPair.f1;

	if (col instanceof IntColumn) {
		return FilterApi.gt((IntColumn) col, (Integer) value);
	}
	if (col instanceof LongColumn) {
		return FilterApi.gt((LongColumn) col, (Long) value);
	}
	if (col instanceof DoubleColumn) {
		return FilterApi.gt((DoubleColumn) col, (Double) value);
	}
	if (col instanceof FloatColumn) {
		return FilterApi.gt((FloatColumn) col, (Float) value);
	}

	// no push-down possible for this column type
	return null;
}
 
Example #13
Source File: ParquetTableSource.java    From flink with Apache License 2.0 5 votes vote down vote up
/**
 * Converts as many of the given predicates as possible into a single Parquet
 * FilterPredicate, leaves the unconvertible expressions in {@code predicates}
 * for the planner, and returns a copy of this source with the filter applied.
 */
@Override
public TableSource<Row> applyPredicate(List<Expression> predicates) {

	// partition the Flink expressions into convertible and unsupported ones
	List<FilterPredicate> converted = new ArrayList<>(predicates.size());
	List<Expression> unsupported = new ArrayList<>(predicates.size());
	for (Expression candidate : predicates) {
		FilterPredicate parquetFilter = toParquetPredicate(candidate);
		if (parquetFilter == null) {
			unsupported.add(candidate);
		} else {
			converted.add(parquetFilter);
		}
	}

	// the caller keeps only the expressions this source could not handle
	predicates.clear();
	predicates.addAll(unsupported);

	// AND all converted predicates together (null when nothing was convertible)
	FilterPredicate parquetPredicate = null;
	for (FilterPredicate piece : converted) {
		parquetPredicate = (parquetPredicate == null) ? piece : FilterApi.and(parquetPredicate, piece);
	}

	// create and return a new ParquetTableSource with the combined predicate
	return new ParquetTableSource(path, parquetSchema, this.parquetConfig, recursiveEnumeration, selectedFields, parquetPredicate);
}
 
Example #14
Source File: ParquetFilters.java    From iceberg with Apache License 2.0 5 votes vote down vote up
/**
 * OR with constant folding: a false side is the identity, a true side makes
 * the whole disjunction true; otherwise delegate to Parquet.
 */
@Override
public FilterPredicate or(FilterPredicate left, FilterPredicate right) {
  if (left == AlwaysFalse.INSTANCE) {
    return right;
  }
  if (right == AlwaysFalse.INSTANCE) {
    return left;
  }
  if (left == AlwaysTrue.INSTANCE || right == AlwaysTrue.INSTANCE) {
    return AlwaysTrue.INSTANCE;
  }
  return FilterApi.or(left, right);
}
 
Example #15
Source File: ParquetFilters.java    From iceberg with Apache License 2.0 5 votes vote down vote up
/**
 * AND with constant folding: a true side is the identity, a false side makes
 * the whole conjunction false; otherwise delegate to Parquet.
 */
@Override
public FilterPredicate and(FilterPredicate left, FilterPredicate right) {
  if (left == AlwaysTrue.INSTANCE) {
    return right;
  }
  if (right == AlwaysTrue.INSTANCE) {
    return left;
  }
  if (left == AlwaysFalse.INSTANCE || right == AlwaysFalse.INSTANCE) {
    return AlwaysFalse.INSTANCE;
  }
  return FilterApi.and(left, right);
}
 
Example #16
Source File: ParquetFilters.java    From iceberg with Apache License 2.0 5 votes vote down vote up
/**
 * NOT with constant folding: negating a constant flips it; anything else is
 * delegated to Parquet's {@code not}.
 */
@Override
public FilterPredicate not(FilterPredicate child) {
  if (child == AlwaysFalse.INSTANCE) {
    return AlwaysTrue.INSTANCE;
  }
  return child == AlwaysTrue.INSTANCE ? AlwaysFalse.INSTANCE : FilterApi.not(child);
}
 
Example #17
Source File: ParquetFilters.java    From iceberg with Apache License 2.0 5 votes vote down vote up
/**
 * Folds constant operands out of an OR before building the Parquet predicate:
 * false is the identity element, true short-circuits the whole disjunction.
 */
@Override
public FilterPredicate or(FilterPredicate left, FilterPredicate right) {
  if (left == AlwaysFalse.INSTANCE) {
    return right;
  }
  if (right == AlwaysFalse.INSTANCE) {
    return left;
  }
  if (left == AlwaysTrue.INSTANCE || right == AlwaysTrue.INSTANCE) {
    return AlwaysTrue.INSTANCE;
  }
  return FilterApi.or(left, right);
}
 
Example #18
Source File: ParquetFilters.java    From iceberg with Apache License 2.0 5 votes vote down vote up
/**
 * Folds constant operands out of an AND before building the Parquet predicate:
 * true is the identity element, false short-circuits the whole conjunction.
 */
@Override
public FilterPredicate and(FilterPredicate left, FilterPredicate right) {
  if (left == AlwaysTrue.INSTANCE) {
    return right;
  }
  if (right == AlwaysTrue.INSTANCE) {
    return left;
  }
  if (left == AlwaysFalse.INSTANCE || right == AlwaysFalse.INSTANCE) {
    return AlwaysFalse.INSTANCE;
  }
  return FilterApi.and(left, right);
}
 
Example #19
Source File: ParquetFilters.java    From iceberg with Apache License 2.0 5 votes vote down vote up
/**
 * Negation with constant folding; non-constant children are wrapped in a
 * Parquet {@code not} predicate.
 */
@Override
public FilterPredicate not(FilterPredicate child) {
  if (child == AlwaysTrue.INSTANCE) {
    return AlwaysFalse.INSTANCE;
  }
  return child == AlwaysFalse.INSTANCE ? AlwaysTrue.INSTANCE : FilterApi.not(child);
}
 
Example #20
Source File: ParquetTableSource.java    From flink with Apache License 2.0 5 votes vote down vote up
/**
 * Extracts the Parquet filter column and the comparison literal from a binary
 * comparison expression.
 *
 * @param comp the comparison to decompose
 * @return the (column, literal) pair, or {@code null} if the literal is not
 *     comparable or the literal type is not supported for push-down
 */
@Nullable
private Tuple2<Column, Comparable> extractColumnAndLiteral(BinaryComparison comp) {
	TypeInformation<?> typeInfo = getLiteralType(comp);
	String columnName = getColumnName(comp);

	// fetch literal and ensure it is comparable
	Object value = getLiteral(comp);
	// validate that literal is comparable
	if (!(value instanceof Comparable)) {
		LOG.warn("Encountered a non-comparable literal of type {}." +
			"Cannot push predicate [{}] into ParquetTablesource." +
			"This is a bug and should be reported.", value.getClass().getCanonicalName(), comp);
		return null;
	}

	if (typeInfo == BasicTypeInfo.BYTE_TYPE_INFO ||
		typeInfo == BasicTypeInfo.SHORT_TYPE_INFO ||
		typeInfo == BasicTypeInfo.INT_TYPE_INFO) {
		// widen via Number: a direct (Integer) cast throws ClassCastException
		// when the literal is actually a Byte or a Short
		return new Tuple2<>(FilterApi.intColumn(columnName), ((Number) value).intValue());
	} else if (typeInfo == BasicTypeInfo.LONG_TYPE_INFO) {
		return new Tuple2<>(FilterApi.longColumn(columnName), ((Number) value).longValue());
	} else if (typeInfo == BasicTypeInfo.FLOAT_TYPE_INFO) {
		return new Tuple2<>(FilterApi.floatColumn(columnName), ((Number) value).floatValue());
	} else if (typeInfo == BasicTypeInfo.BOOLEAN_TYPE_INFO) {
		return new Tuple2<>(FilterApi.booleanColumn(columnName), (Boolean) value);
	} else if (typeInfo == BasicTypeInfo.DOUBLE_TYPE_INFO) {
		return new Tuple2<>(FilterApi.doubleColumn(columnName), ((Number) value).doubleValue());
	} else if (typeInfo == BasicTypeInfo.STRING_TYPE_INFO) {
		return new Tuple2<>(FilterApi.binaryColumn(columnName), Binary.fromString((String) value));
	} else {
		// unsupported type
		return null;
	}
}
 
Example #21
Source File: ParquetTableSource.java    From flink with Apache License 2.0 5 votes vote down vote up
/**
 * Translates a {@code LessThanOrEqual} expression to a Parquet {@code ltEq}
 * predicate for the extracted column/literal pair; unsupported column types
 * yield {@code null}.
 */
@Nullable
private FilterPredicate lessThanOrEqual(Expression exp, Tuple2<Column, Comparable> columnPair) {
	Preconditions.checkArgument(exp instanceof LessThanOrEqual, "exp has to be LessThanOrEqual");

	Column col = columnPair.f0;
	Comparable value = columnPair.f1;

	if (col instanceof IntColumn) {
		return FilterApi.ltEq((IntColumn) col, (Integer) value);
	}
	if (col instanceof LongColumn) {
		return FilterApi.ltEq((LongColumn) col, (Long) value);
	}
	if (col instanceof DoubleColumn) {
		return FilterApi.ltEq((DoubleColumn) col, (Double) value);
	}
	if (col instanceof FloatColumn) {
		return FilterApi.ltEq((FloatColumn) col, (Float) value);
	}

	// no push-down possible for this column type
	return null;
}
 
Example #22
Source File: ParquetTableSource.java    From flink with Apache License 2.0 5 votes vote down vote up
/**
 * Translates a {@code GreaterThanOrEqual} expression to a Parquet {@code gtEq}
 * predicate for the extracted column/literal pair; unsupported column types
 * yield {@code null}.
 */
@Nullable
private FilterPredicate greaterThanOrEqual(Expression exp, Tuple2<Column, Comparable> columnPair) {
	Preconditions.checkArgument(exp instanceof GreaterThanOrEqual, "exp has to be GreaterThanOrEqual");

	Column col = columnPair.f0;
	Comparable value = columnPair.f1;

	if (col instanceof IntColumn) {
		return FilterApi.gtEq((IntColumn) col, (Integer) value);
	}
	if (col instanceof LongColumn) {
		return FilterApi.gtEq((LongColumn) col, (Long) value);
	}
	if (col instanceof DoubleColumn) {
		return FilterApi.gtEq((DoubleColumn) col, (Double) value);
	}
	if (col instanceof FloatColumn) {
		return FilterApi.gtEq((FloatColumn) col, (Float) value);
	}

	// no push-down possible for this column type
	return null;
}
 
Example #23
Source File: ParquetTableSource.java    From flink with Apache License 2.0 5 votes vote down vote up
@Override
public TableSource<Row> applyPredicate(List<Expression> predicates) {

	// try to convert Flink filter expressions to Parquet FilterPredicates
	List<FilterPredicate> convertedPredicates = new ArrayList<>(predicates.size());
	List<Expression> unsupportedExpressions = new ArrayList<>(predicates.size());

	for (Expression toConvert : predicates) {
		FilterPredicate convertedPredicate = toParquetPredicate(toConvert);
		if (convertedPredicate != null) {
			convertedPredicates.add(convertedPredicate);
		} else {
			unsupportedExpressions.add(toConvert);
		}
	}

	// update list of Flink expressions to unsupported expressions
	predicates.clear();
	predicates.addAll(unsupportedExpressions);

	// construct single Parquet FilterPredicate
	FilterPredicate parquetPredicate = null;
	if (!convertedPredicates.isEmpty()) {
		// concat converted predicates with AND
		parquetPredicate = convertedPredicates.get(0);

		for (FilterPredicate converted : convertedPredicates.subList(1, convertedPredicates.size())) {
			parquetPredicate = FilterApi.and(parquetPredicate, converted);
		}
	}

	// create and return a new ParquetTableSource with Parquet FilterPredicate
	return new ParquetTableSource(path, parquetSchema, this.parquetConfig, recursiveEnumeration, selectedFields, parquetPredicate);
}
 
Example #24
Source File: ParquetFilters.java    From iceberg with Apache License 2.0 4 votes vote down vote up
/**
 * Converts a bound Iceberg predicate into an equivalent Parquet FilterPredicate.
 *
 * <p>Only reference terms are supported; the referenced field is resolved to a
 * Parquet column path via the schema's id-to-alias mapping. Unary predicates
 * (is-null / not-null) carry no literal, so {@code lit} stays null and the
 * downstream {@code pred(...)} helpers translate the null into eq/notEq null.
 *
 * @throws UnsupportedOperationException for non-reference terms, non-unary
 *     non-literal predicates, and type/op combinations with no Parquet mapping
 */
@Override
public <T> FilterPredicate predicate(BoundPredicate<T> pred) {
  if (!(pred.term() instanceof BoundReference)) {
    throw new UnsupportedOperationException("Cannot convert non-reference to Parquet filter: " + pred.term());
  }

  Operation op = pred.op();
  BoundReference<T> ref = (BoundReference<T>) pred.term();
  String path = schema.idToAlias(ref.fieldId());
  // unary predicates have no literal; literal predicates supply one
  Literal<T> lit;
  if (pred.isUnaryPredicate()) {
    lit = null;
  } else if (pred.isLiteralPredicate()) {
    lit = pred.asLiteralPredicate().literal();
  } else {
    throw new UnsupportedOperationException("Cannot convert to Parquet filter: " + pred);
  }

  switch (ref.type().typeId()) {
    case BOOLEAN:
      // booleans only support (in)equality; any other op breaks out to the
      // final throw below
      Operators.BooleanColumn col = FilterApi.booleanColumn(path);
      switch (op) {
        case EQ:
          return FilterApi.eq(col, getParquetPrimitive(lit));
        case NOT_EQ:
          return FilterApi.notEq(col, getParquetPrimitive(lit));
      }
      break;
    // each Iceberg type is mapped onto the Parquet column type it is stored as
    case INTEGER:
    case DATE:
      return pred(op, FilterApi.intColumn(path), getParquetPrimitive(lit));
    case LONG:
    case TIME:
    case TIMESTAMP:
      return pred(op, FilterApi.longColumn(path), getParquetPrimitive(lit));
    case FLOAT:
      return pred(op, FilterApi.floatColumn(path), getParquetPrimitive(lit));
    case DOUBLE:
      return pred(op, FilterApi.doubleColumn(path), getParquetPrimitive(lit));
    case STRING:
    case UUID:
    case FIXED:
    case BINARY:
    case DECIMAL:
      return pred(op, FilterApi.binaryColumn(path), getParquetPrimitive(lit));
  }

  throw new UnsupportedOperationException("Cannot convert to Parquet filter: " + pred);
}
 
Example #25
Source File: ParquetFilters.java    From iceberg with Apache License 2.0 4 votes vote down vote up
/**
 * Converts a bound predicate into an equivalent Parquet FilterPredicate by
 * resolving the referenced field to its Parquet column path and dispatching on
 * the field's type.
 *
 * @throws UnsupportedOperationException for type/op combinations with no
 *     Parquet mapping
 */
@Override
public <T> FilterPredicate predicate(BoundPredicate<T> pred) {
  Operation op = pred.op();
  BoundReference<T> ref = pred.ref();
  Literal<T> lit = pred.literal();
  String path = schema.idToAlias(ref.fieldId());

  switch (ref.type().typeId()) {
    case BOOLEAN:
      // reuse the already-resolved path instead of resolving the alias again
      Operators.BooleanColumn col = FilterApi.booleanColumn(path);
      switch (op) {
        case EQ:
          return FilterApi.eq(col, getParquetPrimitive(lit));
        case NOT_EQ:
          // bug fix: NOT_EQ previously mapped to FilterApi.eq, silently
          // inverting the predicate for boolean columns
          return FilterApi.notEq(col, getParquetPrimitive(lit));
      }
      // bug fix: without this break, an unsupported boolean op fell through
      // into the INTEGER case instead of reaching the throw below
      break;
    // each type is mapped onto the Parquet column type it is stored as
    case INTEGER:
    case DATE:
      return pred(op, FilterApi.intColumn(path), getParquetPrimitive(lit));
    case LONG:
    case TIME:
    case TIMESTAMP:
      return pred(op, FilterApi.longColumn(path), getParquetPrimitive(lit));
    case FLOAT:
      return pred(op, FilterApi.floatColumn(path), getParquetPrimitive(lit));
    case DOUBLE:
      return pred(op, FilterApi.doubleColumn(path), getParquetPrimitive(lit));
    case STRING:
    case UUID:
    case FIXED:
    case BINARY:
    case DECIMAL:
      return pred(op, FilterApi.binaryColumn(path), getParquetPrimitive(lit));
  }

  throw new UnsupportedOperationException("Cannot convert to Parquet filter: " + pred);
}
 
Example #26
Source File: ParquetTableSourceTest.java    From flink with Apache License 2.0 4 votes vote down vote up
/**
 * Verifies predicate push-down: supported filter expressions are compiled into
 * the Parquet input format's FilterPredicate, while unsupported and invalid
 * expressions are handed back to the planner unchanged.
 */
@Test
public void testFieldsFilter() throws Exception {
	ParquetTableSource parquetTableSource = createNestedTestParquetTableSource(testPath);

	// expressions for supported predicates
	Expression exp1 = new GreaterThan(
		new PlannerResolvedFieldReference("foo", Types.LONG),
		new Literal(100L, Types.LONG));
	Expression exp2 = new EqualTo(
		new Literal(100L, Types.LONG),
		new PlannerResolvedFieldReference("bar.spam", Types.LONG));

	// unsupported predicate (nested-array access cannot be pushed down)
	Expression unsupported = new EqualTo(
		new GetCompositeField(
			new ItemAt(
				new PlannerResolvedFieldReference(
					"nestedArray",
					ObjectArrayTypeInfo.getInfoFor(
						Types.ROW_NAMED(new String[] {"type", "name"}, Types.STRING, Types.STRING))),
					new Literal(1, Types.INT)),
					"type"),
		new Literal("test", Types.STRING));
	// invalid predicate
	Expression invalidPred = new EqualTo(
		new PlannerResolvedFieldReference("nonField", Types.LONG),
		// some invalid, non-serializable, literal (here an object of this test class)
		new Literal(new ParquetTableSourceTest(), Types.LONG)
	);

	List<Expression> exps = new ArrayList<>();
	exps.add(exp1);
	exps.add(exp2);
	exps.add(unsupported);
	exps.add(invalidPred);

	// apply predicates on the TableSource
	ParquetTableSource filtered = (ParquetTableSource) parquetTableSource.applyPredicate(exps);

	// ensure copy is returned
	assertNotSame(parquetTableSource, filtered);

	// ensure table schema is identical
	assertEquals(parquetTableSource.getTableSchema(), filtered.getTableSchema());

	// ensure return type is identical
	assertEquals(NESTED_ROW_TYPE, filtered.getReturnType());

	// ensure source description is not the same
	assertNotEquals(parquetTableSource.explainSource(), filtered.explainSource());

	// check that pushdown was recorded
	assertTrue(filtered.isFilterPushedDown());
	assertFalse(parquetTableSource.isFilterPushedDown());

	// ensure that supported predicates were removed from list of offered expressions
	assertEquals(2, exps.size());
	assertTrue(exps.contains(unsupported));
	assertTrue(exps.contains(invalidPred));

	// ensure ParquetInputFormat is correctly configured with filter
	DataSet<Row> data = filtered.getDataSet(ExecutionEnvironment.createLocalEnvironment());
	InputFormat<Row, ?> inputFormat = ((DataSource<Row>) data).getInputFormat();
	assertTrue(inputFormat instanceof ParquetRowInputFormat);
	ParquetRowInputFormat parquetIF = (ParquetRowInputFormat) inputFormat;

	// expected predicate: the two supported expressions ANDed together
	FilterPredicate a = FilterApi.gt(FilterApi.longColumn("foo"), 100L);
	FilterPredicate b = FilterApi.eq(FilterApi.longColumn("bar.spam"), 100L);
	FilterPredicate expected = FilterApi.and(a, b);
	// actual predicate
	FilterPredicate predicate = parquetIF.getPredicate();
	// check predicate
	assertEquals(expected, predicate);
}
 
Example #27
Source File: ParquetTableSource.java    From flink with Apache License 2.0 4 votes vote down vote up
/**
 * Extracts the Parquet filter column and the comparison literal from a binary
 * comparison expression, resolving the column's type against the Parquet
 * schema rather than the literal's Flink type.
 *
 * @param comp the comparison to decompose
 * @return the (column, literal) pair, or {@code null} if the column is not in
 *     the schema, the literal is not comparable, or the type is unsupported
 */
@Nullable
private Tuple2<Column, Comparable> extractColumnAndLiteral(BinaryComparison comp) {
	String columnName = getColumnName(comp);
	ColumnPath columnPath = ColumnPath.fromDotString(columnName);
	TypeInformation<?> typeInfo = null;
	try {
		// look the column up in the Parquet schema; unknown fields cannot be
		// pushed down and are logged + skipped instead of failing the query
		Type type = parquetSchema.getType(columnPath.toArray());
		typeInfo = ParquetSchemaConverter.convertParquetTypeToTypeInfo(type);
	} catch (InvalidRecordException e) {
		LOG.error("Pushed predicate on undefined field name {} in schema", columnName);
		return null;
	}

	// fetch literal and ensure it is comparable
	Object value = getLiteral(comp);
	// validate that literal is comparable
	if (!(value instanceof Comparable)) {
		LOG.warn("Encountered a non-comparable literal of type {}." +
			"Cannot push predicate [{}] into ParquetTablesource." +
			"This is a bug and should be reported.", value.getClass().getCanonicalName(), comp);
		return null;
	}

	// widen numeric literals via Number so Byte/Short literals do not throw
	// ClassCastException on a direct boxed cast
	if (typeInfo == BasicTypeInfo.BYTE_TYPE_INFO ||
		typeInfo == BasicTypeInfo.SHORT_TYPE_INFO ||
		typeInfo == BasicTypeInfo.INT_TYPE_INFO) {
		return new Tuple2<>(FilterApi.intColumn(columnName), ((Number) value).intValue());
	} else if (typeInfo == BasicTypeInfo.LONG_TYPE_INFO) {
		return new Tuple2<>(FilterApi.longColumn(columnName), ((Number) value).longValue());
	} else if (typeInfo == BasicTypeInfo.FLOAT_TYPE_INFO) {
		return new Tuple2<>(FilterApi.floatColumn(columnName), ((Number) value).floatValue());
	} else if (typeInfo == BasicTypeInfo.BOOLEAN_TYPE_INFO) {
		return new Tuple2<>(FilterApi.booleanColumn(columnName), (Boolean) value);
	} else if (typeInfo == BasicTypeInfo.DOUBLE_TYPE_INFO) {
		return new Tuple2<>(FilterApi.doubleColumn(columnName), ((Number) value).doubleValue());
	} else if (typeInfo == BasicTypeInfo.STRING_TYPE_INFO) {
		return new Tuple2<>(FilterApi.binaryColumn(columnName), Binary.fromString((String) value));
	} else {
		// unsupported type
		return null;
	}
}
 
Example #28
Source File: ParquetTableSourceTest.java    From flink with Apache License 2.0 4 votes vote down vote up
/**
 * Verifies predicate push-down: supported filter expressions end up in the
 * ParquetRowInputFormat as an ANDed FilterPredicate, while unsupported and
 * invalid expressions remain in the planner's expression list.
 */
@Test
public void testFieldsFilter() throws Exception {
	ParquetTableSource parquetTableSource = createNestedTestParquetTableSource(testPath);

	// expressions for supported predicates
	Expression exp1 = new GreaterThan(
		new PlannerResolvedFieldReference("foo", Types.LONG),
		new Literal(100L, Types.LONG));
	Expression exp2 = new EqualTo(
		new Literal(100L, Types.LONG),
		new PlannerResolvedFieldReference("bar.spam", Types.LONG));

	// unsupported predicate (nested-array access cannot be pushed down)
	Expression unsupported = new EqualTo(
		new GetCompositeField(
			new ItemAt(
				new PlannerResolvedFieldReference(
					"nestedArray",
					ObjectArrayTypeInfo.getInfoFor(
						Types.ROW_NAMED(new String[] {"type", "name"}, Types.STRING, Types.STRING))),
					new Literal(1, Types.INT)),
					"type"),
		new Literal("test", Types.STRING));
	// invalid predicate
	Expression invalidPred = new EqualTo(
		new PlannerResolvedFieldReference("nonField", Types.LONG),
		// some invalid, non-serializable, literal (here an object of this test class)
		new Literal(new ParquetTableSourceTest(), Types.LONG)
	);

	List<Expression> exps = new ArrayList<>();
	exps.add(exp1);
	exps.add(exp2);
	exps.add(unsupported);
	exps.add(invalidPred);

	// apply predicates on the TableSource
	ParquetTableSource filtered = (ParquetTableSource) parquetTableSource.applyPredicate(exps);

	// ensure copy is returned
	assertNotSame(parquetTableSource, filtered);

	// ensure table schema is identical
	assertEquals(parquetTableSource.getTableSchema(), filtered.getTableSchema());

	// ensure return type is identical
	assertEquals(NESTED_ROW_TYPE, filtered.getReturnType());

	// ensure source description is not the same
	assertNotEquals(parquetTableSource.explainSource(), filtered.explainSource());

	// check that pushdown was recorded
	assertTrue(filtered.isFilterPushedDown());
	assertFalse(parquetTableSource.isFilterPushedDown());

	// ensure that supported predicates were removed from list of offered expressions
	assertEquals(2, exps.size());
	assertTrue(exps.contains(unsupported));
	assertTrue(exps.contains(invalidPred));

	// ensure ParquetInputFormat is correctly configured with filter
	DataSet<Row> data = filtered.getDataSet(ExecutionEnvironment.createLocalEnvironment());
	InputFormat<Row, ?> inputFormat = ((DataSource<Row>) data).getInputFormat();
	assertTrue(inputFormat instanceof ParquetRowInputFormat);
	ParquetRowInputFormat parquetIF = (ParquetRowInputFormat) inputFormat;

	// expected predicate: the two supported expressions ANDed together
	FilterPredicate a = FilterApi.gt(FilterApi.longColumn("foo"), 100L);
	FilterPredicate b = FilterApi.eq(FilterApi.longColumn("bar.spam"), 100L);
	FilterPredicate expected = FilterApi.and(a, b);
	// actual predicate
	FilterPredicate predicate = parquetIF.getPredicate();
	// check predicate
	assertEquals(expected, predicate);
}
 
Example #29
Source File: TestInputOutputFormat.java    From parquet-mr with Apache License 2.0 4 votes vote down vote up
@Test
public void testReadWriteFilter() throws IOException, ClassNotFoundException, InterruptedException {
  Configuration conf = new Configuration();

  // this filter predicate should keep some records but not all (first 500 characters)
  // "line" is actually position in the file...
  ParquetInputFormat.setFilterPredicate(conf, FilterApi.lt(FilterApi.intColumn("line"), 500));
  final String fpString = conf.get(ParquetInputFormat.FILTER_PREDICATE);

  runMapReduceJob(CompressionCodecName.UNCOMPRESSED, new HashMap<String, String>() {{
    put("parquet.task.side.metadata", "true");
    put(ParquetInputFormat.FILTER_PREDICATE, fpString);
  }});

  File file = new File(inputPath.toString());
  List<String> expected = Files.readAllLines(file.toPath(), StandardCharsets.UTF_8);

  // grab the lines that contain the first 500 characters (including the rest of the line past 500 characters)
  int size = 0;
  Iterator<String> iter = expected.iterator();
  while(iter.hasNext()) {
    String next = iter.next();

    if (size < 500) {
      size += next.length();
      continue;
    }

    iter.remove();
  }

  // put the output back into it's original format (remove the character counts / tabs)
  File file2 = new File(outputPath.toString(), "part-m-00000");
  List<String> found = Files.readAllLines(file2.toPath(), StandardCharsets.UTF_8);
  StringBuilder sbFound = new StringBuilder();
  for (String line : found) {
    sbFound.append(line.split("\t", -1)[1]);
    sbFound.append("\n");
  }

  sbFound.deleteCharAt(sbFound.length() - 1);

  assertEquals(String.join("\n", expected), sbFound.toString());
}