Java Code Examples for org.apache.spark.sql.sources.Filter

The following examples show how to use org.apache.spark.sql.sources.Filter. These examples are extracted from open source projects.
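Most of the examples revolve around Spark's filter pushdown contract: Spark hands the data source an array of Filter objects, the source keeps track of the ones it can evaluate itself, and returns the ones Spark must still apply on top of the scan. The standalone sketch below illustrates that contract; the class name FilterPushdownSketch is hypothetical, and accepting only IsNotNull filters is purely for illustration.

import java.util.ArrayList;
import java.util.List;
import org.apache.spark.sql.sources.Filter;
import org.apache.spark.sql.sources.IsNotNull;

// Hypothetical sketch mirroring the SupportsPushDownFilters contract:
// remember the filters this source will evaluate, return the rest to Spark.
public class FilterPushdownSketch {
  private Filter[] pushed = new Filter[0];

  // Mirrors SupportsPushDownFilters#pushFilters: returns the unhandled filters.
  public Filter[] pushFilters(Filter[] filters) {
    List<Filter> accepted = new ArrayList<>();
    List<Filter> rejected = new ArrayList<>();
    for (Filter filter : filters) {
      // This toy source only evaluates IsNotNull itself.
      if (filter instanceof IsNotNull) {
        accepted.add(filter);
      } else {
        rejected.add(filter);
      }
    }
    this.pushed = accepted.toArray(new Filter[0]);
    return rejected.toArray(new Filter[0]);
  }

  // Mirrors SupportsPushDownFilters#pushedFilters: reports what was pushed.
  public Filter[] pushedFilters() {
    return pushed;
  }
}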
Example 1
@Test public void testValidFiltersForAvro() {
    ImmutableList<Filter> validFilters = ImmutableList.of(
            EqualTo.apply("foo", "manatee"),
            GreaterThan.apply("foo", "aardvark"),
            GreaterThanOrEqual.apply("bar", 2),
            LessThan.apply("foo", "zebra"),
            LessThanOrEqual.apply("bar", 1),
            In.apply("foo", new Object[] {1, 2, 3}),
            IsNull.apply("foo"),
            IsNotNull.apply("foo"),
            And.apply(IsNull.apply("foo"), IsNotNull.apply("bar")),
            Or.apply(IsNull.apply("foo"), IsNotNull.apply("foo")),
            Not.apply(IsNull.apply("foo")),
            StringStartsWith.apply("foo", "abc"),
            StringEndsWith.apply("foo", "def"),
            StringContains.apply("foo", "abcdef")
    );
    validFilters.forEach(f -> assertThat(SparkFilterUtils.unhandledFilters(AVRO, f)).isEmpty());
}
 
Example 2
@Test public void testValidFiltersForArrow() {
    ImmutableList<Filter> validFilters = ImmutableList.of(
            EqualTo.apply("foo", "manatee"),
            GreaterThan.apply("foo", "aardvark"),
            GreaterThanOrEqual.apply("bar", 2),
            LessThan.apply("foo", "zebra"),
            LessThanOrEqual.apply("bar", 1),
            In.apply("foo", new Object[] {1, 2, 3}),
            IsNull.apply("foo"),
            IsNotNull.apply("foo"),
            And.apply(IsNull.apply("foo"), IsNotNull.apply("bar")),
            Not.apply(IsNull.apply("foo")),
            StringStartsWith.apply("foo", "abc"),
            StringEndsWith.apply("foo", "def"),
            StringContains.apply("foo", "abcdef")
    );
    validFilters.forEach(f -> assertThat(SparkFilterUtils.unhandledFilters(ARROW, f)).isEmpty());
}
 
Example 3
@Override
public Filter[] pushFilters(Filter[] filters) {
  List<Filter> notPushed = Lists.newArrayList();
  List<Filter> pushed = Lists.newArrayList();
  for (Filter filter : filters) {
    boolean isPushed = canBePushed(filter);
    if (isPushed) {
      pushed.add(filter);
    } else {
      notPushed.add(filter);
    }
  }
  this.pushed = pushed.toArray(new Filter[0]);
  if (!pushed.isEmpty()) {
    String whereClause = generateWhereClause(pushed);
    mergeWhereDescriptors(whereClause);
    try (FlightClient client = clientFactory.apply()) {
      info = client.getSchema(descriptor);
    } catch (InterruptedException e) {
      throw new RuntimeException(e);
    }
  }
  return notPushed.toArray(new Filter[0]);
}
 
Example 4
private String generateWhereClause(List<Filter> pushed) {
  List<String> filterStr = Lists.newArrayList();
  for (Filter filter : pushed) {
    if (filter instanceof IsNotNull) {
      filterStr.add(String.format("isnotnull(\"%s\")", ((IsNotNull) filter).attribute()));
    } else if (filter instanceof EqualTo) {
      filterStr.add(String.format("\"%s\" = %s", ((EqualTo) filter).attribute(), valueToString(((EqualTo) filter).value())));
    } else if (filter instanceof GreaterThan) {
      filterStr.add(String.format("\"%s\" > %s", ((GreaterThan) filter).attribute(), valueToString(((GreaterThan) filter).value())));
    } else if (filter instanceof GreaterThanOrEqual) {
      filterStr.add(String.format("\"%s\" <= %s", ((GreaterThanOrEqual) filter).attribute(), valueToString(((GreaterThanOrEqual) filter).value())));
    } else if (filter instanceof LessThan) {
      filterStr.add(String.format("\"%s\" < %s", ((LessThan) filter).attribute(), valueToString(((LessThan) filter).value())));
    } else if (filter instanceof LessThanOrEqual) {
      filterStr.add(String.format("\"%s\" <= %s", ((LessThanOrEqual) filter).attribute(), valueToString(((LessThanOrEqual) filter).value())));
    }
    // TODO: fill out the rest of the Filter types (a possible continuation is sketched after this example)
  }
  return WHERE_JOINER.join(filterStr);
}
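The TODO above leaves most Filter subclasses unconverted. A possible continuation is sketched below; it assumes the same valueToString helper used above is in scope, the filterToSql name is hypothetical, and the quoting/SQL dialect is only illustrative.

// Hypothetical helper covering a few more Filter types (isnull(), IN lists, LIKE prefixes).
private String filterToSql(Filter filter) {
  if (filter instanceof IsNull) {
    return String.format("isnull(\"%s\")", ((IsNull) filter).attribute());
  } else if (filter instanceof In) {
    In in = (In) filter;
    List<String> values = new ArrayList<>();
    for (Object value : in.values()) {
      values.add(valueToString(value));
    }
    return String.format("\"%s\" in (%s)", in.attribute(), String.join(", ", values));
  } else if (filter instanceof StringStartsWith) {
    StringStartsWith prefix = (StringStartsWith) filter;
    return String.format("\"%s\" like '%s%%'", prefix.attribute(), prefix.value());
  }
  return null; // unsupported filter type; caller should skip or reject it
}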
 
Example 5
private boolean canBePushed(Filter filter) {
  if (filter instanceof IsNotNull) {
    return true;
  } else if (filter instanceof EqualTo) {
    return true;
  }
  if (filter instanceof GreaterThan) {
    return true;
  }
  if (filter instanceof GreaterThanOrEqual) {
    return true;
  }
  if (filter instanceof LessThan) {
    return true;
  }
  if (filter instanceof LessThanOrEqual) {
    return true;
  }
  LOGGER.error("Cant push filter of type " + filter.toString());
  return false;
}
 
Example 6
Source Project: iceberg   Source File: Reader.java    License: Apache License 2.0
@Override
public Filter[] pushFilters(Filter[] filters) {
  this.tasks = null; // invalidate cached tasks, if present

  List<Expression> expressions = Lists.newArrayListWithExpectedSize(filters.length);
  List<Filter> pushed = Lists.newArrayListWithExpectedSize(filters.length);

  for (Filter filter : filters) {
    Expression expr = SparkFilters.convert(filter);
    if (expr != null) {
      expressions.add(expr);
      pushed.add(filter);
    }
  }

  this.filterExpressions = expressions;
  this.pushedFilters = pushed.toArray(new Filter[0]);

  // invalidate the schema that will be projected
  this.schema = null;
  this.type = null;

  // Spark doesn't support residuals per task, so return all filters
  // to get Spark to handle record-level filtering
  return filters;
}
 
Example 7
Source Project: iceberg   Source File: SparkScanBuilder.java    License: Apache License 2.0
@Override
public Filter[] pushFilters(Filter[] filters) {
  List<Expression> expressions = Lists.newArrayListWithExpectedSize(filters.length);
  List<Filter> pushed = Lists.newArrayListWithExpectedSize(filters.length);

  for (Filter filter : filters) {
    Expression expr = SparkFilters.convert(filter);
    if (expr != null) {
      try {
        Binder.bind(table.schema().asStruct(), expr, caseSensitive);
        expressions.add(expr);
        pushed.add(filter);
      } catch (ValidationException e) {
        // binding to the table schema failed, so this expression cannot be pushed down
      }
    }
  }

  this.filterExpressions = expressions;
  this.pushedFilters = pushed.toArray(new Filter[0]);

  // Spark doesn't support residuals per task, so return all filters
  // to get Spark to handle record-level filtering
  return filters;
}
 
Example 8
Source Project: iceberg   Source File: SparkTable.java    License: Apache License 2.0
@Override
public void deleteWhere(Filter[] filters) {
  Expression deleteExpr = SparkFilters.convert(filters);
  DeleteFiles delete = icebergTable.newDelete()
      .set("spark.app.id", sparkSession().sparkContext().applicationId())
      .deleteFromRowFilter(deleteExpr);

  String genieId = sparkSession().sparkContext().hadoopConfiguration().get("genie.job.id");
  if (genieId != null) {
    delete.set("genie-id", genieId);
  }

  try {
    delete.commit();
  } catch (ValidationException e) {
    throw new IllegalArgumentException("Failed to cleanly delete data files matching: " + deleteExpr, e);
  }
}
 
Example 9
Source Project: spark-llap   Source File: HiveWarehouseDataSourceReader.java    License: Apache License 2.0
String getQueryString(String[] requiredColumns, Filter[] filters) throws Exception {
  String selectCols = "count(*)";
  if (requiredColumns.length > 0) {
    selectCols = projections(requiredColumns);
  }
  String baseQuery;
  if (getQueryType() == StatementType.FULL_TABLE_SCAN) {
    baseQuery = selectStar(options.get("table"));
  } else {
    baseQuery = options.get("query");
  }

  Seq<Filter> filterSeq = asScalaBuffer(Arrays.asList(filters)).seq();
  String whereClause = buildWhereClause(baseSchema, filterSeq);
  return selectProjectAliasFilter(selectCols, baseQuery, randomAlias(), whereClause);
}
 
Example 10
@Override
public Filter[] pushFilters(Filter[] filters) {
    List<Filter> handledFilters = new ArrayList<>();
    List<Filter> unhandledFilters = new ArrayList<>();
    for (Filter filter : filters) {
        if (SparkFilterUtils.isHandled(filter, readSessionCreatorConfig.getReadDataFormat())) {
            handledFilters.add(filter);
        } else {
            unhandledFilters.add(filter);
        }
    }
    pushedFilters = handledFilters.stream().toArray(Filter[]::new);
    return unhandledFilters.stream().toArray(Filter[]::new);
}
 
Example 11
@Test public void testInvalidFiltersWithAvro() {
    Filter valid1 = EqualTo.apply("foo", "bar");
    Filter valid2 = EqualTo.apply("bar", 1);
    Filter invalid1 = EqualNullSafe.apply("foo", "bar");
    Filter invalid2 = And.apply(EqualTo.apply("foo", "bar"), Not.apply(EqualNullSafe.apply("bar", 1)));
    Iterable<Filter> unhandled = SparkFilterUtils.unhandledFilters(AVRO, valid1, valid2, invalid1, invalid2);
    assertThat(unhandled).containsExactly(invalid1, invalid2);
}
 
Example 12
@Test public void testInvalidFiltersWithArrow() {
    Filter valid1 = EqualTo.apply("foo", "bar");
    Filter valid2 = EqualTo.apply("bar", 1);
    Filter invalid1 = EqualNullSafe.apply("foo", "bar");
    Filter invalid2 = And.apply(EqualTo.apply("foo", "bar"), Not.apply(EqualNullSafe.apply("bar", 1)));
    Filter invalid3 = Or.apply(IsNull.apply("foo"), IsNotNull.apply("foo"));
    Iterable<Filter> unhandled = SparkFilterUtils.unhandledFilters(ARROW, valid1, valid2, invalid1, invalid2, invalid3);
    assertThat(unhandled).containsExactly(invalid1, invalid2, invalid3);
}
 
Example 13
private void checkFilters(
        DataFormat readDataFormat,
        String resultWithoutFilters,
        String resultWithFilters,
        Optional<String> configFilter,
        Filter... filters) {
    String result1 = SparkFilterUtils.getCompiledFilter(readDataFormat, configFilter);
    assertThat(result1).isEqualTo(resultWithoutFilters);
    String result2 = SparkFilterUtils.getCompiledFilter(readDataFormat, configFilter, filters);
    assertThat(result2).isEqualTo(resultWithFilters);
}
 
Example 14
Source Project: iceberg   Source File: SparkWriteBuilder.java    License: Apache License 2.0
@Override
public WriteBuilder overwrite(Filter[] filters) {
  this.overwriteExpr = SparkFilters.convert(filters);
  if (overwriteExpr == Expressions.alwaysTrue() && "dynamic".equals(overwriteMode)) {
    // use the write option to override truncating the table. use dynamic overwrite instead.
    this.overwriteDynamic = true;
  } else {
    Preconditions.checkState(!overwriteDynamic, "Cannot overwrite dynamically and by filter: %s", overwriteExpr);
    this.overwriteByFilter = true;
  }
  return this;
}
 
Example 15
Source Project: iceberg   Source File: SparkFilters.java    License: Apache License 2.0
public static Expression convert(Filter[] filters) {
  Expression expression = Expressions.alwaysTrue();
  for (Filter filter : filters) {
    Expression converted = convert(filter);
    Preconditions.checkArgument(converted != null, "Cannot convert filter to Iceberg: %s", filter);
    expression = Expressions.and(expression, converted);
  }
  return expression;
}
 
Example 16
Source Project: indexr   Source File: SparkFilter.java    License: Apache License 2.0
public static RCOperator transform(List<ColumnSchema> schemas, List<Filter> filters) {
    List<Filter> validFilters = filters.stream().filter(f -> !(f instanceof IsNotNull)).collect(Collectors.toList());
    if (validFilters.size() == 0) {
        return null;
    }
    RCOperator op = new io.indexr.segment.rc.And(Trick.mapToList(validFilters, f -> transform(schemas, f)));
    op = op.optimize();
    return op;
}
 
Example 17
Source Project: spark-llap   Source File: HiveWarehouseDataSourceReader.java    License: Apache License 2.0
@Override public Filter[] pushFilters(Filter[] filters) {
  pushedFilters = Arrays.stream(filters).
      filter((filter) -> FilterPushdown.buildFilterExpression(baseSchema, filter).isDefined()).
      toArray(Filter[]::new);

  return Arrays.stream(filters).
      filter((filter) -> !FilterPushdown.buildFilterExpression(baseSchema, filter).isDefined()).
      toArray(Filter[]::new);
}
 
Example 18
@Override
public Filter[] pushedFilters() {
    return pushedFilters;
}
 
Example 19
@Test public void testMultipleValidFiltersAreHandled() {
    Filter valid1 = EqualTo.apply("foo", "bar");
    Filter valid2 = EqualTo.apply("bar", 1);
    assertThat(SparkFilterUtils.unhandledFilters(AVRO, valid1, valid2)).isEmpty();
}
 
Example 20
@Override
public Filter[] pushedFilters() {
  return pushed;
}
 
Example 21
Source Project: iceberg   Source File: Reader.java    License: Apache License 2.0
@Override
public Filter[] pushedFilters() {
  return pushedFilters;
}
 
Example 22
Source Project: iceberg   Source File: TestFilteredScan.java    License: Apache License 2.0
private void pushFilters(DataSourceReader reader, Filter... filters) {
  Assert.assertTrue(reader instanceof SupportsPushDownFilters);
  SupportsPushDownFilters filterable = (SupportsPushDownFilters) reader;
  filterable.pushFilters(filters);
}
 
Example 23
Source Project: iceberg   Source File: SparkScanBuilder.java    License: Apache License 2.0
@Override
public Filter[] pushedFilters() {
  return pushedFilters;
}
 
Example 24
Source Project: iceberg   Source File: TestFilteredScan.java    License: Apache License 2.0
private void pushFilters(ScanBuilder scan, Filter... filters) {
  Assert.assertTrue(scan instanceof SupportsPushDownFilters);
  SupportsPushDownFilters filterable = (SupportsPushDownFilters) scan;
  filterable.pushFilters(filters);
}
 
Example 25
Source Project: spark-llap   Source File: HiveWarehouseDataSourceReader.java    License: Apache License 2.0
@Override public Filter[] pushedFilters() {
  return pushedFilters;
}