org.apache.parquet.filter2.compat.FilterCompat.Filter Java Examples

The following examples show how to use org.apache.parquet.filter2.compat.FilterCompat.Filter. You can vote up the examples you find useful or vote down the ones you don't, and follow the links above each example to reach the original project or source file. Related API usage is listed in the sidebar.
Example #1
Source File: ParquetReader.java    From tajo with Apache License 2.0 6 votes vote down vote up
/**
 * Creates a reader over the entries listed under {@code file}, as selected by
 * {@code HiddenFileFilter.INSTANCE}.
 *
 * <p>All footers are read up front; their iterator is kept for later use and the
 * row counts of every block are added to {@code totalRowCount}.
 *
 * @param conf the configuration used to resolve the file system and read footers
 * @param file the path whose entries are listed
 * @param readSupport the read support stored on this reader
 * @param filter the record filter; validated with {@code checkNotNull}
 * @throws IOException if listing the path or reading the footers fails
 */
private ParquetReader(Configuration conf,
                      Path file,
                      ReadSupport<T> readSupport,
                      Filter filter) throws IOException {
  this.readSupport = readSupport;
  this.filter = checkNotNull(filter, "filter");
  this.conf = conf;

  FileSystem fileSystem = file.getFileSystem(conf);
  FileStatus[] listedFiles = fileSystem.listStatus(file, HiddenFileFilter.INSTANCE);
  List<FileStatus> statuses = Arrays.asList(listedFiles);
  List<Footer> allFooters =
      ParquetFileReader.readAllFootersInParallelUsingSummaryFiles(conf, statuses, false);
  this.footersIterator = allFooters.iterator();

  // Accumulate the row count of every block across all footers.
  for (Footer currentFooter : allFooters) {
    List<BlockMetaData> rowGroups = currentFooter.getParquetMetadata().getBlocks();
    for (BlockMetaData rowGroup : rowGroups) {
      totalRowCount += rowGroup.getRowCount();
    }
  }
}
 
Example #2
Source File: PhoneBookWriter.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Reads every record that the reader created for {@code f} and {@code filter} returns.
 *
 * <p>The reader is opened in a try-with-resources statement so it is closed both when
 * reading completes and when {@code read()} throws; previously it was never closed.
 *
 * @param f the file to read
 * @param filter the filter handed to the reader
 * @return the records in the order the reader returned them
 * @throws IOException if creating, reading from, or closing the reader fails
 */
public static List<Group> readFile(File f, Filter filter) throws IOException {
  List<Group> users = new ArrayList<Group>();

  try (ParquetReader<Group> reader = createReader(new Path(f.getAbsolutePath()), filter)) {
    // A null record marks the end of the input.
    Group current = reader.read();
    while (current != null) {
      users.add(current);
      current = reader.read();
    }
  }

  return users;
}
 
Example #3
Source File: PhoneBookWriter.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Builds a {@code ParquetReader} over {@code file} that uses a {@code GroupReadSupport},
 * a fresh {@code Configuration} carrying {@code schema}, and the given filter.
 *
 * @param file the path to read
 * @param filter the filter passed to the reader builder
 * @return the built reader
 * @throws IOException if building the reader fails
 */
private static ParquetReader<Group> createReader(Path file, Filter filter) throws IOException {
  Configuration readerConf = new Configuration();
  // The schema is registered on the configuration before the builder receives it.
  GroupWriteSupport.setSchema(schema, readerConf);

  ParquetReader<Group> reader = ParquetReader.builder(new GroupReadSupport(), file)
      .withConf(readerConf)
      .withFilter(filter)
      .build();
  return reader;
}
 
Example #4
Source File: TestColumnIndexFiltering.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Reads users through {@code PhoneBookWriter.readUsers} with the given filter and the
 * given schema set as {@code ReadSupport.PARQUET_READ_SCHEMA}.
 *
 * @param filter the filter passed to the reader builder
 * @param schema the schema whose string form is set as the read schema
 * @param useOtherFiltering value applied to the dictionary, stats and record filter switches
 * @param useColumnIndexFilter value applied to the column-index filter switch
 * @return the users returned by {@code PhoneBookWriter.readUsers}
 * @throws IOException if reading fails
 */
private List<User> readUsersWithProjection(
    Filter filter,
    MessageType schema,
    boolean useOtherFiltering,
    boolean useColumnIndexFilter) throws IOException {
  final String readSchema = schema.toString();
  List<User> users = PhoneBookWriter.readUsers(
      ParquetReader.builder(new GroupReadSupport(), file)
          .withFilter(filter)
          .useDictionaryFilter(useOtherFiltering)
          .useStatsFilter(useOtherFiltering)
          .useRecordFilter(useOtherFiltering)
          .useColumnIndexFilter(useColumnIndexFilter)
          .set(ReadSupport.PARQUET_READ_SCHEMA, readSchema));
  return users;
}
 
Example #5
Source File: TestColumnIndexFiltering.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Reads users through {@code PhoneBookWriter.readUsers} with the given filter and
 * filtering switches.
 *
 * @param filter the filter passed to the reader builder
 * @param useOtherFiltering value applied to the dictionary, stats and record filter switches
 * @param useColumnIndexFilter value applied to the column-index filter switch
 * @return the users returned by {@code PhoneBookWriter.readUsers}
 * @throws IOException if reading fails
 */
private List<User> readUsers(
    Filter filter,
    boolean useOtherFiltering,
    boolean useColumnIndexFilter) throws IOException {
  List<User> users = PhoneBookWriter.readUsers(
      ParquetReader.builder(new GroupReadSupport(), file)
          .withFilter(filter)
          .useDictionaryFilter(useOtherFiltering)
          .useStatsFilter(useOtherFiltering)
          .useRecordFilter(useOtherFiltering)
          .useColumnIndexFilter(useColumnIndexFilter));
  return users;
}
 
Example #6
Source File: ParquetReader.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Delegates to the list-based constructor with a single {@code InputFile} obtained from
 * {@code HadoopInputFile.fromPath(file, conf)} and read options that carry the record filter.
 *
 * @param conf the configuration used for the input file and the read options
 * @param file the path to read
 * @param readSupport the read support passed through to the delegate constructor
 * @param filter the record filter; must not be null
 * @throws IOException if the delegate constructor or input file creation fails
 * @throws NullPointerException if {@code filter} is null
 */
private ParquetReader(Configuration conf,
                      Path file,
                      ReadSupport<T> readSupport,
                      FilterCompat.Filter filter) throws IOException {
  // The delegation must remain the first statement, so everything is built inline.
  this(
      Collections.<InputFile>singletonList(HadoopInputFile.fromPath(file, conf)),
      HadoopReadOptions.builder(conf)
          .withRecordFilter(Objects.requireNonNull(filter, "filter cannot be null"))
          .build(),
      readSupport);
}
 
Example #7
Source File: TestMultipleWriteRead.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Validates {@code file} against {@code data} using a compound filter:
 * {@code id >= 0}, {@code name < "b"} and {@code comment != null}.
 *
 * <p>The same conditions are applied to {@code data} with an in-memory predicate, and
 * the filtered stream is handed to {@code validateFile} as the expected content.
 *
 * @param file the file to validate
 * @param data the records the file is compared against, before filtering
 * @throws IOException if {@code validateFile} fails to read the file
 */
private void validateFileWithComplexFilter(Path file, List<Group> data) throws IOException {
  Binary nameBound = fromString("b");
  Filter filter = FilterCompat.get(
      and(
          gtEq(intColumn("id"), 0),
          and(
              lt(binaryColumn("name"), nameBound),
              notEq(binaryColumn("comment"), null))));
  // In-memory counterpart of the filter above, checked condition by condition.
  Predicate<Group> matchesFilter = group -> {
    if (!(group.getInteger("id", 0) >= 0)) {
      return false;
    }
    if (!(BINARY_COMPARATOR.compare(group.getBinary("name", 0), nameBound) < 0)) {
      return false;
    }
    return group.getFieldRepetitionCount("comment") > 0;
  };
  validateFile(file, filter, data.stream().filter(matchesFilter));
}
 
Example #8
Source File: TestMultipleWriteRead.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Reads {@code file} with the given filter and asserts that each record read matches,
 * by string form, the next element of {@code data}.
 *
 * @param file the file to read
 * @param filter the filter passed to the reader builder
 * @param data the expected records, in order
 * @throws IOException if building, reading or closing the reader fails
 */
private void validateFile(Path file, Filter filter, Stream<Group> data) throws IOException {
  try (ParquetReader<Group> reader = ParquetReader.builder(new GroupReadSupport(), file)
      .withFilter(filter)
      .build()) {
    Iterator<Group> expectedRecords = data.iterator();
    while (expectedRecords.hasNext()) {
      String expected = expectedRecords.next().toString();
      String actual = reader.read().toString();
      assertEquals(expected, actual);
    }
  }
}
 
Example #9
Source File: InternalParquetRecordReader.java    From tajo with Apache License 2.0 4 votes vote down vote up
/**
 * @param readSupport Object which helps reads files of the given type, e.g. Thrift, Avro.
 * @param filter for filtering individual records
 */
public InternalParquetRecordReader(ReadSupport<T> readSupport, Filter filter) {
  this.readSupport = readSupport;
  this.filter = checkNotNull(filter, "filter");
}
 
Example #10
Source File: ThriftParquetReader.java    From parquet-mr with Apache License 2.0 4 votes vote down vote up
/**
 * Sets the filter on this builder.
 *
 * @param filter the filter to use; must not be null
 * @return this builder
 * @throws NullPointerException if {@code filter} is null
 */
public Builder<T> withFilter(Filter filter) {
  final Filter checkedFilter = Objects.requireNonNull(filter, "filter cannot be null");
  this.filter = checkedFilter;
  return this;
}
 
Example #11
Source File: TestColumnIndexFiltering.java    From parquet-mr with Apache License 2.0 4 votes vote down vote up
/**
 * Delegates to the three-argument {@code readUsers}, passing {@code true} as its last argument.
 *
 * @param filter the filter forwarded to the delegate
 * @param useOtherFiltering forwarded unchanged to the delegate
 * @return the users returned by the delegate
 * @throws IOException if the delegate fails to read
 */
private List<User> readUsers(Filter filter, boolean useOtherFiltering) throws IOException {
  final boolean lastArgument = true;
  return readUsers(filter, useOtherFiltering, lastArgument);
}
 
Example #12
Source File: RowGroupFilter.java    From parquet-mr with Apache License 2.0 4 votes vote down vote up
/**
 * Applies {@code filter} to {@code blocks} by having it accept a {@code RowGroupFilter}
 * visitor built from the given levels, blocks and reader.
 *
 * @param levels the filter levels passed to the visitor
 * @param filter the filter to apply; must not be null
 * @param blocks the block metadata passed to the visitor
 * @param reader the file reader passed to the visitor
 * @return the list produced by the visitor
 * @throws NullPointerException if {@code filter} is null
 */
public static List<BlockMetaData> filterRowGroups(
    List<FilterLevel> levels,
    Filter filter,
    List<BlockMetaData> blocks,
    ParquetFileReader reader) {
  Objects.requireNonNull(filter, "filter cannot be null");
  RowGroupFilter visitor = new RowGroupFilter(levels, blocks, reader);
  return filter.accept(visitor);
}
 
Example #13
Source File: ParquetInputFormat.java    From parquet-mr with Apache License 2.0 4 votes vote down vote up
/**
 * Generates input splits for the given footers after filtering each file's row groups
 * with the filter obtained from {@code configuration}.
 *
 * <p>Files whose row groups are all filtered out contribute no splits. A summary of how
 * many row groups were dropped is logged at info level before returning.
 *
 * @param configuration used to obtain the filter and each file's file system
 * @param footers the footers to generate splits for
 * @param maxSplitSize forwarded to {@code generateSplits}
 * @param minSplitSize forwarded to {@code generateSplits}
 * @param readContext supplies the requested schema and read-support metadata
 * @return the splits generated for all footers
 * @throws IOException if file status or block locations cannot be obtained
 */
List<ParquetInputSplit> getSplits(Configuration configuration, List<Footer> footers,
    long maxSplitSize, long minSplitSize, ReadContext readContext)
    throws IOException {
  List<ParquetInputSplit> result = new ArrayList<ParquetInputSplit>();
  Filter rowGroupFilter = ParquetInputFormat.getFilter(configuration);

  long droppedCount = 0;
  long seenCount = 0;

  for (Footer footer : footers) {
    final Path file = footer.getFile();
    LOG.debug("{}", file);
    FileSystem fileSystem = file.getFileSystem(configuration);
    FileStatus status = fileSystem.getFileStatus(file);
    ParquetMetadata metadata = footer.getParquetMetadata();
    List<BlockMetaData> allBlocks = metadata.getBlocks();

    seenCount += allBlocks.size();
    List<BlockMetaData> keptBlocks =
        RowGroupFilter.filterRowGroups(rowGroupFilter, allBlocks, metadata.getFileMetaData().getSchema());
    droppedCount += allBlocks.size() - keptBlocks.size();

    // Only files with at least one surviving row group produce splits.
    if (!keptBlocks.isEmpty()) {
      BlockLocation[] locations = fileSystem.getFileBlockLocations(status, 0, status.getLen());
      List<ParquetInputSplit> fileSplits = generateSplits(
          keptBlocks,
          locations,
          status,
          readContext.getRequestedSchema().toString(),
          readContext.getReadSupportMetadata(),
          minSplitSize,
          maxSplitSize);
      result.addAll(fileSplits);
    }
  }

  boolean nothingDropped = droppedCount <= 0 || seenCount <= 0;
  if (nothingDropped) {
    LOG.info("There were no row groups that could be dropped due to filter predicates");
  } else {
    double droppedFraction = ((double) droppedCount) / seenCount;
    int percentDropped = (int) (droppedFraction * 100);
    LOG.info("Dropping {} row groups that do not pass filter predicate! ({}%)", droppedCount, percentDropped);
  }
  return result;
}
 
Example #14
Source File: ParquetReader.java    From parquet-mr with Apache License 2.0 4 votes vote down vote up
/**
 * Stores the filter on this builder and also sets it as the record filter on
 * {@code optionsBuilder}.
 *
 * @param filter the filter to use; it is not validated here
 * @return this builder
 */
public Builder<T> withFilter(Filter filter) {
  this.filter = filter;
  // Set the same filter on the options builder as well.
  optionsBuilder.withRecordFilter(filter);
  return this;
}
 
Example #15
Source File: InternalParquetRecordReader.java    From parquet-mr with Apache License 2.0 4 votes vote down vote up
/**
 * @param readSupport Object which helps reads files of the given type, e.g. Thrift, Avro.
 * @param filter for filtering individual records
 */
public InternalParquetRecordReader(ReadSupport<T> readSupport, Filter filter) {
  this.readSupport = readSupport;
  this.filter = Objects.requireNonNull(filter, "filter cannot be null");
}
 
Example #16
Source File: MessageColumnIO.java    From parquet-mr with Apache License 2.0 4 votes vote down vote up
/**
 * Creates a record reader over {@code columns}, choosing the implementation by the kind
 * of {@code filter} through its visitor.
 *
 * <p>When there are no leaves an {@code EmptyRecordReader} is returned without consulting
 * the filter.
 *
 * @param columns the page store to read from; must not be null
 * @param recordMaterializer materializes the records; must not be null
 * @param filter the filter that selects the reader implementation; must not be null
 * @param <T> the type of the materialized records
 * @return the record reader selected for the filter
 * @throws NullPointerException if any argument is null
 */
public <T> RecordReader<T> getRecordReader(final PageReadStore columns,
                                           final RecordMaterializer<T> recordMaterializer,
                                           final Filter filter) {
  Objects.requireNonNull(columns, "columns cannot be null");
  Objects.requireNonNull(recordMaterializer, "recordMaterializer cannot be null");
  Objects.requireNonNull(filter, "filter cannot be null");

  if (leaves.isEmpty()) {
    return new EmptyRecordReader<>(recordMaterializer);
  }

  return filter.accept(new Visitor<RecordReader<T>>() {
    @Override
    public RecordReader<T> visit(FilterPredicateCompat filterPredicateCompat) {
      // Wrap the materializer so the predicate is evaluated while records are assembled.
      FilterPredicate filterPredicate = filterPredicateCompat.getFilterPredicate();
      IncrementallyUpdatedFilterPredicateBuilder predicateBuilder =
          new IncrementallyUpdatedFilterPredicateBuilder(leaves);
      IncrementallyUpdatedFilterPredicate incrementalPredicate = predicateBuilder.build(filterPredicate);
      RecordMaterializer<T> filteringMaterializer = new FilteringRecordMaterializer<T>(
          recordMaterializer,
          leaves,
          predicateBuilder.getValueInspectorsByColumn(),
          incrementalPredicate);

      ColumnReadStoreImpl columnStore =
          new ColumnReadStoreImpl(columns, filteringMaterializer.getRootConverter(), getType(), createdBy);
      return new RecordReaderImplementation<>(
          MessageColumnIO.this,
          filteringMaterializer,
          validating,
          columnStore);
    }

    @Override
    public RecordReader<T> visit(UnboundRecordFilterCompat unboundRecordFilterCompat) {
      ColumnReadStoreImpl columnStore =
          new ColumnReadStoreImpl(columns, recordMaterializer.getRootConverter(), getType(), createdBy);
      return new FilteredRecordReader<>(
          MessageColumnIO.this,
          recordMaterializer,
          validating,
          columnStore,
          unboundRecordFilterCompat.getUnboundRecordFilter(),
          columns.getRowCount()
      );
    }

    @Override
    public RecordReader<T> visit(NoOpFilter noOpFilter) {
      // No filtering: read with the caller's materializer as is.
      ColumnReadStoreImpl columnStore =
          new ColumnReadStoreImpl(columns, recordMaterializer.getRootConverter(), getType(), createdBy);
      return new RecordReaderImplementation<>(
          MessageColumnIO.this,
          recordMaterializer,
          validating,
          columnStore);
    }
  });
}
 
Example #17
Source File: ParquetRecordReader.java    From flink with Apache License 2.0 4 votes vote down vote up
/**
 * Creates a record reader from the given read support, read schema and record filter.
 *
 * <p>Each argument is validated with {@code checkNotNull}, and the label passed with it
 * names the argument actually being checked. The labels were previously rotated, so a
 * failure reported the wrong argument.
 *
 * @param readSupport the read support stored on this reader
 * @param readSchema the schema stored on this reader
 * @param filter the record filter stored on this reader
 */
public ParquetRecordReader(ReadSupport<T> readSupport, MessageType readSchema, Filter filter) {
	this.filter = checkNotNull(filter, "filter");
	this.readSupport = checkNotNull(readSupport, "readSupport");
	this.readSchema = checkNotNull(readSchema, "readSchema");
}
 
Example #18
Source File: ParquetReader.java    From tajo with Apache License 2.0 4 votes vote down vote up
/**
 * Sets the filter on this builder.
 *
 * @param filter the filter to use; validated with {@code checkNotNull}
 * @return this builder
 */
public Builder<T> withFilter(Filter filter) {
  final Filter checkedFilter = checkNotNull(filter, "filter");
  this.filter = checkedFilter;
  return this;
}
 
Example #19
Source File: ParquetRecordReader.java    From flink with Apache License 2.0 4 votes vote down vote up
/**
 * Creates a record reader from the given read support, read schema and record filter.
 *
 * <p>Each argument is validated with {@code checkNotNull}, and the label passed with it
 * names the argument actually being checked. The labels were previously rotated, so a
 * failure reported the wrong argument.
 *
 * @param readSupport the read support stored on this reader
 * @param readSchema the schema stored on this reader
 * @param filter the record filter stored on this reader
 */
public ParquetRecordReader(ReadSupport<T> readSupport, MessageType readSchema, Filter filter) {
	this.filter = checkNotNull(filter, "filter");
	this.readSupport = checkNotNull(readSupport, "readSupport");
	this.readSchema = checkNotNull(readSchema, "readSchema");
}
 
Example #20
Source File: RowGroupFilter.java    From parquet-mr with Apache License 2.0 2 votes vote down vote up
/**
 * Applies {@code filter} to {@code blocks} by having it accept a {@code RowGroupFilter}
 * visitor built from the blocks and the schema.
 *
 * @param filter a filter; must not be null
 * @param blocks a list of block metadata to filter
 * @param schema the file schema
 * @return a filtered list of block metadata
 * @throws NullPointerException if {@code filter} is null
 * @deprecated will be removed in 2.0.0.
 */
@Deprecated
public static List<BlockMetaData> filterRowGroups(Filter filter, List<BlockMetaData> blocks, MessageType schema) {
  Objects.requireNonNull(filter, "filter cannot be null");
  RowGroupFilter visitor = new RowGroupFilter(blocks, schema);
  return filter.accept(visitor);
}
 
Example #21
Source File: ParquetRecordReader.java    From parquet-mr with Apache License 2.0 2 votes vote down vote up
/**
 * Creates a record reader backed by a new {@code InternalParquetRecordReader} built from
 * the same arguments.
 *
 * @param readSupport Object which helps read files of the given type, e.g. Thrift, Avro.
 * @param filter for filtering individual records
 */
public ParquetRecordReader(ReadSupport<T> readSupport, Filter filter) {
  this.internalReader = new InternalParquetRecordReader<>(readSupport, filter);
}
 
Example #22
Source File: ParquetInputFormat.java    From parquet-mr with Apache License 2.0 2 votes vote down vote up
/**
 * Returns a non-null Filter, which is a wrapper around either a
 * FilterPredicate, an UnboundRecordFilter, or a no-op filter.
 *
 * <p>The filter is obtained from {@code FilterCompat.get}, given the filter predicate
 * and the unbound record filter instance that are looked up from {@code conf}.
 *
 * @param conf a configuration
 * @return the filter built from the filter predicate and unbound record filter found in conf
 */
public static Filter getFilter(Configuration conf) {
  return FilterCompat.get(getFilterPredicate(conf), getUnboundRecordFilterInstance(conf));
}