org.apache.parquet.filter2.compat.FilterCompat Java Examples

The following examples show how to use org.apache.parquet.filter2.compat.FilterCompat. Each example notes the project and source file it comes from.
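
Before diving into the examples, here is a minimal, self-contained sketch of the typical usage pattern: build a FilterPredicate with the FilterApi DSL, wrap it with FilterCompat.get, and hand it to a reader. The file path and column name below are placeholders, not taken from any of the examples.

import org.apache.hadoop.fs.Path;
import org.apache.parquet.example.data.Group;
import org.apache.parquet.filter2.compat.FilterCompat;
import org.apache.parquet.filter2.predicate.FilterPredicate;
import org.apache.parquet.hadoop.ParquetReader;
import org.apache.parquet.hadoop.example.GroupReadSupport;

import static org.apache.parquet.filter2.predicate.FilterApi.eq;
import static org.apache.parquet.filter2.predicate.FilterApi.longColumn;

public class FilterCompatQuickStart {
  public static void main(String[] args) throws Exception {
    // Build a predicate with the FilterApi DSL and wrap it for the reader API.
    FilterPredicate pred = eq(longColumn("DocId"), 10L);
    FilterCompat.Filter filter = FilterCompat.get(pred);

    // "/tmp/data.parquet" is a placeholder path; ParquetReader is Closeable.
    try (ParquetReader<Group> reader =
             ParquetReader.builder(new GroupReadSupport(), new Path("/tmp/data.parquet"))
                 .withFilter(filter)
                 .build()) {
      for (Group g = reader.read(); g != null; g = reader.read()) {
        System.out.println(g);
      }
    }
  }
}
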
Example #1
Source File: TestFiltered.java    From parquet-mr with Apache License 2.0
@Test
public void testFilterOnInteger() {
  MessageColumnIO columnIO =  new ColumnIOFactory(true).getColumnIO(schema);
  MemPageStore memPageStore = writeTestRecords(columnIO, 1);

  // Get first record
  RecordMaterializer<Group> recordConverter = new GroupRecordConverter(schema);
  RecordReaderImplementation<Group> recordReader = (RecordReaderImplementation<Group>)
      columnIO.getRecordReader(memPageStore, recordConverter, FilterCompat.get(column("DocId", equalTo(10L))));

  readOne(recordReader, "r2 filtered out", r1);

  // Get second record
  recordReader = (RecordReaderImplementation<Group>)
      columnIO.getRecordReader(memPageStore, recordConverter,
          FilterCompat.get(column("DocId", equalTo(20l))));

  readOne(recordReader, "r1 filtered out", r2);

}
 
Example #2
Source File: ColumnIndexFilter.java    From parquet-mr with Apache License 2.0
/**
 * Calculates the row ranges containing the indexes of the rows that might match the specified filter.
 *
 * @param filter
 *          to be used for filtering the rows
 * @param columnIndexStore
 *          the store for providing column/offset indexes
 * @param paths
 *          the paths of the columns used in the actual projection; a column not being part of the projection will be
 *          handled as containing {@code null} values only even if the column has values written in the file
 * @param rowCount
 *          the total number of rows in the row-group
 * @return the ranges of the possible matching row indexes; the returned ranges will contain all the rows if any of
 *         the required offset indexes is missing
 */
public static RowRanges calculateRowRanges(FilterCompat.Filter filter, ColumnIndexStore columnIndexStore,
    Set<ColumnPath> paths, long rowCount) {
  return filter.accept(new FilterCompat.Visitor<RowRanges>() {
    @Override
    public RowRanges visit(FilterPredicateCompat filterPredicateCompat) {
      try {
        return filterPredicateCompat.getFilterPredicate()
            .accept(new ColumnIndexFilter(columnIndexStore, paths, rowCount));
      } catch (MissingOffsetIndexException e) {
        LOGGER.info(e.getMessage());
        return RowRanges.createSingle(rowCount);
      }
    }

    @Override
    public RowRanges visit(UnboundRecordFilterCompat unboundRecordFilterCompat) {
      return RowRanges.createSingle(rowCount);
    }

    @Override
    public RowRanges visit(NoOpFilter noOpFilter) {
      return RowRanges.createSingle(rowCount);
    }
  });
}
 
Example #3
Source File: TestFiltered.java    From parquet-mr with Apache License 2.0
@Test
public void testPaged() {
  MessageColumnIO columnIO =  new ColumnIOFactory(true).getColumnIO(schema);
  MemPageStore memPageStore = writeTestRecords(columnIO, 6);

  RecordMaterializer<Group> recordConverter = new GroupRecordConverter(schema);
  RecordReaderImplementation<Group> recordReader = (RecordReaderImplementation<Group>)
      columnIO.getRecordReader(memPageStore, recordConverter,
          FilterCompat.get(page(4, 4)));

  List<Group> all = readAll(recordReader);
  assertEquals("expecting records " + all, 4, all.size());
  for (int i = 0; i < all.size(); i++) {
    assertEquals("expecting record", (i%2 == 0 ? r2 : r1).toString(), all.get(i).toString());
  }
}
 
Example #4
Source File: TestFiltered.java    From parquet-mr with Apache License 2.0
@Test
public void testFilteredAndPaged() {
  MessageColumnIO columnIO =  new ColumnIOFactory(true).getColumnIO(schema);
  MemPageStore memPageStore = writeTestRecords(columnIO, 8);

  RecordMaterializer<Group> recordConverter = new GroupRecordConverter(schema);
  RecordReaderImplementation<Group> recordReader = (RecordReaderImplementation<Group>)
      columnIO.getRecordReader(memPageStore, recordConverter,
          FilterCompat.get(and(column("DocId", equalTo(10L)), page(2, 4))));

  List<Group> all = readAll(recordReader);
  assertEquals("expecting 4 records " + all, 4, all.size());
  for (int i = 0; i < all.size(); i++) {
    assertEquals("expecting record1", r1.toString(), all.get(i).toString());
  }

}
 
Example #5
Source File: TestFiltered.java    From parquet-mr with Apache License 2.0
@Test
public void testFilteredOrPaged() {
  MessageColumnIO columnIO =  new ColumnIOFactory(true).getColumnIO(schema);
  MemPageStore memPageStore = writeTestRecords(columnIO, 8);

  RecordMaterializer<Group> recordConverter = new GroupRecordConverter(schema);
  RecordReaderImplementation<Group> recordReader = (RecordReaderImplementation<Group>)
      columnIO.getRecordReader(memPageStore, recordConverter,
          FilterCompat.get(or(column("DocId", equalTo(10L)),
              column("DocId", equalTo(20L)))));

  List<Group> all = readAll(recordReader);
  assertEquals("expecting 8 records " + all, 16, all.size());
  for (int i = 0; i < all.size () / 2; i++) {
    assertEquals("expecting record1", r1.toString(), all.get(2 * i).toString());
    assertEquals("expecting record2", r2.toString(), all.get(2 * i + 1).toString());
  }
}
 
Example #6
Source File: TestFiltered.java    From parquet-mr with Apache License 2.0
@Test
public void testFilteredNotPaged() {
  MessageColumnIO columnIO =  new ColumnIOFactory(true).getColumnIO(schema);
  MemPageStore memPageStore = writeTestRecords(columnIO, 8);

  RecordMaterializer<Group> recordConverter = new GroupRecordConverter(schema);
  RecordReaderImplementation<Group> recordReader = (RecordReaderImplementation<Group>)
      columnIO.getRecordReader(memPageStore, recordConverter,
          FilterCompat.get(not(column("DocId", equalTo(10L)))));

  List<Group> all = readAll(recordReader);
  assertEquals("expecting 8 records " + all, 8, all.size());
  for (int i = 0; i < all.size(); i++) {
    assertEquals("expecting record2", r2.toString(), all.get(i).toString());
  }
}
 
Example #7
Source File: ParquetFileReader.java    From parquet-mr with Apache License 2.0
private List<BlockMetaData> filterRowGroups(List<BlockMetaData> blocks) throws IOException {
  // set up data filters based on configured levels
  List<RowGroupFilter.FilterLevel> levels = new ArrayList<>();

  if (options.useStatsFilter()) {
    levels.add(STATISTICS);
  }

  if (options.useDictionaryFilter()) {
    levels.add(DICTIONARY);
  }

  if (options.useBloomFilter()) {
    levels.add(BLOOMFILTER);
  }

  FilterCompat.Filter recordFilter = options.getRecordFilter();
  if (recordFilter != null) {
    return RowGroupFilter.filterRowGroups(levels, recordFilter, blocks, this);
  }

  return blocks;
}
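
The levels and the record filter consulted above come from the ParquetReadOptions the reader was opened with. Below is a hedged sketch of options wiring that exercises every branch of filterRowGroups; it assumes the FilterApi static imports used elsewhere on this page plus org.apache.parquet.hadoop.util.HadoopInputFile, the file path is a placeholder, and the builder method names are those of recent parquet-mr releases.

Configuration conf = new Configuration();
FilterCompat.Filter recordFilter = FilterCompat.get(eq(longColumn("DocId"), 10L));

ParquetReadOptions options = HadoopReadOptions.builder(conf)
    .withRecordFilter(recordFilter)   // without a record filter, filterRowGroups() returns all blocks
    .useStatsFilter(true)             // adds the STATISTICS level
    .useDictionaryFilter(true)        // adds the DICTIONARY level
    .useBloomFilter(true)             // adds the BLOOMFILTER level
    .build();

try (ParquetFileReader reader = ParquetFileReader.open(
    HadoopInputFile.fromPath(new Path("/tmp/data.parquet"), conf), options)) {
  // Row groups that cannot contain matching rows have already been dropped.
  List<BlockMetaData> survivingRowGroups = reader.getRowGroups();
}
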
 
Example #8
Source File: TestFiltered.java    From parquet-mr with Apache License 2.0
@Test
public void testApplyFunctionFilterOnLong() {
  MessageColumnIO columnIO =  new ColumnIOFactory(true).getColumnIO(schema);
  MemPageStore memPageStore = writeTestRecords(columnIO, 1);

  // Get first record
  RecordMaterializer<Group> recordConverter = new GroupRecordConverter(schema);
  RecordReaderImplementation<Group> recordReader = (RecordReaderImplementation<Group>)
      columnIO.getRecordReader(memPageStore, recordConverter,
          FilterCompat.get(column("DocId", equalTo(10l))));

  readOne(recordReader, "r2 filtered out", r1);

  // Get second record
  recordReader = (RecordReaderImplementation<Group>)
      columnIO.getRecordReader(memPageStore, recordConverter,
          FilterCompat.get(column("DocId", applyFunctionToLong(new LongGreaterThan15Predicate()))));

  readOne(recordReader, "r1 filtered out", r2);
}
 
Example #9
Source File: HadoopReadOptions.java    From parquet-mr with Apache License 2.0
private HadoopReadOptions(boolean useSignedStringMinMax,
                          boolean useStatsFilter,
                          boolean useDictionaryFilter,
                          boolean useRecordFilter,
                          boolean useColumnIndexFilter,
                          boolean usePageChecksumVerification,
                          boolean useBloomFilter,
                          FilterCompat.Filter recordFilter,
                          MetadataFilter metadataFilter,
                          CompressionCodecFactory codecFactory,
                          ByteBufferAllocator allocator,
                          int maxAllocationSize,
                          Map<String, String> properties,
                          Configuration conf) {
  super(
      useSignedStringMinMax, useStatsFilter, useDictionaryFilter, useRecordFilter, useColumnIndexFilter,
      usePageChecksumVerification, useBloomFilter, recordFilter, metadataFilter, codecFactory, allocator,
      maxAllocationSize, properties
  );
  this.conf = conf;
}
 
Example #10
Source File: TestColumnIndexFilter.java    From parquet-mr with Apache License 2.0
@Test
public void testFilteringWithAllNullPages() {
  Set<ColumnPath> paths = paths("column1", "column5");

  assertAllRows(calculateRowRanges(FilterCompat.get(
      notEq(longColumn("column5"), 1234567L)),
      STORE, paths, TOTAL_ROW_COUNT),
      TOTAL_ROW_COUNT);
  assertAllRows(calculateRowRanges(FilterCompat.get(
      or(gtEq(intColumn("column1"), 10),
          notEq(longColumn("column5"), 1234567L))),
      STORE, paths, TOTAL_ROW_COUNT),
      TOTAL_ROW_COUNT);
  assertRows(calculateRowRanges(FilterCompat.get(
      eq(longColumn("column5"), 1234567L)),
      STORE, paths, TOTAL_ROW_COUNT));
  assertRows(calculateRowRanges(FilterCompat.get(
      and(lt(intColumn("column1"), 20),
          gtEq(longColumn("column5"), 1234567L))),
      STORE, paths, TOTAL_ROW_COUNT));
}
 
Example #11
Source File: TestColumnIndexFiltering.java    From parquet-mr with Apache License 2.0
@Test
public void testNoFiltering() throws IOException {
  // Column index filtering with no-op filter
  assertEquals(DATA, readUsers(FilterCompat.NOOP, false));
  assertEquals(DATA, readUsers(FilterCompat.NOOP, true));

  // Column index filtering turned off
  assertEquals(DATA.stream().filter(user -> user.getId() == 1234).collect(Collectors.toList()),
      readUsers(eq(longColumn("id"), 1234l), true, false));
  assertEquals(DATA.stream().filter(user -> "miller".equals(user.getName())).collect(Collectors.toList()),
      readUsers(eq(binaryColumn("name"), Binary.fromString("miller")), true, false));
  assertEquals(DATA.stream().filter(user -> user.getName() == null).collect(Collectors.toList()),
      readUsers(eq(binaryColumn("name"), null), true, false));

  // Every filtering mechanism turned off
  assertEquals(DATA, readUsers(eq(longColumn("id"), 1234L), false, false));
  assertEquals(DATA, readUsers(eq(binaryColumn("name"), Binary.fromString("miller")), false, false));
  assertEquals(DATA, readUsers(eq(binaryColumn("name"), null), false, false));
}
 
Example #12
Source File: TestColumnIndexFiltering.java    From parquet-mr with Apache License 2.0
@Test
public void testFilteringWithProjection() throws IOException {
  // All rows shall be retrieved because all values in column 'name' shall be handled as null values
  assertEquals(
      DATA.stream().map(user -> user.cloneWithName(null)).collect(toList()),
      readUsersWithProjection(FilterCompat.get(eq(binaryColumn("name"), null)), SCHEMA_WITHOUT_NAME, true, true));

  // Column index filter shall drop all pages because all values in column 'name' shall be handled as null values
  assertEquals(
      emptyList(),
      readUsersWithProjection(FilterCompat.get(notEq(binaryColumn("name"), null)), SCHEMA_WITHOUT_NAME, false, true));
  assertEquals(
      emptyList(),
      readUsersWithProjection(FilterCompat.get(userDefined(binaryColumn("name"), NameStartsWithVowel.class)),
          SCHEMA_WITHOUT_NAME, false, true));
}
 
Example #13
Source File: TestFiltersWithMissingColumns.java    From parquet-mr with Apache License 2.0
public static long countFilteredRecords(Path path, FilterPredicate pred) throws IOException {
  ParquetReader<Group> reader = ParquetReader
      .builder(new GroupReadSupport(), path)
      .withFilter(FilterCompat.get(pred))
      .build();

  long count = 0;
  try {
    while (reader.read() != null) {
      count += 1;
    }
  } finally {
    reader.close();
  }
  return count;
}
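
A hypothetical call site for the helper above, reusing the FilterApi static imports seen in the other examples; the file path is a placeholder.

// Count records whose "name" column is not null (path is a placeholder).
long matches = countFilteredRecords(
    new Path("/tmp/phonebook.parquet"),
    notEq(binaryColumn("name"), null));
System.out.println("records with a non-null name: " + matches);
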
 
Example #14
Source File: TestRecordLevelFilters.java    From parquet-mr with Apache License 2.0
@Test
public void testUserDefinedByInstance() throws Exception {
  LongColumn name = longColumn("id");

  final HashSet<Long> h = new HashSet<Long>();
  h.add(20L); 
  h.add(27L);
  h.add(28L);
  
  FilterPredicate pred = userDefined(name, new SetInFilter(h));

  List<Group> found = PhoneBookWriter.readFile(phonebookFile, FilterCompat.get(pred));

  assertFilter(found, new UserFilter() {
    @Override
    public boolean keep(User u) {
      return u != null && h.contains(u.getId());
    }
  });
}
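
SetInFilter is a helper defined in the parquet-mr test sources and not shown here; the sketch below is an assumed, illustrative equivalent rather than the actual class. Because the predicate is passed to userDefined by instance, it must also implement Serializable; Statistics here is org.apache.parquet.filter2.predicate.Statistics.

public static class LongSetFilter extends UserDefinedPredicate<Long> implements Serializable {
  private final Set<Long> values;

  public LongSetFilter(Set<Long> values) {
    this.values = values;
  }

  @Override
  public boolean keep(Long value) {
    // Record-level check: keep rows whose value is in the set.
    return value != null && values.contains(value);
  }

  @Override
  public boolean canDrop(Statistics<Long> statistics) {
    // Conservative: never drop a page/row-group based on min/max alone.
    return false;
  }

  @Override
  public boolean inverseCanDrop(Statistics<Long> statistics) {
    return false;
  }
}
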
 
Example #15
Source File: ParquetColumnarRowSplitReader.java    From flink with Apache License 2.0
public ParquetColumnarRowSplitReader(
		boolean utcTimestamp,
		boolean caseSensitive,
		Configuration conf,
		LogicalType[] selectedTypes,
		String[] selectedFieldNames,
		ColumnBatchGenerator generator,
		int batchSize,
		Path path,
		long splitStart,
		long splitLength) throws IOException {
	this.utcTimestamp = utcTimestamp;
	this.selectedTypes = selectedTypes;
	this.batchSize = batchSize;
	// then we need to apply the predicate push down filter
	ParquetMetadata footer = readFooter(conf, path, range(splitStart, splitStart + splitLength));
	MessageType fileSchema = footer.getFileMetaData().getSchema();
	FilterCompat.Filter filter = getFilter(conf);
	List<BlockMetaData> blocks = filterRowGroups(filter, footer.getBlocks(), fileSchema);

	this.fileSchema = footer.getFileMetaData().getSchema();
	this.requestedSchema = clipParquetSchema(fileSchema, selectedFieldNames, caseSensitive);
	this.reader = new ParquetFileReader(
			conf, footer.getFileMetaData(), path, blocks, requestedSchema.getColumns());

	long totalRowCount = 0;
	for (BlockMetaData block : blocks) {
		totalRowCount += block.getRowCount();
	}
	this.totalRowCount = totalRowCount;
	this.nextRow = 0;
	this.rowsInBatch = 0;
	this.rowsReturned = 0;

	checkSchema();

	this.writableVectors = createWritableVectors();
	this.columnarBatch = generator.generate(createReadableVectors());
	this.row = new ColumnarRowData(columnarBatch);
}
 
Example #16
Source File: TestFiltered.java    From parquet-mr with Apache License 2.0
@Test
public void testFilterOnString() {
  MessageColumnIO columnIO =  new ColumnIOFactory(true).getColumnIO(schema);
  MemPageStore memPageStore = writeTestRecords(columnIO, 1);

  // First try matching against the A url in record 1
  RecordMaterializer<Group> recordConverter = new GroupRecordConverter(schema);
  RecordReaderImplementation<Group> recordReader = (RecordReaderImplementation<Group>)
      columnIO.getRecordReader(memPageStore, recordConverter,
          FilterCompat.get(column("Name.Url", equalTo("http://A"))));

  readOne(recordReader, "r2 filtered out", r1);

  // Second try matching against the B url in record 1 - it should fail as we only match
  // against the first instance of a repeated column
  recordReader = (RecordReaderImplementation<Group>)
      columnIO.getRecordReader(memPageStore, recordConverter,
          FilterCompat.get(column("Name.Url", equalTo("http://B"))));

  List<Group> all = readAll(recordReader);
  assertEquals("There should be no matching records: " + all , 0, all.size());

  // Finally try matching against the C url in record 2
  recordReader = (RecordReaderImplementation<Group>)
      columnIO.getRecordReader(memPageStore, recordConverter,
          FilterCompat.get(column("Name.Url", equalTo("http://C"))));

  readOne(recordReader, "r1 filtered out", r2);

}
 
Example #17
Source File: TestFiltered.java    From parquet-mr with Apache License 2.0
@Test
public void testApplyFunctionFilterOnString() {
  MessageColumnIO columnIO =  new ColumnIOFactory(true).getColumnIO(schema);
  MemPageStore memPageStore = writeTestRecords(columnIO, 1);

  // First try matching against the A url in record 1
  RecordMaterializer<Group> recordConverter = new GroupRecordConverter(schema);
  RecordReaderImplementation<Group> recordReader = (RecordReaderImplementation<Group>)
      columnIO.getRecordReader(memPageStore, recordConverter,
          FilterCompat.get(column("Name.Url", applyFunctionToString(new StringEndsWithAPredicate()))));

  readOne(recordReader, "r2 filtered out", r1);

  // Second try matching against the B url in record 1 - it should fail as we only match
  // against the first instance of a repeated column
  recordReader = (RecordReaderImplementation<Group>)
      columnIO.getRecordReader(memPageStore, recordConverter,
          FilterCompat.get(column("Name.Url", equalTo("http://B"))));

  List<Group> all = readAll(recordReader);
  assertEquals("There should be no matching records: " + all , 0, all.size());

  // Finally try matching against the C url in record 2
  recordReader = (RecordReaderImplementation<Group>)
      columnIO.getRecordReader(memPageStore, recordConverter,
          FilterCompat.get(column("Name.Url", equalTo("http://C"))));

  readOne(recordReader, "r1 filtered out", r2);

}
 
Example #18
Source File: ParquetFileAccessor.java    From pxf with Apache License 2.0
/**
 * Opens the resource for read.
 *
 * @throws IOException if opening the resource failed
 */
@Override
public boolean openForRead() throws IOException {
    file = new Path(context.getDataSource());
    FileSplit fileSplit = HdfsUtilities.parseFileSplit(context);

    // Read the original schema from the parquet file
    MessageType originalSchema = getSchema(file, fileSplit);
    // Get a map of the column name to Types for the given schema
    Map<String, Type> originalFieldsMap = getOriginalFieldsMap(originalSchema);
    // Get the read schema. This is either the full set or a subset (in
    // case of column projection) of the greenplum schema.
    MessageType readSchema = buildReadSchema(originalFieldsMap, originalSchema);
    // Get the record filter in case of predicate push-down
    FilterCompat.Filter recordFilter = getRecordFilter(context.getFilterString(), originalFieldsMap, readSchema);

    // add column projection
    configuration.set(PARQUET_READ_SCHEMA, readSchema.toString());

    fileReader = ParquetReader.builder(new GroupReadSupport(), file)
            .withConf(configuration)
            // Create reader for a given split, read a range in file
            .withFileRange(fileSplit.getStart(), fileSplit.getStart() + fileSplit.getLength())
            .withFilter(recordFilter)
            .build();
    context.setMetadata(readSchema);
    return true;
}
 
Example #19
Source File: ParquetFileAccessor.java    From pxf with Apache License 2.0
/**
 * Returns the parquet record filter for the given filter string
 *
 * @param filterString      the filter string
 * @param originalFieldsMap a map of field names to types
 * @param schema            the parquet schema
 * @return the parquet record filter for the given filter string
 */
private FilterCompat.Filter getRecordFilter(String filterString, Map<String, Type> originalFieldsMap, MessageType schema) {
    if (StringUtils.isBlank(filterString)) {
        return FilterCompat.NOOP;
    }

    ParquetRecordFilterBuilder filterBuilder = new ParquetRecordFilterBuilder(
            context.getTupleDescription(), originalFieldsMap);
    TreeVisitor pruner = new ParquetOperatorPrunerAndTransformer(
            context.getTupleDescription(), originalFieldsMap, SUPPORTED_OPERATORS);

    try {
        // Parse the filter string into a expression tree Node
        Node root = new FilterParser().parse(filterString);
        // Prune the parsed tree with valid supported operators and then
        // traverse the pruned tree with the ParquetRecordFilterBuilder to
        // produce a record filter for parquet
        TRAVERSER.traverse(root, pruner, filterBuilder);
        return filterBuilder.getRecordFilter();
    } catch (Exception e) {
        LOG.error(String.format("%s-%d: %s--%s Unable to generate Parquet Record Filter for filter",
                context.getTransactionId(),
                context.getSegmentId(),
                context.getDataSource(),
                context.getFilterString()), e);
        return FilterCompat.NOOP;
    }
}
 
Example #20
Source File: ParquetRecordFilterBuilder.java    From pxf with Apache License 2.0
/**
 * Returns the built record filter
 *
 * @return the built record filter
 */
public FilterCompat.Filter getRecordFilter() {
    FilterPredicate predicate = filterQueue.poll();
    if (!filterQueue.isEmpty()) {
        throw new IllegalStateException("Filter queue is not empty after visiting all nodes");
    }
    return predicate != null ? FilterCompat.get(predicate) : FilterCompat.NOOP;
}
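
On the consuming side, the interesting distinction is usually whether a real predicate came back or the NOOP sentinel. A small hypothetical sketch; readerBuilder stands in for whatever ParquetReader.Builder the caller holds.

FilterCompat.Filter filter = filterBuilder.getRecordFilter();
if (!(filter instanceof FilterCompat.NoOpFilter)) {
    // A predicate was built from the filter string; push it down to the reader.
    readerBuilder.withFilter(filter);
}
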
 
Example #21
Source File: ParquetInputFormat.java    From flink with Apache License 2.0
@Override
public void open(FileInputSplit split) throws IOException {
	// reset the flag when open a new split
	this.skipThisSplit = false;
	org.apache.hadoop.conf.Configuration configuration = new org.apache.hadoop.conf.Configuration();
	InputFile inputFile =
		HadoopInputFile.fromPath(new org.apache.hadoop.fs.Path(split.getPath().toUri()), configuration);
	ParquetReadOptions options = ParquetReadOptions.builder().build();
	ParquetFileReader fileReader = new ParquetFileReader(inputFile, options);
	MessageType fileSchema = fileReader.getFileMetaData().getSchema();
	MessageType readSchema = getReadSchema(fileSchema, split.getPath());
	if (skipThisSplit) {
		LOG.warn(String.format(
			"Escaped the file split [%s] due to mismatch of file schema to expected result schema",
			split.getPath().toString()));
	} else {
		this.parquetRecordReader = new ParquetRecordReader<>(new RowReadSupport(), readSchema,
			filterPredicate == null ? FilterCompat.NOOP : FilterCompat.get(filterPredicate));
		this.parquetRecordReader.initialize(fileReader, configuration);
		this.parquetRecordReader.setSkipCorruptedRecord(this.skipCorruptedRecord);

		if (this.recordConsumed == null) {
			this.recordConsumed = getRuntimeContext().getMetricGroup().counter("parquet-records-consumed");
		}

		LOG.debug(String.format("Open ParquetInputFormat with FileInputSplit [%s]", split.getPath().toString()));
	}
}
 
Example #22
Source File: InternalParquetRecordReader.java    From parquet-mr with Apache License 2.0
private void checkRead() throws IOException {
  if (current == totalCountLoadedSoFar) {
    if (current != 0) {
      totalTimeSpentProcessingRecords += (System.currentTimeMillis() - startedAssemblingCurrentBlockAt);
      if (LOG.isInfoEnabled()) {
          LOG.info("Assembled and processed " + totalCountLoadedSoFar + " records from " + columnCount + " columns in " + totalTimeSpentProcessingRecords + " ms: "+((float)totalCountLoadedSoFar / totalTimeSpentProcessingRecords) + " rec/ms, " + ((float)totalCountLoadedSoFar * columnCount / totalTimeSpentProcessingRecords) + " cell/ms");
          final long totalTime = totalTimeSpentProcessingRecords + totalTimeSpentReadingBytes;
          if (totalTime != 0) {
              final long percentReading = 100 * totalTimeSpentReadingBytes / totalTime;
              final long percentProcessing = 100 * totalTimeSpentProcessingRecords / totalTime;
              LOG.info("time spent so far " + percentReading + "% reading ("+totalTimeSpentReadingBytes+" ms) and " + percentProcessing + "% processing ("+totalTimeSpentProcessingRecords+" ms)");
          }
      }
    }

    LOG.info("at row " + current + ". reading next block");
    long t0 = System.currentTimeMillis();
    PageReadStore pages = reader.readNextFilteredRowGroup();
    if (pages == null) {
      throw new IOException("expecting more rows but reached last block. Read " + current + " out of " + total);
    }
    long timeSpentReading = System.currentTimeMillis() - t0;
    totalTimeSpentReadingBytes += timeSpentReading;
    BenchmarkCounter.incrementTime(timeSpentReading);
    if (LOG.isInfoEnabled()) LOG.info("block read in memory in {} ms. row count = {}", timeSpentReading, pages.getRowCount());
    LOG.debug("initializing Record assembly with requested schema {}", requestedSchema);
    MessageColumnIO columnIO = columnIOFactory.getColumnIO(requestedSchema, fileSchema, strictTypeChecking);
    recordReader = columnIO.getRecordReader(pages, recordConverter,
        filterRecords ? filter : FilterCompat.NOOP);
    startedAssemblingCurrentBlockAt = System.currentTimeMillis();
    totalCountLoadedSoFar += pages.getRowCount();
    ++ currentBlock;
  }
}
 
Example #23
Source File: ParquetReader.java    From parquet-mr with Apache License 2.0
private ParquetReader(Configuration conf,
                      Path file,
                      ReadSupport<T> readSupport,
                      FilterCompat.Filter filter) throws IOException {
  this(Collections.singletonList((InputFile) HadoopInputFile.fromPath(file, conf)),
      HadoopReadOptions.builder(conf)
          .withRecordFilter(Objects.requireNonNull(filter, "filter cannot be null"))
          .build(),
      readSupport);
}
 
Example #24
Source File: ParquetReadOptions.java    From parquet-mr with Apache License 2.0
ParquetReadOptions(boolean useSignedStringMinMax,
                   boolean useStatsFilter,
                   boolean useDictionaryFilter,
                   boolean useRecordFilter,
                   boolean useColumnIndexFilter,
                   boolean usePageChecksumVerification,
                   boolean useBloomFilter,
                   FilterCompat.Filter recordFilter,
                   ParquetMetadataConverter.MetadataFilter metadataFilter,
                   CompressionCodecFactory codecFactory,
                   ByteBufferAllocator allocator,
                   int maxAllocationSize,
                   Map<String, String> properties) {
  this.useSignedStringMinMax = useSignedStringMinMax;
  this.useStatsFilter = useStatsFilter;
  this.useDictionaryFilter = useDictionaryFilter;
  this.useRecordFilter = useRecordFilter;
  this.useColumnIndexFilter = useColumnIndexFilter;
  this.usePageChecksumVerification = usePageChecksumVerification;
  this.useBloomFilter = useBloomFilter;
  this.recordFilter = recordFilter;
  this.metadataFilter = metadataFilter;
  this.codecFactory = codecFactory;
  this.allocator = allocator;
  this.maxAllocationSize = maxAllocationSize;
  this.properties = Collections.unmodifiableMap(properties);
}
 
Example #25
Source File: TestBloomFiltering.java    From parquet-mr with Apache License 2.0
private List<PhoneBookWriter.User> readUsers(FilterPredicate filter, boolean useOtherFiltering,
                                             boolean useBloomFilter) throws IOException {
  return PhoneBookWriter.readUsers(ParquetReader.builder(new GroupReadSupport(), file)
    .withFilter(FilterCompat.get(filter))
    .useDictionaryFilter(useOtherFiltering)
    .useStatsFilter(useOtherFiltering)
    .useRecordFilter(useOtherFiltering)
    .useBloomFilter(useBloomFilter)
    .useColumnIndexFilter(useOtherFiltering));
}
 
Example #26
Source File: TestMultipleWriteRead.java    From parquet-mr with Apache License 2.0
private void validateFileWithComplexFilter(Path file, List<Group> data) throws IOException {
  Binary binaryValueB = fromString("b");
  Filter filter = FilterCompat.get(
      and(
          gtEq(intColumn("id"), 0),
          and(
              lt(binaryColumn("name"), binaryValueB),
              notEq(binaryColumn("comment"), null))));
  Predicate<Group> predicate = group -> group.getInteger("id", 0) >= 0
      && BINARY_COMPARATOR.compare(group.getBinary("name", 0), binaryValueB) < 0
      && group.getFieldRepetitionCount("comment") > 0;
  validateFile(file, filter, data.stream().filter(predicate));
}
 
Example #27
Source File: TestRecordLevelFilters.java    From parquet-mr with Apache License 2.0
@Test
public void testNoFilter() throws Exception {
  List<Group> found = PhoneBookWriter.readFile(phonebookFile, FilterCompat.NOOP);
  assertFilter(found, new UserFilter() {
    @Override
    public boolean keep(User u) {
      return true;
    }
  });
}
 
Example #28
Source File: TestRecordLevelFilters.java    From parquet-mr with Apache License 2.0
@Test
public void testAllFilter() throws Exception {
  BinaryColumn name = binaryColumn("name");

  FilterPredicate pred = eq(name, Binary.fromString("no matches"));

  List<Group> found = PhoneBookWriter.readFile(phonebookFile, FilterCompat.get(pred));
  assertEquals(new ArrayList<Group>(), found);
}
 
Example #29
Source File: TestRecordLevelFilters.java    From parquet-mr with Apache License 2.0
@Test
public void testNameNotNull() throws Exception {
  BinaryColumn name = binaryColumn("name");

  FilterPredicate pred = notEq(name, null);

  List<Group> found = PhoneBookWriter.readFile(phonebookFile, FilterCompat.get(pred));

  assertFilter(found, new UserFilter() {
    @Override
    public boolean keep(User u) {
      return u.getName() != null;
    }
  });
}
 
Example #30
Source File: TestRecordLevelFilters.java    From parquet-mr with Apache License 2.0
@Test
public void testNameNotStartWithP() throws Exception {
  BinaryColumn name = binaryColumn("name");

  FilterPredicate pred = not(userDefined(name, StartWithP.class));

  List<Group> found = PhoneBookWriter.readFile(phonebookFile, FilterCompat.get(pred));

  assertFilter(found, new UserFilter() {
    @Override
    public boolean keep(User u) {
      return u.getName() == null || !u.getName().startsWith("p");
    }
  });
}
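
StartWithP is likewise a test-local predicate not shown here. Because it is referenced by class, parquet-mr instantiates it reflectively, so it needs a public no-arg constructor. Below is a hedged, illustrative sketch of such a predicate over Binary values (not the actual test class).

public static class StartsWithPrefix extends UserDefinedPredicate<Binary> {
  // Referenced by class via userDefined(column, StartsWithPrefix.class),
  // so a public no-arg constructor is required.
  public StartsWithPrefix() { }

  @Override
  public boolean keep(Binary value) {
    return value != null && value.toStringUsingUTF8().startsWith("p");
  }

  @Override
  public boolean canDrop(Statistics<Binary> statistics) {
    return false;  // conservative: never drop pages based on min/max
  }

  @Override
  public boolean inverseCanDrop(Statistics<Binary> statistics) {
    return false;
  }
}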