Java Code Examples for org.apache.parquet.hadoop.ParquetFileReader#readNextRowGroup()

The following examples show how to use org.apache.parquet.hadoop.ParquetFileReader#readNextRowGroup() . You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: ParquetResolverTest.java    From pxf with Apache License 2.0 6 votes vote down vote up
@SuppressWarnings("deprecation")
private List<Group> readParquetFile(String file, long expectedSize, MessageType schema) throws IOException {
    List<Group> result = new ArrayList<>();
    String parquetFile = Objects.requireNonNull(getClass().getClassLoader().getResource("parquet/" + file)).getPath();
    Path path = new Path(parquetFile);

    ParquetFileReader fileReader = new ParquetFileReader(new Configuration(), path, ParquetMetadataConverter.NO_FILTER);
    PageReadStore rowGroup;
    while ((rowGroup = fileReader.readNextRowGroup()) != null) {
        MessageColumnIO columnIO = new ColumnIOFactory().getColumnIO(schema);
        RecordReader<Group> recordReader = columnIO.getRecordReader(rowGroup, new GroupRecordConverter(schema));
        long rowCount = rowGroup.getRowCount();
        for (long i = 0; i < rowCount; i++) {
            result.add(recordReader.read());
        }
    }
    fileReader.close();
    assertEquals(expectedSize, result.size());
    return result;
}
 
Example 2
Source File: TestStatistics.java    From parquet-mr with Apache License 2.0 6 votes vote down vote up
@Override
public void test() throws IOException {
  Configuration configuration = new Configuration();
  ParquetMetadata metadata = ParquetFileReader.readFooter(configuration,
      super.fsPath, ParquetMetadataConverter.NO_FILTER);
  ParquetFileReader reader = new ParquetFileReader(configuration,
    metadata.getFileMetaData(),
    super.fsPath,
    metadata.getBlocks(),
    metadata.getFileMetaData().getSchema().getColumns());

  PageStatsValidator validator = new PageStatsValidator();

  PageReadStore pageReadStore;
  while ((pageReadStore = reader.readNextRowGroup()) != null) {
    validator.validate(metadata.getFileMetaData().getSchema(), pageReadStore);
  }
}
 
Example 3
Source File: FileEncodingsIT.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
private static List<PageReadStore> readBlocksFromFile(Path file) throws IOException {
  List<PageReadStore> rowGroups = new ArrayList<PageReadStore>();

  ParquetMetadata metadata = ParquetFileReader.readFooter(configuration, file, ParquetMetadataConverter.NO_FILTER);
  ParquetFileReader fileReader = new ParquetFileReader(configuration, metadata.getFileMetaData(), file, metadata.getBlocks(),
      metadata.getFileMetaData().getSchema().getColumns());

  PageReadStore group;
  while ((group = fileReader.readNextRowGroup()) != null) {
    rowGroups.add(group);
  }

  return rowGroups;
}
 
Example 4
Source File: CheckParquet251Command.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
private String check(String file) throws IOException {
  Path path = qualifiedPath(file);
  ParquetMetadata footer = ParquetFileReader.readFooter(
      getConf(), path, ParquetMetadataConverter.NO_FILTER);

  FileMetaData meta = footer.getFileMetaData();
  String createdBy = meta.getCreatedBy();
  if (CorruptStatistics.shouldIgnoreStatistics(createdBy, BINARY)) {
    // create fake metadata that will read corrupt stats and return them
    FileMetaData fakeMeta = new FileMetaData(
        meta.getSchema(), meta.getKeyValueMetaData(), Version.FULL_VERSION);

    // get just the binary columns
    List<ColumnDescriptor> columns = Lists.newArrayList();
    Iterables.addAll(columns, Iterables.filter(
        meta.getSchema().getColumns(),
        new Predicate<ColumnDescriptor>() {
          @Override
          public boolean apply(@Nullable ColumnDescriptor input) {
            return input != null && input.getType() == BINARY;
          }
        }));

    // now check to see if the data is actually corrupt
    ParquetFileReader reader = new ParquetFileReader(getConf(),
        fakeMeta, path, footer.getBlocks(), columns);

    try {
      PageStatsValidator validator = new PageStatsValidator();
      for (PageReadStore pages = reader.readNextRowGroup(); pages != null;
           pages = reader.readNextRowGroup()) {
        validator.validate(columns, pages);
      }
    } catch (BadStatsException e) {
      return e.getMessage();
    }
  }

  return null;
}