Java Code Examples for org.apache.parquet.hadoop.metadata.ColumnChunkMetaData#getPath()

The following examples show how to use org.apache.parquet.hadoop.metadata.ColumnChunkMetaData#getPath(). All of the examples come from the parquet-mr project; the source file each snippet was taken from is noted above its code.
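Before the project examples, here is a minimal standalone sketch (not taken from any project below) that opens a Parquet file, walks the footer metadata, and prints the path reported by getPath() for every column chunk. The file location and class name are placeholders for illustration.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.metadata.BlockMetaData;
import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData;
import org.apache.parquet.hadoop.metadata.ColumnPath;
import org.apache.parquet.hadoop.util.HadoopInputFile;

public class ListColumnChunkPaths {
  public static void main(String[] args) throws Exception {
    // Placeholder location; point this at an existing Parquet file.
    HadoopInputFile file =
        HadoopInputFile.fromPath(new Path("/tmp/example.parquet"), new Configuration());
    try (ParquetFileReader reader = ParquetFileReader.open(file)) {
      // Every row group (block) in the footer carries metadata for each of its column chunks.
      for (BlockMetaData block : reader.getFooter().getBlocks()) {
        for (ColumnChunkMetaData chunk : block.getColumns()) {
          ColumnPath path = chunk.getPath(); // the column's full dotted path in the schema
          System.out.println(path.toDotString() + " -> " + chunk.getTotalSize() + " bytes");
        }
      }
    }
  }
}
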
Example 1
Source File: ColumnIndexStoreImpl.java    From parquet-mr with Apache License 2.0
IndexStoreImpl(ColumnChunkMetaData meta) {
  this.meta = meta;
  OffsetIndex oi;
  try {
    oi = reader.readOffsetIndex(meta);
  } catch (IOException e) {
    // If the I/O issue still stands it will fail the reading later;
    // otherwise we fail the filtering only with a missing offset index.
    LOGGER.warn("Unable to read offset index for column {}", meta.getPath(), e);
    oi = null;
  }
  if (oi == null) {
    throw new MissingOffsetIndexException(meta.getPath());
  }
  offsetIndex = oi;
}
 
Example 2
Source File: ColumnIndexStoreImpl.java    From parquet-mr with Apache License 2.0
private ColumnIndexStoreImpl(ParquetFileReader reader, BlockMetaData block, Set<ColumnPath> paths) {
  // TODO[GS]: Offset index for every paths will be required; pre-read the consecutive ones at once?
  // TODO[GS]: Pre-read column index based on filter?
  this.reader = reader;
  Map<ColumnPath, IndexStore> store = new HashMap<>();
  for (ColumnChunkMetaData column : block.getColumns()) {
    ColumnPath path = column.getPath();
    if (paths.contains(path)) {
      store.put(path, new IndexStoreImpl(column));
    }
  }
  this.store = store;
}
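
Example 2 keys a HashMap by the ColumnPath values returned from getPath(). Below is a small illustrative sketch of that idea, using made-up column names ("id" and "address.city") rather than anything from the project above.

import java.util.HashMap;
import java.util.Map;
import org.apache.parquet.hadoop.metadata.ColumnPath;

public class ColumnPathAsMapKey {
  public static void main(String[] args) {
    // ColumnPath compares by its path elements, so it is safe to use as a map key.
    Map<ColumnPath, String> labels = new HashMap<>();
    labels.put(ColumnPath.get("id"), "primary key");
    labels.put(ColumnPath.get("address", "city"), "nested field");

    // fromDotString builds an equal key from the dotted representation.
    System.out.println(labels.get(ColumnPath.fromDotString("address.city"))); // nested field
  }
}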
 
Example 3
Source File: ColumnIndexValidator.java    From parquet-mr with Apache License 2.0
public static List<ContractViolation> checkContractViolations(InputFile file) throws IOException {
  List<ContractViolation> violations = new ArrayList<>();
  try (ParquetFileReader reader = ParquetFileReader.open(file)) {
    FileMetaData meta = reader.getFooter().getFileMetaData();
    MessageType schema = meta.getSchema();
    List<ColumnDescriptor> columns = schema.getColumns();

    List<BlockMetaData> blocks = reader.getFooter().getBlocks();
    int rowGroupNumber = 0;
    PageReadStore rowGroup = reader.readNextRowGroup();
    while (rowGroup != null) {
      ColumnReadStore columnReadStore = new ColumnReadStoreImpl(rowGroup,
          new DummyRecordConverter(schema).getRootConverter(), schema, null);
      List<ColumnChunkMetaData> columnChunks = blocks.get(rowGroupNumber).getColumns();
      assert (columnChunks.size() == columns.size());
      for (int columnNumber = 0; columnNumber < columns.size(); ++columnNumber) {
        ColumnDescriptor column = columns.get(columnNumber);
        ColumnChunkMetaData columnChunk = columnChunks.get(columnNumber);
        ColumnIndex columnIndex = reader.readColumnIndex(columnChunk);
        if (columnIndex == null) {
          continue;
        }
        ColumnPath columnPath = columnChunk.getPath();
        OffsetIndex offsetIndex = reader.readOffsetIndex(columnChunk);
        List<ByteBuffer> minValues = columnIndex.getMinValues();
        List<ByteBuffer> maxValues = columnIndex.getMaxValues();
        BoundaryOrder boundaryOrder = columnIndex.getBoundaryOrder();
        List<Long> nullCounts = columnIndex.getNullCounts();
        List<Boolean> nullPages = columnIndex.getNullPages();
        long rowNumber = 0;
        ColumnReader columnReader = columnReadStore.getColumnReader(column);
        ByteBuffer prevMinValue = null;
        ByteBuffer prevMaxValue = null;
        for (int pageNumber = 0; pageNumber < offsetIndex.getPageCount(); ++pageNumber) {
          boolean isNullPage = nullPages.get(pageNumber);
          ByteBuffer minValue = minValues.get(pageNumber);
          ByteBuffer maxValue = maxValues.get(pageNumber);
          PageValidator pageValidator = new PageValidator(
              column.getPrimitiveType(),
              rowGroupNumber, columnNumber, columnPath, pageNumber,
              violations, columnReader,
              minValue,
              maxValue,
              prevMinValue,
              prevMaxValue,
              boundaryOrder,
              nullCounts.get(pageNumber),
              isNullPage);
          if (!isNullPage) {
            prevMinValue = minValue;
            prevMaxValue = maxValue;
          }
          long lastRowNumberInPage = offsetIndex.getLastRowIndex(pageNumber, rowGroup.getRowCount());
          while (rowNumber <= lastRowNumberInPage) {
            pageValidator.validateValuesBelongingToRow();
            ++rowNumber;
          }
          pageValidator.finishPage();
        }
      }
      rowGroup = reader.readNextRowGroup();
      rowGroupNumber++;
    }
  }
  return violations;
}
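
A hedged sketch of how checkContractViolations might be driven: it is written as a hypothetical helper assumed to live in the same class as the method above, so that the ContractViolation type is in scope; the path is supplied by the caller.

// Hypothetical helper, assumed to sit next to checkContractViolations above.
public static void printViolations(org.apache.hadoop.fs.Path path) throws IOException {
  InputFile file = org.apache.parquet.hadoop.util.HadoopInputFile.fromPath(
      path, new org.apache.hadoop.conf.Configuration());
  List<ContractViolation> violations = checkContractViolations(file);
  System.out.println(violations.isEmpty()
      ? "No contract violations found"
      : violations.size() + " contract violation(s) found");
}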
 
Example 4
Source File: ParquetFileReader.java    From parquet-mr with Apache License 2.0
/**
 * Reads all the columns requested from the row group at the current file position.
 * @throws IOException if an error occurs while reading
 * @return the PageReadStore which can provide PageReaders for each column.
 */
public PageReadStore readNextRowGroup() throws IOException {
  if (currentBlock == blocks.size()) {
    return null;
  }
  BlockMetaData block = blocks.get(currentBlock);
  if (block.getRowCount() == 0) {
    throw new RuntimeException("Illegal row group of 0 rows");
  }
  this.currentRowGroup = new ColumnChunkPageReadStore(block.getRowCount());
  // prepare the list of consecutive parts to read them in one scan
  List<ConsecutivePartList> allParts = new ArrayList<ConsecutivePartList>();
  ConsecutivePartList currentParts = null;
  for (ColumnChunkMetaData mc : block.getColumns()) {
    ColumnPath pathKey = mc.getPath();
    BenchmarkCounter.incrementTotalBytes(mc.getTotalSize());
    ColumnDescriptor columnDescriptor = paths.get(pathKey);
    if (columnDescriptor != null) {
      long startingPos = mc.getStartingPos();
      // first part or not consecutive => new list
      if (currentParts == null || currentParts.endPos() != startingPos) {
        currentParts = new ConsecutivePartList(startingPos);
        allParts.add(currentParts);
      }
      currentParts.addChunk(new ChunkDescriptor(columnDescriptor, mc, startingPos, (int)mc.getTotalSize()));
    }
  }
  // actually read all the chunks
  ChunkListBuilder builder = new ChunkListBuilder();
  for (ConsecutivePartList consecutiveChunks : allParts) {
    consecutiveChunks.readAll(f, builder);
  }
  for (Chunk chunk : builder.build()) {
    currentRowGroup.addColumn(chunk.descriptor.col, chunk.readAllPages());
  }

  // avoid re-reading bytes: the dictionary reader is used after this call
  if (nextDictionaryReader != null) {
    nextDictionaryReader.setRowGroup(currentRowGroup);
  }

  advanceToNextBlock();

  return currentRowGroup;
}
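
The javadoc above reads one row group per call and returns null once the file is exhausted. A minimal driver loop for it might look like the following sketch; the file path is a placeholder and only row counts are printed.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.column.page.PageReadStore;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.util.HadoopInputFile;

public class RowGroupCounts {
  public static void main(String[] args) throws Exception {
    // Placeholder location; replace with a real Parquet file.
    HadoopInputFile file =
        HadoopInputFile.fromPath(new Path("/tmp/example.parquet"), new Configuration());
    try (ParquetFileReader reader = ParquetFileReader.open(file)) {
      PageReadStore rowGroup;
      int rowGroupNumber = 0;
      // readNextRowGroup() returns null once every row group has been consumed.
      while ((rowGroup = reader.readNextRowGroup()) != null) {
        System.out.println("row group " + rowGroupNumber++ + ": " + rowGroup.getRowCount() + " rows");
      }
    }
  }
}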
 
Example 5
Source File: ParquetFileReader.java    From parquet-mr with Apache License 2.0
/**
 * Reads all the columns requested from the row group at the current file position. It may skip specific pages based
 * on the column indexes according to the actual filter. As the rows are not aligned among the pages of the different
 * columns, row synchronization might be required. See the documentation of the class SynchronizingColumnReader for
 * details.
 *
 * @return the PageReadStore which can provide PageReaders for each column
 * @throws IOException
 *           if any I/O error occurs while reading
 */
public PageReadStore readNextFilteredRowGroup() throws IOException {
  if (currentBlock == blocks.size()) {
    return null;
  }
  if (!options.useColumnIndexFilter()) {
    return readNextRowGroup();
  }
  BlockMetaData block = blocks.get(currentBlock);
  if (block.getRowCount() == 0) {
    throw new RuntimeException("Illegal row group of 0 rows");
  }
  ColumnIndexStore ciStore = getColumnIndexStore(currentBlock);
  RowRanges rowRanges = getRowRanges(currentBlock);
  long rowCount = rowRanges.rowCount();
  if (rowCount == 0) {
    // There are no matching rows -> skipping this row-group
    advanceToNextBlock();
    return readNextFilteredRowGroup();
  }
  if (rowCount == block.getRowCount()) {
    // All rows are matching -> fall back to the non-filtering path
    return readNextRowGroup();
  }

  this.currentRowGroup = new ColumnChunkPageReadStore(rowRanges);
  // prepare the list of consecutive parts to read them in one scan
  ChunkListBuilder builder = new ChunkListBuilder();
  List<ConsecutivePartList> allParts = new ArrayList<ConsecutivePartList>();
  ConsecutivePartList currentParts = null;
  for (ColumnChunkMetaData mc : block.getColumns()) {
    ColumnPath pathKey = mc.getPath();
    ColumnDescriptor columnDescriptor = paths.get(pathKey);
    if (columnDescriptor != null) {
      OffsetIndex offsetIndex = ciStore.getOffsetIndex(mc.getPath());

      OffsetIndex filteredOffsetIndex = filterOffsetIndex(offsetIndex, rowRanges,
          block.getRowCount());
      for (OffsetRange range : calculateOffsetRanges(filteredOffsetIndex, mc, offsetIndex.getOffset(0))) {
        BenchmarkCounter.incrementTotalBytes(range.getLength());
        long startingPos = range.getOffset();
        // first part or not consecutive => new list
        if (currentParts == null || currentParts.endPos() != startingPos) {
          currentParts = new ConsecutivePartList(startingPos);
          allParts.add(currentParts);
        }
        ChunkDescriptor chunkDescriptor = new ChunkDescriptor(columnDescriptor, mc, startingPos,
            (int) range.getLength());
        currentParts.addChunk(chunkDescriptor);
        builder.setOffsetIndex(chunkDescriptor, filteredOffsetIndex);
      }
    }
  }
  // actually read all the chunks
  for (ConsecutivePartList consecutiveChunks : allParts) {
    consecutiveChunks.readAll(f, builder);
  }
  for (Chunk chunk : builder.build()) {
    currentRowGroup.addColumn(chunk.descriptor.col, chunk.readAllPages());
  }

  // avoid re-reading bytes: the dictionary reader is used after this call
  if (nextDictionaryReader != null) {
    nextDictionaryReader.setRowGroup(currentRowGroup);
  }

  advanceToNextBlock();

  return currentRowGroup;
}
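
As the javadoc of Example 5 notes, readNextFilteredRowGroup() only skips pages when a row filter is set and the column-index filter is enabled in the read options. Below is a hedged sketch of wiring that up, assuming a file at a placeholder path containing an INT32 column named "id" (both are illustrative assumptions).

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.ParquetReadOptions;
import org.apache.parquet.column.page.PageReadStore;
import org.apache.parquet.filter2.compat.FilterCompat;
import org.apache.parquet.filter2.predicate.FilterApi;
import org.apache.parquet.filter2.predicate.FilterPredicate;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.util.HadoopInputFile;

public class FilteredRowGroups {
  public static void main(String[] args) throws Exception {
    // Assumed file location and column; replace with real values.
    HadoopInputFile file =
        HadoopInputFile.fromPath(new Path("/tmp/example.parquet"), new Configuration());
    FilterPredicate predicate = FilterApi.eq(FilterApi.intColumn("id"), 42);
    ParquetReadOptions options = ParquetReadOptions.builder()
        .withRecordFilter(FilterCompat.get(predicate))
        .useColumnIndexFilter(true)
        .build();
    try (ParquetFileReader reader = new ParquetFileReader(file, options)) {
      PageReadStore rowGroup;
      // Pages whose column-index statistics cannot match the predicate are skipped.
      while ((rowGroup = reader.readNextFilteredRowGroup()) != null) {
        System.out.println("filtered row group with " + rowGroup.getRowCount() + " rows after page filtering");
      }
    }
  }
}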