Java Code Examples for org.apache.parquet.hadoop.metadata.ColumnChunkMetaData#getStartingPos()

The following examples show how to use org.apache.parquet.hadoop.metadata.ColumnChunkMetaData#getStartingPos(). Each example is taken from an open source project; the source file, project, and license are noted above it.
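
Before the project examples, this minimal sketch shows where a ColumnChunkMetaData instance usually comes from and what getStartingPos() reports: open a file with ParquetFileReader, walk the row groups (BlockMetaData) in the footer, and read each column chunk's absolute starting byte offset. The sketch is not taken from the projects below; the file path and the decision to simply print the offsets are assumptions chosen purely for illustration.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.metadata.BlockMetaData;
import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;
import org.apache.parquet.hadoop.util.HadoopInputFile;

public class StartingPosExample {
  public static void main(String[] args) throws Exception {
    // Hypothetical input path; replace with a real Parquet file.
    Path path = new Path("/tmp/example.parquet");
    Configuration conf = new Configuration();

    try (ParquetFileReader reader =
             ParquetFileReader.open(HadoopInputFile.fromPath(path, conf))) {
      ParquetMetadata footer = reader.getFooter();
      for (BlockMetaData block : footer.getBlocks()) {
        for (ColumnChunkMetaData chunk : block.getColumns()) {
          // getStartingPos() is the absolute file offset at which this column
          // chunk's first page (or dictionary page, if present) begins.
          System.out.printf("%s starts at byte %d, total %d bytes%n",
              chunk.getPath(), chunk.getStartingPos(), chunk.getTotalSize());
        }
      }
    }
  }
}
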
Example 1
Source File: ColumnIndexFilterUtils.java    From parquet-mr with Apache License 2.0
static List<OffsetRange> calculateOffsetRanges(OffsetIndex offsetIndex, ColumnChunkMetaData cm,
    long firstPageOffset) {
  List<OffsetRange> ranges = new ArrayList<>();
  int n = offsetIndex.getPageCount();
  if (n > 0) {
    OffsetRange currentRange = null;

    // Add a range for the dictionary page if required
    long rowGroupOffset = cm.getStartingPos();
    if (rowGroupOffset < firstPageOffset) {
      currentRange = new OffsetRange(rowGroupOffset, (int) (firstPageOffset - rowGroupOffset));
      ranges.add(currentRange);
    }

    for (int i = 0; i < n; ++i) {
      long offset = offsetIndex.getOffset(i);
      int length = offsetIndex.getCompressedPageSize(i);
      if (currentRange == null || !currentRange.extend(offset, length)) {
        currentRange = new OffsetRange(offset, length);
        ranges.add(currentRange);
      }
    }
  }
  return ranges;
}
 
Example 2
Source File: ParquetFileReader.java    From parquet-mr with Apache License 2.0
/**
 * Reads and decompresses a dictionary page for the given column chunk.
 *
 * Returns null if the given column chunk has no dictionary page.
 *
 * @param meta a column's ColumnChunkMetaData to read the dictionary from
 * @return an uncompressed DictionaryPage or null
 * @throws IOException if there is an error while reading the dictionary
 */
DictionaryPage readDictionary(ColumnChunkMetaData meta) throws IOException {
  if (!meta.getEncodings().contains(Encoding.PLAIN_DICTIONARY) &&
      !meta.getEncodings().contains(Encoding.RLE_DICTIONARY)) {
    return null;
  }

  // TODO: this should use getDictionaryPageOffset() but it isn't reliable.
  if (f.getPos() != meta.getStartingPos()) {
    f.seek(meta.getStartingPos());
  }

  PageHeader pageHeader = Util.readPageHeader(f);
  if (!pageHeader.isSetDictionary_page_header()) {
    return null; // TODO: should this complain?
  }

  DictionaryPage compressedPage = readCompressedDictionary(pageHeader, f);
  BytesInputDecompressor decompressor = options.getCodecFactory().getDecompressor(meta.getCodec());

  return new DictionaryPage(
      decompressor.decompress(compressedPage.getBytes(), compressedPage.getUncompressedSize()),
      compressedPage.getDictionarySize(),
      compressedPage.getEncoding());
}
 
Example 3
Source File: TestParquetWriterAppendBlocks.java    From parquet-mr with Apache License 2.0
public void assertColumnsEquivalent(List<ColumnChunkMetaData> expected,
                                    List<ColumnChunkMetaData> actual) {
  Assert.assertEquals("Should have the expected columns",
      expected.size(), actual.size());
  for (int i = 0; i < actual.size(); i += 1) {
    ColumnChunkMetaData current = actual.get(i);
    if (i != 0) {
      ColumnChunkMetaData previous = actual.get(i - 1);
      long expectedStart = previous.getStartingPos() + previous.getTotalSize();
      Assert.assertEquals("Should start after the previous column",
          expectedStart, current.getStartingPos());
    }

    assertColumnMetadataEquivalent(expected.get(i), current);
  }
}
 
Example 4
Source File: PageReader.java    From Bats with Apache License 2.0
PageReader(org.apache.drill.exec.store.parquet.columnreaders.ColumnReader<?> parentStatus, FileSystem fs, Path path, ColumnChunkMetaData columnChunkMetaData)
  throws ExecutionSetupException {
  this.parentColumnReader = parentStatus;
  allocatedDictionaryBuffers = new ArrayList<ByteBuf>();
  codecFactory = parentColumnReader.parentReader.getCodecFactory();
  this.stats = parentColumnReader.parentReader.parquetReaderStats;
  this.fileName = path.toString();
  debugName = new StringBuilder()
     .append(this.parentColumnReader.parentReader.getFragmentContext().getFragIdString())
     .append(":")
     .append(this.parentColumnReader.parentReader.getOperatorContext().getStats().getId())
     .append(this.parentColumnReader.columnChunkMetaData.toString())
     .toString();
  try {
    inputStream = fs.open(path);
    BufferAllocator allocator = parentColumnReader.parentReader.getOperatorContext().getAllocator();
    columnChunkMetaData.getTotalUncompressedSize();
    useBufferedReader = parentColumnReader.parentReader.useBufferedReader;
    scanBufferSize = parentColumnReader.parentReader.bufferedReadSize;
    useFadvise = parentColumnReader.parentReader.useFadvise;
    enforceTotalSize = parentColumnReader.parentReader.enforceTotalSize;
    if (useBufferedReader) {
      this.dataReader = new BufferedDirectBufInputStream(inputStream, allocator, path.getName(),
          columnChunkMetaData.getStartingPos(), columnChunkMetaData.getTotalSize(), scanBufferSize,
          enforceTotalSize, useFadvise);
    } else {
      this.dataReader = new DirectBufInputStream(inputStream, allocator, path.getName(),
          columnChunkMetaData.getStartingPos(), columnChunkMetaData.getTotalSize(), enforceTotalSize,
          useFadvise);
    }
  } catch (IOException e) {
    throw new ExecutionSetupException("Error opening or reading metadata for parquet file at location: "
        + path.getName(), e);
  }
}
 
Example 5
Source File: ColumnChunkIncReadStore.java    From Bats with Apache License 2.0
public ColumnChunkIncPageReader(ColumnChunkMetaData metaData, ColumnDescriptor columnDescriptor, FSDataInputStream in) throws IOException {
  this.metaData = metaData;
  this.columnDescriptor = columnDescriptor;
  this.size = metaData.getTotalSize();
  this.fileOffset = metaData.getStartingPos();
  this.in = in;
  this.decompressor = codecFactory.getDecompressor(metaData.getCodec());
}
 
Example 6
Source File: ParquetReader.java    From presto with Apache License 2.0
public ParquetReader(
        Optional<String> fileCreatedBy,
        MessageColumnIO messageColumnIO,
        List<BlockMetaData> blocks,
        ParquetDataSource dataSource,
        AggregatedMemoryContext systemMemoryContext,
        ParquetReaderOptions options)
        throws IOException
{
    this.fileCreatedBy = requireNonNull(fileCreatedBy, "fileCreatedBy is null");
    this.columns = requireNonNull(messageColumnIO, "messageColumnIO is null").getLeaves();
    this.blocks = requireNonNull(blocks, "blocks is null");
    this.dataSource = requireNonNull(dataSource, "dataSource is null");
    this.systemMemoryContext = requireNonNull(systemMemoryContext, "systemMemoryContext is null");
    this.currentRowGroupMemoryContext = systemMemoryContext.newAggregatedMemoryContext();
    this.options = requireNonNull(options, "options is null");
    this.columnReaders = new PrimitiveColumnReader[columns.size()];
    this.maxBytesPerCell = new long[columns.size()];

    Map<ChunkKey, DiskRange> ranges = new HashMap<>();
    for (int rowGroup = 0; rowGroup < blocks.size(); rowGroup++) {
        BlockMetaData metadata = blocks.get(rowGroup);
        for (PrimitiveColumnIO column : columns) {
            int columnId = column.getId();
            ColumnChunkMetaData chunkMetadata = getColumnChunkMetaData(metadata, column.getColumnDescriptor());
            DiskRange range = new DiskRange(chunkMetadata.getStartingPos(), toIntExact(chunkMetadata.getTotalSize()));
            ranges.put(new ChunkKey(columnId, rowGroup), range);
        }
    }

    this.chunkReaders = dataSource.planRead(ranges);
}
 
Example 7
Source File: ColumnChunkIncReadStore.java    From dremio-oss with Apache License 2.0
public ColumnChunkIncPageReader(ColumnChunkMetaData metaData, ColumnDescriptor columnDescriptor, BulkInputStream in) throws IOException {
  this.metaData = metaData;
  this.columnDescriptor = columnDescriptor;
  this.size = metaData.getTotalSize();
  this.fileOffset = metaData.getStartingPos();
  this.in = in;
  this.decompressor = codecFactory.getDecompressor(metaData.getCodec());
}
 
Example 8
Source File: ParquetFileReader.java    From parquet-mr with Apache License 2.0
/**
 * Reads all the columns requested from the row group at the current file position.
 * @throws IOException if an error occurs while reading
 * @return the PageReadStore which can provide PageReaders for each column.
 */
public PageReadStore readNextRowGroup() throws IOException {
  if (currentBlock == blocks.size()) {
    return null;
  }
  BlockMetaData block = blocks.get(currentBlock);
  if (block.getRowCount() == 0) {
    throw new RuntimeException("Illegal row group of 0 rows");
  }
  this.currentRowGroup = new ColumnChunkPageReadStore(block.getRowCount());
  // prepare the list of consecutive parts to read them in one scan
  List<ConsecutivePartList> allParts = new ArrayList<ConsecutivePartList>();
  ConsecutivePartList currentParts = null;
  for (ColumnChunkMetaData mc : block.getColumns()) {
    ColumnPath pathKey = mc.getPath();
    BenchmarkCounter.incrementTotalBytes(mc.getTotalSize());
    ColumnDescriptor columnDescriptor = paths.get(pathKey);
    if (columnDescriptor != null) {
      long startingPos = mc.getStartingPos();
      // first part or not consecutive => new list
      if (currentParts == null || currentParts.endPos() != startingPos) {
        currentParts = new ConsecutivePartList(startingPos);
        allParts.add(currentParts);
      }
      currentParts.addChunk(new ChunkDescriptor(columnDescriptor, mc, startingPos, (int)mc.getTotalSize()));
    }
  }
  // actually read all the chunks
  ChunkListBuilder builder = new ChunkListBuilder();
  for (ConsecutivePartList consecutiveChunks : allParts) {
    consecutiveChunks.readAll(f, builder);
  }
  for (Chunk chunk : builder.build()) {
    currentRowGroup.addColumn(chunk.descriptor.col, chunk.readAllPages());
  }

  // avoid re-reading bytes if the dictionary reader is used after this call
  if (nextDictionaryReader != null) {
    nextDictionaryReader.setRowGroup(currentRowGroup);
  }

  advanceToNextBlock();

  return currentRowGroup;
}