Java Code Examples for org.apache.parquet.hadoop.metadata.BlockMetaData

The following examples show how to use org.apache.parquet.hadoop.metadata.BlockMetaData. They are extracted from open source projects; the source project, author, file, and license are listed above each example.
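For orientation, here is a minimal, self-contained sketch (not taken from the projects below) of how BlockMetaData objects are typically obtained: it reads a Parquet footer with parquet-mr's ParquetFileReader and walks the row groups and their column chunks. The file path is a placeholder and error handling is omitted.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.metadata.BlockMetaData;
import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;
import org.apache.parquet.hadoop.util.HadoopInputFile;

public class BlockMetaDataTour {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Path path = new Path("/tmp/data.parquet"); // placeholder path
    try (ParquetFileReader reader =
        ParquetFileReader.open(HadoopInputFile.fromPath(path, conf))) {
      ParquetMetadata footer = reader.getFooter();
      // each BlockMetaData describes one row group
      for (BlockMetaData block : footer.getBlocks()) {
        System.out.printf("row group: rows=%d, compressed bytes=%d%n",
            block.getRowCount(), block.getCompressedSize());
        // each row group carries per-column chunk metadata
        for (ColumnChunkMetaData column : block.getColumns()) {
          System.out.printf("  %s: start=%d, totalSize=%d, codec=%s%n",
              column.getPath().toDotString(), column.getStartingPos(),
              column.getTotalSize(), column.getCodec());
        }
      }
    }
  }
}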
Example #1
Source Project: presto   Author: prestosql   File: PredicateUtils.java    License: Apache License 2.0
private static boolean dictionaryPredicatesMatch(Predicate parquetPredicate, BlockMetaData blockMetadata, ParquetDataSource dataSource, Map<List<String>, RichColumnDescriptor> descriptorsByPath, TupleDomain<ColumnDescriptor> parquetTupleDomain)
{
    for (ColumnChunkMetaData columnMetaData : blockMetadata.getColumns()) {
        RichColumnDescriptor descriptor = descriptorsByPath.get(Arrays.asList(columnMetaData.getPath().toArray()));
        if (descriptor != null) {
            if (isOnlyDictionaryEncodingPages(columnMetaData) && isColumnPredicate(descriptor, parquetTupleDomain)) {
                byte[] buffer = new byte[toIntExact(columnMetaData.getTotalSize())];
                dataSource.readFully(columnMetaData.getStartingPos(), buffer);
                //  Early abort, predicate already filters block so no more dictionaries need be read
                if (!parquetPredicate.matches(new DictionaryDescriptor(descriptor, readDictionaryPage(buffer, columnMetaData.getCodec())))) {
                    return false;
                }
            }
        }
    }
    return true;
}
 
Example #2
Source Project: iceberg   Author: apache   File: TestMetricsRowGroupFilter.java    License: Apache License 2.0
@Test
public void testZeroRecordFileParquet() {
  Assume.assumeTrue(format == FileFormat.PARQUET);
  BlockMetaData emptyBlock = new BlockMetaData();
  emptyBlock.setRowCount(0);

  Expression[] exprs = new Expression[] {
      lessThan("id", 5), lessThanOrEqual("id", 30), equal("id", 70), greaterThan("id", 78),
      greaterThanOrEqual("id", 90), notEqual("id", 101), isNull("some_nulls"),
      notNull("some_nulls")
  };

  for (Expression expr : exprs) {
    boolean shouldRead = shouldReadParquet(expr, true, parquetSchema, emptyBlock);
    Assert.assertFalse("Should never read 0-record file: " + expr, shouldRead);
  }
}
 
Example #3
Source Project: iceberg   Author: apache   File: ReadConf.java    License: Apache License 2.0
private List<Map<ColumnPath, ColumnChunkMetaData>> getColumnChunkMetadataForRowGroups() {
  Set<ColumnPath> projectedColumns = projection.getColumns().stream()
      .map(columnDescriptor -> ColumnPath.get(columnDescriptor.getPath())).collect(Collectors.toSet());
  ImmutableList.Builder<Map<ColumnPath, ColumnChunkMetaData>> listBuilder = ImmutableList.builder();
  for (int i = 0; i < rowGroups.size(); i++) {
    if (!shouldSkip[i]) {
      BlockMetaData blockMetaData = rowGroups.get(i);
      ImmutableMap.Builder<ColumnPath, ColumnChunkMetaData> mapBuilder = ImmutableMap.builder();
      blockMetaData.getColumns().stream()
          .filter(columnChunkMetaData -> projectedColumns.contains(columnChunkMetaData.getPath()))
          .forEach(columnChunkMetaData -> mapBuilder.put(columnChunkMetaData.getPath(), columnChunkMetaData));
      listBuilder.add(mapBuilder.build());
    } else {
      listBuilder.add(ImmutableMap.of());
    }
  }
  return listBuilder.build();
}
 
Example #4
Source Project: parquet-mr   Author: apache   File: ParquetMetadataConverter.java    License: Apache License 2.0
public FileMetaData toParquetMetadata(int currentVersion, ParquetMetadata parquetMetadata) {
  List<BlockMetaData> blocks = parquetMetadata.getBlocks();
  List<RowGroup> rowGroups = new ArrayList<RowGroup>();
  long numRows = 0;
  for (BlockMetaData block : blocks) {
    numRows += block.getRowCount();
    addRowGroup(parquetMetadata, rowGroups, block);
  }
  FileMetaData fileMetaData = new FileMetaData(
      currentVersion,
      toParquetSchema(parquetMetadata.getFileMetaData().getSchema()),
      numRows,
      rowGroups);

  Set<Entry<String, String>> keyValues = parquetMetadata.getFileMetaData().getKeyValueMetaData().entrySet();
  for (Entry<String, String> keyValue : keyValues) {
    addKeyValue(fileMetaData, keyValue.getKey(), keyValue.getValue());
  }

  fileMetaData.setCreated_by(parquetMetadata.getFileMetaData().getCreatedBy());

  fileMetaData.setColumn_orders(getColumnOrders(parquetMetadata.getFileMetaData().getSchema()));

  return fileMetaData;
}
 
Example #5
Source Project: iceberg   Author: Netflix   File: ParquetMetricsRowGroupFilter.java    License: Apache License 2.0
private boolean eval(MessageType fileSchema, BlockMetaData rowGroup) {
  if (rowGroup.getRowCount() <= 0) {
    return ROWS_CANNOT_MATCH;
  }

  this.stats = Maps.newHashMap();
  this.valueCounts = Maps.newHashMap();
  this.conversions = Maps.newHashMap();
  for (ColumnChunkMetaData col : rowGroup.getColumns()) {
    PrimitiveType colType = fileSchema.getType(col.getPath().toArray()).asPrimitiveType();
    if (colType.getId() != null) {
      int id = colType.getId().intValue();
      stats.put(id, col.getStatistics());
      valueCounts.put(id, col.getValueCount());
      conversions.put(id, converterFromParquet(colType));
    }
  }

  return ExpressionVisitors.visit(expr, this);
}
 
Example #6
Source Project: iceberg   Author: Netflix   File: TestMetricsRowGroupFilter.java    License: Apache License 2.0
@Test
public void testZeroRecordFile() {
  BlockMetaData emptyBlock = new BlockMetaData();
  emptyBlock.setRowCount(0);

  Expression[] exprs = new Expression[] {
      lessThan("id", 5), lessThanOrEqual("id", 30), equal("id", 70), greaterThan("id", 78),
      greaterThanOrEqual("id", 90), notEqual("id", 101), isNull("some_nulls"),
      notNull("some_nulls")
  };

  for (Expression expr : exprs) {
    boolean shouldRead = new ParquetMetricsRowGroupFilter(SCHEMA, expr)
        .shouldRead(PARQUET_SCHEMA, emptyBlock);
    Assert.assertFalse("Should never read 0-record file: " + expr, shouldRead);
  }
}
 
Example #7
Source Project: dremio-oss   Author: dremio   File: ParquetReaderUtility.java    License: Apache License 2.0
/**
 * Get the list of row group numbers for the given file input split. The logic used here is the same as how Hive's
 * parquet input format finds the row group numbers for an input split.
 */
public static List<Integer> getRowGroupNumbersFromFileSplit(final long splitStart, final long splitLength,
                                                             final ParquetMetadata footer) throws IOException {
  final List<BlockMetaData> blocks = footer.getBlocks();
  final List<Integer> rowGroupNums = Lists.newArrayList();

  int i = 0;
  for (final BlockMetaData block : blocks) {
    final long firstDataPage = block.getColumns().get(0).getFirstDataPageOffset();
    if (firstDataPage >= splitStart && firstDataPage < splitStart + splitLength) {
      rowGroupNums.add(i);
    }
    i++;
  }

  return rowGroupNums;
}
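A hypothetical call site for the helper above, assuming the footer has already been read for the split's file. The method name and parameters are illustrative, and the import for Dremio's ParquetReaderUtility is assumed since its package is not shown here.

static void printRowGroupsForSplit(long splitStart, long splitLength, ParquetMetadata footer) throws IOException {
  // keep only the row groups whose first data page offset falls inside this split
  List<Integer> rowGroupNums =
      ParquetReaderUtility.getRowGroupNumbersFromFileSplit(splitStart, splitLength, footer);
  for (int rowGroupIndex : rowGroupNums) {
    BlockMetaData block = footer.getBlocks().get(rowGroupIndex);
    System.out.printf("split [%d, %d) covers row group %d with %d rows%n",
        splitStart, splitStart + splitLength, rowGroupIndex, block.getRowCount());
  }
}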
 
Example #8
Source Project: tajo   Author: apache   File: InternalParquetRecordReader.java    License: Apache License 2.0
public void initialize(FileMetaData parquetFileMetadata,
                       Path file, List<BlockMetaData> blocks, Configuration configuration)
    throws IOException {
  // initialize a ReadContext for this file
  Map<String, String> fileMetadata = parquetFileMetadata.getKeyValueMetaData();
  ReadSupport.ReadContext readContext = readSupport.init(new InitContext(
      configuration, toSetMultiMap(fileMetadata), fileSchema));
  this.columnIOFactory = new ColumnIOFactory(parquetFileMetadata.getCreatedBy());
  this.requestedSchema = readContext.getRequestedSchema();
  this.fileSchema = parquetFileMetadata.getSchema();
  this.file = file;
  this.columnCount = requestedSchema.getPaths().size();
  this.recordConverter = readSupport.prepareForRead(
      configuration, fileMetadata, fileSchema, readContext);
  this.strictTypeChecking = configuration.getBoolean(STRICT_TYPE_CHECKING, true);
  List<ColumnDescriptor> columns = requestedSchema.getColumns();
  reader = new ParquetFileReader(configuration, parquetFileMetadata, file, blocks, columns);
  for (BlockMetaData block : blocks) {
    total += block.getRowCount();
  }
  this.unmaterializableRecordCounter = new UnmaterializableRecordCounter(configuration, total);
  LOG.info("RecordReader initialized will read a total of " + total + " records.");
}
 
Example #9
Source Project: tajo   Author: apache   File: ParquetReader.java    License: Apache License 2.0
private ParquetReader(Configuration conf,
                      Path file,
                      ReadSupport<T> readSupport,
                      Filter filter) throws IOException {
  this.readSupport = readSupport;
  this.filter = checkNotNull(filter, "filter");
  this.conf = conf;

  FileSystem fs = file.getFileSystem(conf);
  List<FileStatus> statuses = Arrays.asList(fs.listStatus(file, HiddenFileFilter.INSTANCE));
  List<Footer> footers = ParquetFileReader.readAllFootersInParallelUsingSummaryFiles(conf, statuses, false);
  this.footersIterator = footers.iterator();

  for (Footer footer : footers) {
    for(BlockMetaData block : footer.getParquetMetadata().getBlocks()) {
      totalRowCount += block.getRowCount();
    }
  }
}
 
Example #10
Source Project: parquet-mr   Author: apache   File: PrintFooter.java    License: Apache License 2.0
private static void add(ParquetMetadata footer) {
  for (BlockMetaData blockMetaData : footer.getBlocks()) {
    ++ blockCount;
    MessageType schema = footer.getFileMetaData().getSchema();
    recordCount += blockMetaData.getRowCount();
    List<ColumnChunkMetaData> columns = blockMetaData.getColumns();
    for (ColumnChunkMetaData columnMetaData : columns) {
      ColumnDescriptor desc = schema.getColumnDescription(columnMetaData.getPath().toArray());
      add(
          desc,
          columnMetaData.getValueCount(),
          columnMetaData.getTotalSize(),
          columnMetaData.getTotalUncompressedSize(),
          columnMetaData.getEncodings(),
          columnMetaData.getStatistics());
    }
  }
}
 
Example #11
Source Project: parquet-mr   Author: apache   File: ParquetFileReader.java    License: Apache License 2.0
/**
 * @param configuration the Hadoop conf
 * @param fileMetaData fileMetaData for parquet file
 * @param filePath Path for the parquet file
 * @param blocks the blocks to read
 * @param columns the columns to read (their path)
 * @throws IOException if the file cannot be opened
 * @deprecated will be removed in 2.0.0.
 */
@Deprecated
public ParquetFileReader(
    Configuration configuration, FileMetaData fileMetaData,
    Path filePath, List<BlockMetaData> blocks, List<ColumnDescriptor> columns) throws IOException {
  this.converter = new ParquetMetadataConverter(configuration);
  this.file = HadoopInputFile.fromPath(filePath, configuration);
  this.fileMetaData = fileMetaData;
  this.f = file.newStream();
  this.options = HadoopReadOptions.builder(configuration).build();
  this.blocks = filterRowGroups(blocks);
  this.blockIndexStores = listWithNulls(this.blocks.size());
  this.blockRowRanges = listWithNulls(this.blocks.size());
  for (ColumnDescriptor col : columns) {
    paths.put(ColumnPath.get(col.getPath()), col);
  }
  this.crc = options.usePageChecksumVerification() ? new CRC32() : null;
}
 
Example #12
Source Project: parquet-mr   Author: apache   File: ParquetFileWriter.java    License: Apache License 2.0
private static void serializeColumnIndexes(
    List<List<ColumnIndex>> columnIndexes,
    List<BlockMetaData> blocks,
    PositionOutputStream out) throws IOException {
  LOG.debug("{}: column indexes", out.getPos());
  for (int bIndex = 0, bSize = blocks.size(); bIndex < bSize; ++bIndex) {
    List<ColumnChunkMetaData> columns = blocks.get(bIndex).getColumns();
    List<ColumnIndex> blockColumnIndexes = columnIndexes.get(bIndex);
    for (int cIndex = 0, cSize = columns.size(); cIndex < cSize; ++cIndex) {
      ColumnChunkMetaData column = columns.get(cIndex);
      org.apache.parquet.format.ColumnIndex columnIndex = ParquetMetadataConverter
          .toParquetColumnIndex(column.getPrimitiveType(), blockColumnIndexes.get(cIndex));
      if (columnIndex == null) {
        continue;
      }
      long offset = out.getPos();
      Util.writeColumnIndex(columnIndex, out);
      column.setColumnIndexReference(new IndexReference(offset, (int) (out.getPos() - offset)));
    }
  }
}
 
Example #13
Source Project: parquet-mr   Author: apache   File: ParquetFileWriter.java    License: Apache License 2.0
private static void serializeBloomFilters(
  List<Map<String, BloomFilter>> bloomFilters,
  List<BlockMetaData> blocks,
  PositionOutputStream out) throws IOException {
  LOG.debug("{}: bloom filters", out.getPos());
  for (int bIndex = 0, bSize = blocks.size(); bIndex < bSize; ++bIndex) {
    List<ColumnChunkMetaData> columns = blocks.get(bIndex).getColumns();
    Map<String, BloomFilter> blockBloomFilters = bloomFilters.get(bIndex);
    if (blockBloomFilters.isEmpty()) continue;
    for (int cIndex = 0, cSize = columns.size(); cIndex < cSize; ++cIndex) {
      ColumnChunkMetaData column = columns.get(cIndex);
      BloomFilter bloomFilter = blockBloomFilters.get(column.getPath().toDotString());
      if (bloomFilter == null) {
        continue;
      }

      long offset = out.getPos();
      column.setBloomFilterOffset(offset);
      Util.writeBloomFilterHeader(ParquetMetadataConverter.toBloomFilterHeader(bloomFilter), out);
      bloomFilter.writeTo(out);
    }
  }
}
 
Example #14
Source Project: parquet-mr   Author: apache   File: ParquetFileWriter.java    License: Apache License 2.0
/**
 * Given a list of metadata files, merge them into a single ParquetMetadata.
 * Requires that the schemas be compatible, and the extraMetadata be exactly equal.
 * @param files a list of files to merge metadata from
 * @param conf a configuration
 * @return merged parquet metadata for the files
 * @throws IOException if there is an error while writing
 * @deprecated metadata files are not recommended and will be removed in 2.0.0
 */
@Deprecated
public static ParquetMetadata mergeMetadataFiles(List<Path> files,  Configuration conf) throws IOException {
  Preconditions.checkArgument(!files.isEmpty(), "Cannot merge an empty list of metadata");

  GlobalMetaData globalMetaData = null;
  List<BlockMetaData> blocks = new ArrayList<BlockMetaData>();

  for (Path p : files) {
    ParquetMetadata pmd = ParquetFileReader.readFooter(conf, p, ParquetMetadataConverter.NO_FILTER);
    FileMetaData fmd = pmd.getFileMetaData();
    globalMetaData = mergeInto(fmd, globalMetaData, true);
    blocks.addAll(pmd.getBlocks());
  }

  // collapse GlobalMetaData into a single FileMetaData, which will throw if they are not compatible
  return new ParquetMetadata(globalMetaData.merge(), blocks);
}
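A hedged usage sketch of the deprecated merge helper above (not taken from parquet-mr itself): the metadata-file paths are placeholders, and the usual imports for Configuration, Path, ParquetFileWriter, ParquetMetadata, and BlockMetaData are assumed.

static long mergedRowCount(Configuration conf) throws IOException {
  // placeholder metadata-file paths; their schemas must be compatible
  List<Path> metadataFiles = Arrays.asList(
      new Path("/tmp/table/part-a/_metadata"),
      new Path("/tmp/table/part-b/_metadata"));
  ParquetMetadata merged = ParquetFileWriter.mergeMetadataFiles(metadataFiles, conf);
  long totalRows = 0;
  for (BlockMetaData block : merged.getBlocks()) {
    totalRows += block.getRowCount();
  }
  return totalRows;
}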
 
Example #15
Source Project: parquet-mr   Author: apache   File: ParquetInputFormat.java    License: Apache License 2.0
/**
 * groups together all the data blocks for the same HDFS block
 *
 * @param rowGroupBlocks      data blocks (row groups)
 * @param hdfsBlocksArray     hdfs blocks
 * @param fileStatus          the containing file
 * @param requestedSchema     the schema requested by the user
 * @param readSupportMetadata the metadata provided by the readSupport implementation in init
 * @param minSplitSize        the mapred.min.split.size
 * @param maxSplitSize        the mapred.max.split.size
 * @return the splits (one per HDFS block)
 * @throws IOException If hosts can't be retrieved for the HDFS block
 */
static <T> List<ParquetInputSplit> generateSplits(
        List<BlockMetaData> rowGroupBlocks,
        BlockLocation[] hdfsBlocksArray,
        FileStatus fileStatus,
        String requestedSchema,
        Map<String, String> readSupportMetadata, long minSplitSize, long maxSplitSize) throws IOException {

  List<SplitInfo> splitRowGroups =
      generateSplitInfo(rowGroupBlocks, hdfsBlocksArray, minSplitSize, maxSplitSize);

  //generate splits from rowGroups of each split
  List<ParquetInputSplit> resultSplits = new ArrayList<ParquetInputSplit>();
  for (SplitInfo splitInfo : splitRowGroups) {
    ParquetInputSplit split = splitInfo.getParquetInputSplit(fileStatus, requestedSchema, readSupportMetadata);
    resultSplits.add(split);
  }
  return resultSplits;
}
 
Example #16
Source Project: parquet-mr   Author: apache   File: CompressionConverter.java    License: Apache License 2.0
public void processBlocks(TransParquetFileReader reader, ParquetFileWriter writer, ParquetMetadata meta, MessageType schema,
                           String createdBy, CompressionCodecName codecName) throws IOException {
  int blockIndex = 0;
  PageReadStore store = reader.readNextRowGroup();
  while (store != null) {
    writer.startBlock(store.getRowCount());
    BlockMetaData blockMetaData = meta.getBlocks().get(blockIndex);
    List<ColumnChunkMetaData> columnsInOrder = blockMetaData.getColumns();
    Map<ColumnPath, ColumnDescriptor> descriptorsMap = schema.getColumns().stream().collect(
      Collectors.toMap(x -> ColumnPath.get(x.getPath()), x -> x));
    for (int i = 0; i < columnsInOrder.size(); i += 1) {
      ColumnChunkMetaData chunk = columnsInOrder.get(i);
      ColumnReadStoreImpl crstore = new ColumnReadStoreImpl(store, new DummyGroupConverter(), schema, createdBy);
      ColumnDescriptor columnDescriptor = descriptorsMap.get(chunk.getPath());
      writer.startColumn(columnDescriptor, crstore.getColumnReader(columnDescriptor).getTotalValueCount(), codecName);
      processChunk(reader, writer, chunk, createdBy, codecName);
      writer.endColumn();
    }
    writer.endBlock();
    store = reader.readNextRowGroup();
    blockIndex++;
  }
}
 
Example #17
Source Project: parquet-mr   Author: apache   File: ParquetRecordReader.java    License: Apache License 2.0
private void checkDeltaByteArrayProblem(FileMetaData meta, Configuration conf, BlockMetaData block) {
  // splitting files?
  if (conf.getBoolean(ParquetInputFormat.SPLIT_FILES, true)) {
    // this is okay if not using DELTA_BYTE_ARRAY with the bug
    Set<Encoding> encodings = new HashSet<Encoding>();
    for (ColumnChunkMetaData column : block.getColumns()) {
      encodings.addAll(column.getEncodings());
    }
    for (Encoding encoding : encodings) {
      if (CorruptDeltaByteArrays.requiresSequentialReads(meta.getCreatedBy(), encoding)) {
        throw new ParquetDecodingException("Cannot read data due to " +
            "PARQUET-246: to read safely, set " + SPLIT_FILES + " to false");
      }
    }
  }
}
 
Example #18
Source Project: Bats   Author: lealone   File: ReadState.java    License: Apache License 2.0
/**
 * Create the readers needed to read columns: fixed-length or variable length.
 *
 * @param reader the Parquet record reader that drives the column readers
 * @param output the output mutator used to build the value vectors
 * @throws Exception if the readers or vectors cannot be created
 */
@SuppressWarnings("unchecked")
public void buildReader(ParquetRecordReader reader, OutputMutator output) throws Exception {
  final ArrayList<VarLengthColumn<? extends ValueVector>> varLengthColumns = new ArrayList<>();
  // initialize all of the column read status objects
  BlockMetaData rowGroupMetadata = schema.getRowGroupMetadata();
  Map<String, Integer> columnChunkMetadataPositionsInList = schema.buildChunkMap(rowGroupMetadata);
  for (ParquetColumnMetadata columnMetadata : schema.getColumnMetadata()) {
    ColumnDescriptor column = columnMetadata.column;
    columnMetadata.columnChunkMetaData = rowGroupMetadata.getColumns().get(
                    columnChunkMetadataPositionsInList.get(Arrays.toString(column.getPath())));
    columnMetadata.buildVector(output);
    if (!columnMetadata.isFixedLength()) {
      // create a reader and add it to the appropriate list
      varLengthColumns.add(columnMetadata.makeVariableWidthReader(reader));
    } else if (columnMetadata.isRepeated()) {
      varLengthColumns.add(columnMetadata.makeRepeatedFixedWidthReader(reader));
    } else {
      fixedLenColumnReaders.add(columnMetadata.makeFixedWidthReader(reader));
    }
  }
  varLengthReader = new VarLenBinaryReader(reader, varLengthColumns);
  if (!schema.isStarQuery()) {
    schema.createNonExistentColumns(output, nullFilledVectors);
  }
}
 
Example #19
Source Project: Bats   Author: lealone   File: ParquetSchema.java    License: Apache License 2.0
Map<String, Integer> buildChunkMap(BlockMetaData rowGroupMetadata) {
  // the column chunk meta-data is not guaranteed to be in the same order as the columns in the schema
  // a map is constructed for fast access to the correct columnChunkMetadata to correspond
  // to an element in the schema
  Map<String, Integer> columnChunkMetadataPositionsInList = new HashMap<>();

  int colChunkIndex = 0;
  for (ColumnChunkMetaData colChunk : rowGroupMetadata.getColumns()) {
    columnChunkMetadataPositionsInList.put(Arrays.toString(colChunk.getPath().toArray()), colChunkIndex);
    colChunkIndex++;
  }
  return columnChunkMetadataPositionsInList;
}
 
Example #20
Source Project: presto   Author: prestosql   File: PredicateUtils.java    License: Apache License 2.0
public static boolean predicateMatches(Predicate parquetPredicate, BlockMetaData block, ParquetDataSource dataSource, Map<List<String>, RichColumnDescriptor> descriptorsByPath, TupleDomain<ColumnDescriptor> parquetTupleDomain, boolean failOnCorruptedParquetStatistics)
        throws ParquetCorruptionException
{
    Map<ColumnDescriptor, Statistics<?>> columnStatistics = getStatistics(block, descriptorsByPath);
    if (!parquetPredicate.matches(block.getRowCount(), columnStatistics, dataSource.getId(), failOnCorruptedParquetStatistics)) {
        return false;
    }

    return dictionaryPredicatesMatch(parquetPredicate, block, dataSource, descriptorsByPath, parquetTupleDomain);
}
 
Example #21
Source Project: presto   Author: prestosql   File: PredicateUtils.java    License: Apache License 2.0
private static Map<ColumnDescriptor, Statistics<?>> getStatistics(BlockMetaData blockMetadata, Map<List<String>, RichColumnDescriptor> descriptorsByPath)
{
    ImmutableMap.Builder<ColumnDescriptor, Statistics<?>> statistics = ImmutableMap.builder();
    for (ColumnChunkMetaData columnMetaData : blockMetadata.getColumns()) {
        Statistics<?> columnStatistics = columnMetaData.getStatistics();
        if (columnStatistics != null) {
            RichColumnDescriptor descriptor = descriptorsByPath.get(Arrays.asList(columnMetaData.getPath().toArray()));
            if (descriptor != null) {
                statistics.put(descriptor, columnStatistics);
            }
        }
    }
    return statistics.build();
}
 
Example #22
Source Project: presto   Author: prestosql   File: ParquetReader.java    License: Apache License 2.0
public ParquetReader(
        Optional<String> fileCreatedBy,
        MessageColumnIO messageColumnIO,
        List<BlockMetaData> blocks,
        ParquetDataSource dataSource,
        AggregatedMemoryContext systemMemoryContext,
        ParquetReaderOptions options)
        throws IOException
{
    this.fileCreatedBy = requireNonNull(fileCreatedBy, "fileCreatedBy is null");
    this.columns = requireNonNull(messageColumnIO, "messageColumnIO is null").getLeaves();
    this.blocks = requireNonNull(blocks, "blocks is null");
    this.dataSource = requireNonNull(dataSource, "dataSource is null");
    this.systemMemoryContext = requireNonNull(systemMemoryContext, "systemMemoryContext is null");
    this.currentRowGroupMemoryContext = systemMemoryContext.newAggregatedMemoryContext();
    this.options = requireNonNull(options, "options is null");
    this.columnReaders = new PrimitiveColumnReader[columns.size()];
    this.maxBytesPerCell = new long[columns.size()];

    Map<ChunkKey, DiskRange> ranges = new HashMap<>();
    for (int rowGroup = 0; rowGroup < blocks.size(); rowGroup++) {
        BlockMetaData metadata = blocks.get(rowGroup);
        for (PrimitiveColumnIO column : columns) {
            int columnId = column.getId();
            ColumnChunkMetaData chunkMetadata = getColumnChunkMetaData(metadata, column.getColumnDescriptor());
            DiskRange range = new DiskRange(chunkMetadata.getStartingPos(), toIntExact(chunkMetadata.getTotalSize()));
            ranges.put(new ChunkKey(columnId, rowGroup), range);
        }
    }

    this.chunkReaders = dataSource.planRead(ranges);
}
 
Example #23
Source Project: presto   Author: prestosql   File: ParquetReader.java    License: Apache License 2.0
private ColumnChunkMetaData getColumnChunkMetaData(BlockMetaData blockMetaData, ColumnDescriptor columnDescriptor)
        throws IOException
{
    for (ColumnChunkMetaData metadata : blockMetaData.getColumns()) {
        if (metadata.getPath().equals(ColumnPath.get(columnDescriptor.getPath()))) {
            return metadata;
        }
    }
    throw new ParquetCorruptionException("Metadata is missing for column: %s", columnDescriptor);
}
 
Example #24
Source Project: flink   Author: flink-tpc-ds   File: ParquetRecordReader.java    License: Apache License 2.0
/**
 * Moves the reading position to the given block and seeks to and reads the given record.
 *
 * @param block The block to seek to.
 * @param recordInBlock The number of the record in the block to return next.
 */
public void seek(long block, long recordInBlock) throws IOException {

	List<BlockMetaData> blockMetaData = reader.getRowGroups();

	if (block == -1L && recordInBlock == -1L) {
		// the split was fully consumed
		currentBlock = blockMetaData.size() - 1;
		numReadRecords = numTotalRecords;
		numRecordsUpToCurrentBlock = numTotalRecords;
		return;
	}

	// init all counters for the start of the first block
	currentBlock = 0;
	numRecordsUpToPreviousBlock = 0;
	numRecordsUpToCurrentBlock = blockMetaData.get(0).getRowCount();
	numReadRecords = 0;

	// seek to the given block
	while (currentBlock < block) {
		currentBlock++;
		reader.skipNextRowGroup();
		numRecordsUpToPreviousBlock = numRecordsUpToCurrentBlock;
		numRecordsUpToCurrentBlock += blockMetaData.get(currentBlock).getRowCount();
		numReadRecords = numRecordsUpToPreviousBlock;
	}

	// seek to and read the given record
	PageReadStore pages = reader.readNextRowGroup();
	recordReader = createRecordReader(pages);
	for (int i = 0; i <= recordInBlock; i++) {
		readNextRecord();
	}
}
 
Example #25
Source Project: parquet-mr   Author: apache   File: ParquetMetadataCommand.java    License: Apache License 2.0
@Override
@SuppressWarnings("unchecked")
public int run() throws IOException {
  Preconditions.checkArgument(targets != null && targets.size() >= 1,
      "A Parquet file is required.");
  Preconditions.checkArgument(targets.size() == 1,
      "Cannot process multiple Parquet files.");

  String source = targets.get(0);
  ParquetMetadata footer = ParquetFileReader.readFooter(
      getConf(), qualifiedPath(source), ParquetMetadataConverter.NO_FILTER);

  console.info("\nFile path:  {}", source);
  console.info("Created by: {}", footer.getFileMetaData().getCreatedBy());

  Map<String, String> kv = footer.getFileMetaData().getKeyValueMetaData();
  if (kv != null && !kv.isEmpty()) {
    console.info("Properties:");
    String format = "  %" + maxSize(kv.keySet()) + "s: %s";
    for (Map.Entry<String, String> entry : kv.entrySet()) {
      console.info(String.format(format, entry.getKey(), entry.getValue()));
    }
  } else {
    console.info("Properties: (none)");
  }

  MessageType schema = footer.getFileMetaData().getSchema();
  console.info("Schema:\n{}", schema);

  List<BlockMetaData> rowGroups = footer.getBlocks();
  for (int index = 0, n = rowGroups.size(); index < n; index += 1) {
    printRowGroup(console, index, rowGroups.get(index), schema);
  }

  console.info("");

  return 0;
}
 
Example #26
Source Project: dremio-oss   Author: dremio   File: LocalDictionariesReader.java    License: Apache License 2.0
/**
 * Return dictionary per row group for all binary columns in given parquet file.
 * @param fs filesystem object.
 * @param filePath parquet file to scan
 * @return pair of dictionaries found for binary fields and list of binary fields which are not dictionary encoded.
 * @throws IOException
 */
public static Pair<Map<ColumnDescriptor, Dictionary>, Set<ColumnDescriptor>> readDictionaries(FileSystem fs, Path filePath, CompressionCodecFactory codecFactory) throws IOException {
  // Passing the max footer length is not required in this case as the parquet reader would already have failed.
  final ParquetMetadata parquetMetadata = SingletonParquetFooterCache.readFooter(fs, filePath, ParquetMetadataConverter.NO_FILTER,
    ExecConstants.PARQUET_MAX_FOOTER_LEN_VALIDATOR.getDefault().getNumVal());
  if (parquetMetadata.getBlocks().size() > 1) {
    throw new IOException(
      format("Global dictionaries can only be built on a parquet file with a single row group, found %d row groups for file %s",
        parquetMetadata.getBlocks().size(), filePath));
  }
  final BlockMetaData rowGroupMetadata = parquetMetadata.getBlocks().get(0);
  final Map<ColumnPath, ColumnDescriptor> columnDescriptorMap = Maps.newHashMap();

  for (ColumnDescriptor columnDescriptor : parquetMetadata.getFileMetaData().getSchema().getColumns()) {
    columnDescriptorMap.put(ColumnPath.get(columnDescriptor.getPath()), columnDescriptor);
  }

  final Set<ColumnDescriptor> columnsToSkip = Sets.newHashSet(); // columns which are found in parquet file but are not dictionary encoded
  final Map<ColumnDescriptor, Dictionary> dictionaries = Maps.newHashMap();
  try(final FSInputStream in = fs.open(filePath)) {
    for (ColumnChunkMetaData columnChunkMetaData : rowGroupMetadata.getColumns()) {
      if (isBinaryType(columnChunkMetaData.getType())) {
        final ColumnDescriptor column = columnDescriptorMap.get(columnChunkMetaData.getPath());
        // if first page is dictionary encoded then load dictionary, otherwise skip this column.
        final PageHeaderWithOffset pageHeader = columnChunkMetaData.getPageHeaders().get(0);
        if (PageType.DICTIONARY_PAGE == pageHeader.getPageHeader().getType()) {
          dictionaries.put(column, readDictionary(in, column, pageHeader, codecFactory.getDecompressor(columnChunkMetaData.getCodec())));
        } else {
          columnsToSkip.add(column);
        }
      }
    }
  }
  return new ImmutablePair<>(dictionaries, columnsToSkip);
}
 
Example #27
Source Project: dremio-oss   Author: dremio   File: UnifiedParquetReader.java    License: Apache License 2.0
private void computeLocality(ParquetMetadata footer) throws ExecutionSetupException {
  try {
    BlockMetaData block = footer.getBlocks().get(readEntry.getRowGroupIndex());

    Iterable<FileBlockLocation> blockLocations = fs.getFileBlockLocations(Path.of(readEntry.getPath()), block.getStartingPos(), block.getCompressedSize());

    String localHost = InetAddress.getLocalHost().getCanonicalHostName();

    List<Range<Long>> intersectingRanges = new ArrayList<>();

    Range<Long> rowGroupRange = Range.openClosed(block.getStartingPos(), block.getStartingPos() + block.getCompressedSize());

    for (FileBlockLocation loc : blockLocations) {
      for (String host : loc.getHosts()) {
        if (host.equals(localHost)) {
          intersectingRanges.add(Range.closedOpen(loc.getOffset(), loc.getOffset() + loc.getSize()).intersection(rowGroupRange));
        }
      }
    }

    long totalIntersect = 0;
    for (Range<Long> range : intersectingRanges) {
      totalIntersect += (range.upperEndpoint() - range.lowerEndpoint());
    }
    if (totalIntersect < block.getCompressedSize()) {
      context.getStats().addLongStat(Metric.NUM_REMOTE_READERS, 1);
    } else {
      context.getStats().addLongStat(Metric.NUM_REMOTE_READERS, 0);
    }
  } catch (IOException e) {
    throw new ExecutionSetupException(e);
  }
}
 
Example #28
Source Project: dremio-oss   Author: dremio   File: UnifiedParquetReader.java    License: Apache License 2.0
@Override
public List<RecordReader> getReaders(final UnifiedParquetReader unifiedReader) throws ExecutionSetupException {
  final ParquetMetadata footer = unifiedReader.getFooter();
  final List<BlockMetaData> blocks = footer.getBlocks();
  final int rowGroupIdx = unifiedReader.readEntry.getRowGroupIndex();
  if (blocks.size() <= rowGroupIdx) {
    throw new IllegalArgumentException(
        String.format("Invalid rowgroup index in read entry. Given '%d', Max '%d'", rowGroupIdx, blocks.size())
    );
  }

  final long rowCount = blocks.get(rowGroupIdx).getRowCount();

  final RecordReader reader = new AbstractRecordReader(unifiedReader.context, Collections.<SchemaPath>emptyList()) {
    private long remainingRowCount = rowCount;

    @Override
    public void setup(OutputMutator output) throws ExecutionSetupException {

    }

    @Override
    public int next() {
      if (numRowsPerBatch > remainingRowCount) {
        int toReturn = (int) remainingRowCount;
        remainingRowCount = 0;
        return toReturn;
      }

      remainingRowCount -= numRowsPerBatch;
      return (int)numRowsPerBatch;
    }

    @Override
    public void close() throws Exception {

    }
  };
  return Collections.singletonList(reader);
}
 
Example #29
Source Project: flink   Author: apache   File: ParquetRecordReader.java    License: Apache License 2.0
/**
 * Moves the reading position to the given block and seeks to and reads the given record.
 *
 * @param block The block to seek to.
 * @param recordInBlock The number of the record in the block to return next.
 */
public void seek(long block, long recordInBlock) throws IOException {

	List<BlockMetaData> blockMetaData = reader.getRowGroups();

	if (block == -1L && recordInBlock == -1L) {
		// the split was fully consumed
		currentBlock = blockMetaData.size() - 1;
		numReadRecords = numTotalRecords;
		numRecordsUpToCurrentBlock = numTotalRecords;
		return;
	}

	// init all counters for the start of the first block
	currentBlock = 0;
	numRecordsUpToPreviousBlock = 0;
	numRecordsUpToCurrentBlock = blockMetaData.get(0).getRowCount();
	numReadRecords = 0;

	// seek to the given block
	while (currentBlock < block) {
		currentBlock++;
		reader.skipNextRowGroup();
		numRecordsUpToPreviousBlock = numRecordsUpToCurrentBlock;
		numRecordsUpToCurrentBlock += blockMetaData.get(currentBlock).getRowCount();
		numReadRecords = numRecordsUpToPreviousBlock;
	}

	// seek to and read the given record
	PageReadStore pages = reader.readNextRowGroup();
	recordReader = createRecordReader(pages);
	for (int i = 0; i <= recordInBlock; i++) {
		readNextRecord();
	}
}
 
Example #30
Source Project: flink   Author: apache   File: ParquetColumnarRowSplitReader.java    License: Apache License 2.0
public ParquetColumnarRowSplitReader(
		boolean utcTimestamp,
		boolean caseSensitive,
		Configuration conf,
		LogicalType[] selectedTypes,
		String[] selectedFieldNames,
		ColumnBatchGenerator generator,
		int batchSize,
		Path path,
		long splitStart,
		long splitLength) throws IOException {
	this.utcTimestamp = utcTimestamp;
	this.selectedTypes = selectedTypes;
	this.batchSize = batchSize;
	// then we need to apply the predicate push down filter
	ParquetMetadata footer = readFooter(conf, path, range(splitStart, splitStart + splitLength));
	MessageType fileSchema = footer.getFileMetaData().getSchema();
	FilterCompat.Filter filter = getFilter(conf);
	List<BlockMetaData> blocks = filterRowGroups(filter, footer.getBlocks(), fileSchema);

	this.fileSchema = footer.getFileMetaData().getSchema();
	this.requestedSchema = clipParquetSchema(fileSchema, selectedFieldNames, caseSensitive);
	this.reader = new ParquetFileReader(
			conf, footer.getFileMetaData(), path, blocks, requestedSchema.getColumns());

	long totalRowCount = 0;
	for (BlockMetaData block : blocks) {
		totalRowCount += block.getRowCount();
	}
	this.totalRowCount = totalRowCount;
	this.nextRow = 0;
	this.rowsInBatch = 0;
	this.rowsReturned = 0;

	checkSchema();

	this.writableVectors = createWritableVectors();
	this.columnarBatch = generator.generate(createReadableVectors());
	this.row = new ColumnarRowData(columnarBatch);
}