Java Code Examples for org.apache.parquet.hadoop.metadata.BlockMetaData
The following examples show how to use
org.apache.parquet.hadoop.metadata.BlockMetaData.
These examples are extracted from open source projects.
Each example notes its source project, author, file, and license.
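Before the project-specific examples, here is a minimal orientation sketch of the basic access pattern: read a file footer into a ParquetMetadata, then walk its row groups as BlockMetaData objects and their ColumnChunkMetaData children. This is an illustrative assumption-laden example, not taken from any of the projects below: the file path is a placeholder, and the deprecated readFooter overload is used only because several examples on this page rely on it.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.format.converter.ParquetMetadataConverter;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.metadata.BlockMetaData;
import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;

public class BlockMetaDataTour {
  public static void main(String[] args) throws Exception {
    // Placeholder path; point this at any existing Parquet file.
    Path file = new Path("/tmp/example.parquet");
    ParquetMetadata footer = ParquetFileReader.readFooter(
        new Configuration(), file, ParquetMetadataConverter.NO_FILTER);

    // Each BlockMetaData describes one row group of the file.
    for (BlockMetaData block : footer.getBlocks()) {
      System.out.println("rows=" + block.getRowCount()
          + " startingPos=" + block.getStartingPos()
          + " compressedSize=" + block.getCompressedSize()
          + " totalByteSize=" + block.getTotalByteSize());
      // Each column chunk carries its own path, value count and codec.
      for (ColumnChunkMetaData column : block.getColumns()) {
        System.out.println("  " + column.getPath().toDotString()
            + " values=" + column.getValueCount()
            + " codec=" + column.getCodec());
      }
    }
  }
}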
Example #1
Source Project: presto Author: prestosql File: PredicateUtils.java License: Apache License 2.0
private static boolean dictionaryPredicatesMatch(
    Predicate parquetPredicate,
    BlockMetaData blockMetadata,
    ParquetDataSource dataSource,
    Map<List<String>, RichColumnDescriptor> descriptorsByPath,
    TupleDomain<ColumnDescriptor> parquetTupleDomain)
{
  for (ColumnChunkMetaData columnMetaData : blockMetadata.getColumns()) {
    RichColumnDescriptor descriptor = descriptorsByPath.get(Arrays.asList(columnMetaData.getPath().toArray()));
    if (descriptor != null) {
      if (isOnlyDictionaryEncodingPages(columnMetaData) && isColumnPredicate(descriptor, parquetTupleDomain)) {
        byte[] buffer = new byte[toIntExact(columnMetaData.getTotalSize())];
        dataSource.readFully(columnMetaData.getStartingPos(), buffer);
        // Early abort, predicate already filters block so no more dictionaries need be read
        if (!parquetPredicate.matches(new DictionaryDescriptor(descriptor, readDictionaryPage(buffer, columnMetaData.getCodec())))) {
          return false;
        }
      }
    }
  }
  return true;
}
Example #2
Source Project: iceberg Author: apache File: TestMetricsRowGroupFilter.java License: Apache License 2.0
@Test
public void testZeroRecordFileParquet() {
  Assume.assumeTrue(format == FileFormat.PARQUET);
  BlockMetaData emptyBlock = new BlockMetaData();
  emptyBlock.setRowCount(0);

  Expression[] exprs = new Expression[] {
      lessThan("id", 5), lessThanOrEqual("id", 30), equal("id", 70), greaterThan("id", 78),
      greaterThanOrEqual("id", 90), notEqual("id", 101), isNull("some_nulls"),
      notNull("some_nulls")
  };

  for (Expression expr : exprs) {
    boolean shouldRead = shouldReadParquet(expr, true, parquetSchema, emptyBlock);
    Assert.assertFalse("Should never read 0-record file: " + expr, shouldRead);
  }
}
Example #3
Source Project: iceberg Author: apache File: ReadConf.java License: Apache License 2.0
private List<Map<ColumnPath, ColumnChunkMetaData>> getColumnChunkMetadataForRowGroups() {
  Set<ColumnPath> projectedColumns = projection.getColumns().stream()
      .map(columnDescriptor -> ColumnPath.get(columnDescriptor.getPath())).collect(Collectors.toSet());
  ImmutableList.Builder<Map<ColumnPath, ColumnChunkMetaData>> listBuilder = ImmutableList.builder();
  for (int i = 0; i < rowGroups.size(); i++) {
    if (!shouldSkip[i]) {
      BlockMetaData blockMetaData = rowGroups.get(i);
      ImmutableMap.Builder<ColumnPath, ColumnChunkMetaData> mapBuilder = ImmutableMap.builder();
      blockMetaData.getColumns().stream()
          .filter(columnChunkMetaData -> projectedColumns.contains(columnChunkMetaData.getPath()))
          .forEach(columnChunkMetaData -> mapBuilder.put(columnChunkMetaData.getPath(), columnChunkMetaData));
      listBuilder.add(mapBuilder.build());
    } else {
      listBuilder.add(ImmutableMap.of());
    }
  }
  return listBuilder.build();
}
Example #4
Source Project: parquet-mr Author: apache File: ParquetMetadataConverter.java License: Apache License 2.0
public FileMetaData toParquetMetadata(int currentVersion, ParquetMetadata parquetMetadata) {
  List<BlockMetaData> blocks = parquetMetadata.getBlocks();
  List<RowGroup> rowGroups = new ArrayList<RowGroup>();
  long numRows = 0;
  for (BlockMetaData block : blocks) {
    numRows += block.getRowCount();
    addRowGroup(parquetMetadata, rowGroups, block);
  }
  FileMetaData fileMetaData = new FileMetaData(
      currentVersion,
      toParquetSchema(parquetMetadata.getFileMetaData().getSchema()),
      numRows,
      rowGroups);

  Set<Entry<String, String>> keyValues = parquetMetadata.getFileMetaData().getKeyValueMetaData().entrySet();
  for (Entry<String, String> keyValue : keyValues) {
    addKeyValue(fileMetaData, keyValue.getKey(), keyValue.getValue());
  }

  fileMetaData.setCreated_by(parquetMetadata.getFileMetaData().getCreatedBy());
  fileMetaData.setColumn_orders(getColumnOrders(parquetMetadata.getFileMetaData().getSchema()));

  return fileMetaData;
}
Example #5
Source Project: iceberg Author: Netflix File: ParquetMetricsRowGroupFilter.java License: Apache License 2.0
private boolean eval(MessageType fileSchema, BlockMetaData rowGroup) {
  if (rowGroup.getRowCount() <= 0) {
    return ROWS_CANNOT_MATCH;
  }

  this.stats = Maps.newHashMap();
  this.valueCounts = Maps.newHashMap();
  this.conversions = Maps.newHashMap();
  for (ColumnChunkMetaData col : rowGroup.getColumns()) {
    PrimitiveType colType = fileSchema.getType(col.getPath().toArray()).asPrimitiveType();
    if (colType.getId() != null) {
      int id = colType.getId().intValue();
      stats.put(id, col.getStatistics());
      valueCounts.put(id, col.getValueCount());
      conversions.put(id, converterFromParquet(colType));
    }
  }

  return ExpressionVisitors.visit(expr, this);
}
Example #6
Source Project: iceberg Author: Netflix File: TestMetricsRowGroupFilter.java License: Apache License 2.0
@Test
public void testZeroRecordFile() {
  BlockMetaData emptyBlock = new BlockMetaData();
  emptyBlock.setRowCount(0);

  Expression[] exprs = new Expression[] {
      lessThan("id", 5), lessThanOrEqual("id", 30), equal("id", 70), greaterThan("id", 78),
      greaterThanOrEqual("id", 90), notEqual("id", 101), isNull("some_nulls"),
      notNull("some_nulls")
  };

  for (Expression expr : exprs) {
    boolean shouldRead = new ParquetMetricsRowGroupFilter(SCHEMA, expr)
        .shouldRead(PARQUET_SCHEMA, emptyBlock);
    Assert.assertFalse("Should never read 0-record file: " + expr, shouldRead);
  }
}
Example #7
Source Project: dremio-oss Author: dremio File: ParquetReaderUtility.java License: Apache License 2.0
/**
 * Get the list of row group numbers for given file input split. Logic used here is same as how Hive's parquet input
 * format finds the row group numbers for input split.
 */
public static List<Integer> getRowGroupNumbersFromFileSplit(final long splitStart, final long splitLength,
    final ParquetMetadata footer) throws IOException {
  final List<BlockMetaData> blocks = footer.getBlocks();

  final List<Integer> rowGroupNums = Lists.newArrayList();
  int i = 0;
  for (final BlockMetaData block : blocks) {
    final long firstDataPage = block.getColumns().get(0).getFirstDataPageOffset();
    if (firstDataPage >= splitStart && firstDataPage < splitStart + splitLength) {
      rowGroupNums.add(i);
    }
    i++;
  }

  return rowGroupNums;
}
Example #8
Source Project: tajo Author: apache File: InternalParquetRecordReader.java License: Apache License 2.0
public void initialize(FileMetaData parquetFileMetadata, Path file, List<BlockMetaData> blocks,
    Configuration configuration) throws IOException {
  // initialize a ReadContext for this file
  Map<String, String> fileMetadata = parquetFileMetadata.getKeyValueMetaData();
  ReadSupport.ReadContext readContext = readSupport.init(new InitContext(
      configuration, toSetMultiMap(fileMetadata), fileSchema));
  this.columnIOFactory = new ColumnIOFactory(parquetFileMetadata.getCreatedBy());
  this.requestedSchema = readContext.getRequestedSchema();
  this.fileSchema = parquetFileMetadata.getSchema();
  this.file = file;
  this.columnCount = requestedSchema.getPaths().size();
  this.recordConverter = readSupport.prepareForRead(
      configuration, fileMetadata, fileSchema, readContext);
  this.strictTypeChecking = configuration.getBoolean(STRICT_TYPE_CHECKING, true);
  List<ColumnDescriptor> columns = requestedSchema.getColumns();
  reader = new ParquetFileReader(configuration, parquetFileMetadata, file, blocks, columns);
  for (BlockMetaData block : blocks) {
    total += block.getRowCount();
  }
  this.unmaterializableRecordCounter = new UnmaterializableRecordCounter(configuration, total);
  LOG.info("RecordReader initialized will read a total of " + total + " records.");
}
Example #9
Source Project: tajo Author: apache File: ParquetReader.java License: Apache License 2.0
private ParquetReader(Configuration conf, Path file, ReadSupport<T> readSupport, Filter filter) throws IOException {
  this.readSupport = readSupport;
  this.filter = checkNotNull(filter, "filter");
  this.conf = conf;

  FileSystem fs = file.getFileSystem(conf);
  List<FileStatus> statuses = Arrays.asList(fs.listStatus(file, HiddenFileFilter.INSTANCE));
  List<Footer> footers = ParquetFileReader.readAllFootersInParallelUsingSummaryFiles(conf, statuses, false);
  this.footersIterator = footers.iterator();

  for (Footer footer : footers) {
    for (BlockMetaData block : footer.getParquetMetadata().getBlocks()) {
      totalRowCount += block.getRowCount();
    }
  }
}
Example #10
Source Project: parquet-mr Author: apache File: PrintFooter.java License: Apache License 2.0
private static void add(ParquetMetadata footer) {
  for (BlockMetaData blockMetaData : footer.getBlocks()) {
    ++ blockCount;
    MessageType schema = footer.getFileMetaData().getSchema();
    recordCount += blockMetaData.getRowCount();
    List<ColumnChunkMetaData> columns = blockMetaData.getColumns();
    for (ColumnChunkMetaData columnMetaData : columns) {
      ColumnDescriptor desc = schema.getColumnDescription(columnMetaData.getPath().toArray());
      add(
          desc,
          columnMetaData.getValueCount(),
          columnMetaData.getTotalSize(),
          columnMetaData.getTotalUncompressedSize(),
          columnMetaData.getEncodings(),
          columnMetaData.getStatistics());
    }
  }
}
Example #11
Source Project: parquet-mr Author: apache File: ParquetFileReader.java License: Apache License 2.0
/**
 * @param configuration the Hadoop conf
 * @param fileMetaData fileMetaData for parquet file
 * @param filePath Path for the parquet file
 * @param blocks the blocks to read
 * @param columns the columns to read (their path)
 * @throws IOException if the file can not be opened
 * @deprecated will be removed in 2.0.0.
 */
@Deprecated
public ParquetFileReader(
    Configuration configuration, FileMetaData fileMetaData,
    Path filePath, List<BlockMetaData> blocks, List<ColumnDescriptor> columns) throws IOException {
  this.converter = new ParquetMetadataConverter(configuration);
  this.file = HadoopInputFile.fromPath(filePath, configuration);
  this.fileMetaData = fileMetaData;
  this.f = file.newStream();
  this.options = HadoopReadOptions.builder(configuration).build();
  this.blocks = filterRowGroups(blocks);
  this.blockIndexStores = listWithNulls(this.blocks.size());
  this.blockRowRanges = listWithNulls(this.blocks.size());
  for (ColumnDescriptor col : columns) {
    paths.put(ColumnPath.get(col.getPath()), col);
  }
  this.crc = options.usePageChecksumVerification() ? new CRC32() : null;
}
Example #12
Source Project: parquet-mr Author: apache File: ParquetFileWriter.java License: Apache License 2.0
private static void serializeColumnIndexes(
    List<List<ColumnIndex>> columnIndexes,
    List<BlockMetaData> blocks,
    PositionOutputStream out) throws IOException {
  LOG.debug("{}: column indexes", out.getPos());
  for (int bIndex = 0, bSize = blocks.size(); bIndex < bSize; ++bIndex) {
    List<ColumnChunkMetaData> columns = blocks.get(bIndex).getColumns();
    List<ColumnIndex> blockColumnIndexes = columnIndexes.get(bIndex);
    for (int cIndex = 0, cSize = columns.size(); cIndex < cSize; ++cIndex) {
      ColumnChunkMetaData column = columns.get(cIndex);
      org.apache.parquet.format.ColumnIndex columnIndex = ParquetMetadataConverter
          .toParquetColumnIndex(column.getPrimitiveType(), blockColumnIndexes.get(cIndex));
      if (columnIndex == null) {
        continue;
      }
      long offset = out.getPos();
      Util.writeColumnIndex(columnIndex, out);
      column.setColumnIndexReference(new IndexReference(offset, (int) (out.getPos() - offset)));
    }
  }
}
Example #13
Source Project: parquet-mr Author: apache File: ParquetFileWriter.java License: Apache License 2.0
private static void serializeBloomFilters(
    List<Map<String, BloomFilter>> bloomFilters,
    List<BlockMetaData> blocks,
    PositionOutputStream out) throws IOException {
  LOG.debug("{}: bloom filters", out.getPos());
  for (int bIndex = 0, bSize = blocks.size(); bIndex < bSize; ++bIndex) {
    List<ColumnChunkMetaData> columns = blocks.get(bIndex).getColumns();
    Map<String, BloomFilter> blockBloomFilters = bloomFilters.get(bIndex);
    if (blockBloomFilters.isEmpty()) continue;
    for (int cIndex = 0, cSize = columns.size(); cIndex < cSize; ++cIndex) {
      ColumnChunkMetaData column = columns.get(cIndex);
      BloomFilter bloomFilter = blockBloomFilters.get(column.getPath().toDotString());
      if (bloomFilter == null) {
        continue;
      }
      long offset = out.getPos();
      column.setBloomFilterOffset(offset);
      Util.writeBloomFilterHeader(ParquetMetadataConverter.toBloomFilterHeader(bloomFilter), out);
      bloomFilter.writeTo(out);
    }
  }
}
Example #14
Source Project: parquet-mr Author: apache File: ParquetFileWriter.java License: Apache License 2.0
/**
 * Given a list of metadata files, merge them into a single ParquetMetadata
 * Requires that the schemas be compatible, and the extraMetadata be exactly equal.
 * @param files a list of files to merge metadata from
 * @param conf a configuration
 * @return merged parquet metadata for the files
 * @throws IOException if there is an error while writing
 * @deprecated metadata files are not recommended and will be removed in 2.0.0
 */
@Deprecated
public static ParquetMetadata mergeMetadataFiles(List<Path> files, Configuration conf) throws IOException {
  Preconditions.checkArgument(!files.isEmpty(), "Cannot merge an empty list of metadata");

  GlobalMetaData globalMetaData = null;
  List<BlockMetaData> blocks = new ArrayList<BlockMetaData>();

  for (Path p : files) {
    ParquetMetadata pmd = ParquetFileReader.readFooter(conf, p, ParquetMetadataConverter.NO_FILTER);
    FileMetaData fmd = pmd.getFileMetaData();
    globalMetaData = mergeInto(fmd, globalMetaData, true);
    blocks.addAll(pmd.getBlocks());
  }

  // collapse GlobalMetaData into a single FileMetaData, which will throw if they are not compatible
  return new ParquetMetadata(globalMetaData.merge(), blocks);
}
Example #15
Source Project: parquet-mr Author: apache File: ParquetInputFormat.java License: Apache License 2.0
/**
 * groups together all the data blocks for the same HDFS block
 *
 * @param rowGroupBlocks data blocks (row groups)
 * @param hdfsBlocksArray hdfs blocks
 * @param fileStatus the containing file
 * @param requestedSchema the schema requested by the user
 * @param readSupportMetadata the metadata provided by the readSupport implementation in init
 * @param minSplitSize the mapred.min.split.size
 * @param maxSplitSize the mapred.max.split.size
 * @return the splits (one per HDFS block)
 * @throws IOException If hosts can't be retrieved for the HDFS block
 */
static <T> List<ParquetInputSplit> generateSplits(
    List<BlockMetaData> rowGroupBlocks,
    BlockLocation[] hdfsBlocksArray,
    FileStatus fileStatus,
    String requestedSchema,
    Map<String, String> readSupportMetadata, long minSplitSize, long maxSplitSize) throws IOException {

  List<SplitInfo> splitRowGroups =
      generateSplitInfo(rowGroupBlocks, hdfsBlocksArray, minSplitSize, maxSplitSize);

  //generate splits from rowGroups of each split
  List<ParquetInputSplit> resultSplits = new ArrayList<ParquetInputSplit>();
  for (SplitInfo splitInfo : splitRowGroups) {
    ParquetInputSplit split = splitInfo.getParquetInputSplit(fileStatus, requestedSchema, readSupportMetadata);
    resultSplits.add(split);
  }
  return resultSplits;
}
Example #16
Source Project: parquet-mr Author: apache File: CompressionConverter.java License: Apache License 2.0
public void processBlocks(TransParquetFileReader reader, ParquetFileWriter writer, ParquetMetadata meta,
    MessageType schema, String createdBy, CompressionCodecName codecName) throws IOException {
  int blockIndex = 0;
  PageReadStore store = reader.readNextRowGroup();
  while (store != null) {
    writer.startBlock(store.getRowCount());
    BlockMetaData blockMetaData = meta.getBlocks().get(blockIndex);
    List<ColumnChunkMetaData> columnsInOrder = blockMetaData.getColumns();
    Map<ColumnPath, ColumnDescriptor> descriptorsMap = schema.getColumns().stream().collect(
        Collectors.toMap(x -> ColumnPath.get(x.getPath()), x -> x));
    for (int i = 0; i < columnsInOrder.size(); i += 1) {
      ColumnChunkMetaData chunk = columnsInOrder.get(i);
      ColumnReadStoreImpl crstore = new ColumnReadStoreImpl(store, new DummyGroupConverter(), schema, createdBy);
      ColumnDescriptor columnDescriptor = descriptorsMap.get(chunk.getPath());
      writer.startColumn(columnDescriptor, crstore.getColumnReader(columnDescriptor).getTotalValueCount(), codecName);
      processChunk(reader, writer, chunk, createdBy, codecName);
      writer.endColumn();
    }
    writer.endBlock();
    store = reader.readNextRowGroup();
    blockIndex++;
  }
}
Example #17
Source Project: parquet-mr Author: apache File: ParquetRecordReader.java License: Apache License 2.0
private void checkDeltaByteArrayProblem(FileMetaData meta, Configuration conf, BlockMetaData block) {
  // splitting files?
  if (conf.getBoolean(ParquetInputFormat.SPLIT_FILES, true)) {
    // this is okay if not using DELTA_BYTE_ARRAY with the bug
    Set<Encoding> encodings = new HashSet<Encoding>();
    for (ColumnChunkMetaData column : block.getColumns()) {
      encodings.addAll(column.getEncodings());
    }
    for (Encoding encoding : encodings) {
      if (CorruptDeltaByteArrays.requiresSequentialReads(meta.getCreatedBy(), encoding)) {
        throw new ParquetDecodingException("Cannot read data due to " +
            "PARQUET-246: to read safely, set " + SPLIT_FILES + " to false");
      }
    }
  }
}
Example #18
Source Project: Bats Author: lealone File: ReadState.java License: Apache License 2.0
/**
 * Create the readers needed to read columns: fixed-length or variable length.
 *
 * @param reader
 * @param output
 * @throws Exception
 */
@SuppressWarnings("unchecked")
public void buildReader(ParquetRecordReader reader, OutputMutator output) throws Exception {
  final ArrayList<VarLengthColumn<? extends ValueVector>> varLengthColumns = new ArrayList<>();
  // initialize all of the column read status objects
  BlockMetaData rowGroupMetadata = schema.getRowGroupMetadata();
  Map<String, Integer> columnChunkMetadataPositionsInList = schema.buildChunkMap(rowGroupMetadata);
  for (ParquetColumnMetadata columnMetadata : schema.getColumnMetadata()) {
    ColumnDescriptor column = columnMetadata.column;
    columnMetadata.columnChunkMetaData = rowGroupMetadata.getColumns().get(
        columnChunkMetadataPositionsInList.get(Arrays.toString(column.getPath())));
    columnMetadata.buildVector(output);
    if (!columnMetadata.isFixedLength()) {
      // create a reader and add it to the appropriate list
      varLengthColumns.add(columnMetadata.makeVariableWidthReader(reader));
    } else if (columnMetadata.isRepeated()) {
      varLengthColumns.add(columnMetadata.makeRepeatedFixedWidthReader(reader));
    } else {
      fixedLenColumnReaders.add(columnMetadata.makeFixedWidthReader(reader));
    }
  }
  varLengthReader = new VarLenBinaryReader(reader, varLengthColumns);
  if (!schema.isStarQuery()) {
    schema.createNonExistentColumns(output, nullFilledVectors);
  }
}
Example #19
Source Project: Bats Author: lealone File: ParquetSchema.java License: Apache License 2.0
Map<String, Integer> buildChunkMap(BlockMetaData rowGroupMetadata) {
  // the column chunk meta-data is not guaranteed to be in the same order as the columns in the schema
  // a map is constructed for fast access to the correct columnChunkMetadata to correspond
  // to an element in the schema
  Map<String, Integer> columnChunkMetadataPositionsInList = new HashMap<>();
  int colChunkIndex = 0;
  for (ColumnChunkMetaData colChunk : rowGroupMetadata.getColumns()) {
    columnChunkMetadataPositionsInList.put(Arrays.toString(colChunk.getPath().toArray()), colChunkIndex);
    colChunkIndex++;
  }
  return columnChunkMetadataPositionsInList;
}
Example #20
Source Project: presto Author: prestosql File: PredicateUtils.java License: Apache License 2.0
public static boolean predicateMatches(
    Predicate parquetPredicate,
    BlockMetaData block,
    ParquetDataSource dataSource,
    Map<List<String>, RichColumnDescriptor> descriptorsByPath,
    TupleDomain<ColumnDescriptor> parquetTupleDomain,
    boolean failOnCorruptedParquetStatistics)
    throws ParquetCorruptionException
{
  Map<ColumnDescriptor, Statistics<?>> columnStatistics = getStatistics(block, descriptorsByPath);
  if (!parquetPredicate.matches(block.getRowCount(), columnStatistics, dataSource.getId(), failOnCorruptedParquetStatistics)) {
    return false;
  }
  return dictionaryPredicatesMatch(parquetPredicate, block, dataSource, descriptorsByPath, parquetTupleDomain);
}
Example #21
Source Project: presto Author: prestosql File: PredicateUtils.java License: Apache License 2.0
private static Map<ColumnDescriptor, Statistics<?>> getStatistics(BlockMetaData blockMetadata, Map<List<String>, RichColumnDescriptor> descriptorsByPath)
{
  ImmutableMap.Builder<ColumnDescriptor, Statistics<?>> statistics = ImmutableMap.builder();
  for (ColumnChunkMetaData columnMetaData : blockMetadata.getColumns()) {
    Statistics<?> columnStatistics = columnMetaData.getStatistics();
    if (columnStatistics != null) {
      RichColumnDescriptor descriptor = descriptorsByPath.get(Arrays.asList(columnMetaData.getPath().toArray()));
      if (descriptor != null) {
        statistics.put(descriptor, columnStatistics);
      }
    }
  }
  return statistics.build();
}
Example #22
Source Project: presto Author: prestosql File: ParquetReader.java License: Apache License 2.0
public ParquetReader(
    Optional<String> fileCreatedBy,
    MessageColumnIO messageColumnIO,
    List<BlockMetaData> blocks,
    ParquetDataSource dataSource,
    AggregatedMemoryContext systemMemoryContext,
    ParquetReaderOptions options)
    throws IOException
{
  this.fileCreatedBy = requireNonNull(fileCreatedBy, "fileCreatedBy is null");
  this.columns = requireNonNull(messageColumnIO, "messageColumnIO is null").getLeaves();
  this.blocks = requireNonNull(blocks, "blocks is null");
  this.dataSource = requireNonNull(dataSource, "dataSource is null");
  this.systemMemoryContext = requireNonNull(systemMemoryContext, "systemMemoryContext is null");
  this.currentRowGroupMemoryContext = systemMemoryContext.newAggregatedMemoryContext();
  this.options = requireNonNull(options, "options is null");
  this.columnReaders = new PrimitiveColumnReader[columns.size()];
  this.maxBytesPerCell = new long[columns.size()];

  Map<ChunkKey, DiskRange> ranges = new HashMap<>();
  for (int rowGroup = 0; rowGroup < blocks.size(); rowGroup++) {
    BlockMetaData metadata = blocks.get(rowGroup);
    for (PrimitiveColumnIO column : columns) {
      int columnId = column.getId();
      ColumnChunkMetaData chunkMetadata = getColumnChunkMetaData(metadata, column.getColumnDescriptor());
      DiskRange range = new DiskRange(chunkMetadata.getStartingPos(), toIntExact(chunkMetadata.getTotalSize()));
      ranges.put(new ChunkKey(columnId, rowGroup), range);
    }
  }

  this.chunkReaders = dataSource.planRead(ranges);
}
Example #23
Source Project: presto Author: prestosql File: ParquetReader.java License: Apache License 2.0
private ColumnChunkMetaData getColumnChunkMetaData(BlockMetaData blockMetaData, ColumnDescriptor columnDescriptor)
    throws IOException
{
  for (ColumnChunkMetaData metadata : blockMetaData.getColumns()) {
    if (metadata.getPath().equals(ColumnPath.get(columnDescriptor.getPath()))) {
      return metadata;
    }
  }
  throw new ParquetCorruptionException("Metadata is missing for column: %s", columnDescriptor);
}
Example #24
Source Project: flink Author: flink-tpc-ds File: ParquetRecordReader.java License: Apache License 2.0
/**
 * Moves the reading position to the given block and seeks to and reads the given record.
 *
 * @param block The block to seek to.
 * @param recordInBlock The number of the record in the block to return next.
 */
public void seek(long block, long recordInBlock) throws IOException {
  List<BlockMetaData> blockMetaData = reader.getRowGroups();

  if (block == -1L && recordInBlock == -1L) {
    // the split was fully consumed
    currentBlock = blockMetaData.size() - 1;
    numReadRecords = numTotalRecords;
    numRecordsUpToCurrentBlock = numTotalRecords;
    return;
  }

  // init all counters for the start of the first block
  currentBlock = 0;
  numRecordsUpToPreviousBlock = 0;
  numRecordsUpToCurrentBlock = blockMetaData.get(0).getRowCount();
  numReadRecords = 0;

  // seek to the given block
  while (currentBlock < block) {
    currentBlock++;
    reader.skipNextRowGroup();
    numRecordsUpToPreviousBlock = numRecordsUpToCurrentBlock;
    numRecordsUpToCurrentBlock += blockMetaData.get(currentBlock).getRowCount();
    numReadRecords = numRecordsUpToPreviousBlock;
  }

  // seek to and read the given record
  PageReadStore pages = reader.readNextRowGroup();
  recordReader = createRecordReader(pages);
  for (int i = 0; i <= recordInBlock; i++) {
    readNextRecord();
  }
}
Example #25
Source Project: parquet-mr Author: apache File: ParquetMetadataCommand.java License: Apache License 2.0
@Override
@SuppressWarnings("unchecked")
public int run() throws IOException {
  Preconditions.checkArgument(targets != null && targets.size() >= 1,
      "A Parquet file is required.");
  Preconditions.checkArgument(targets.size() == 1,
      "Cannot process multiple Parquet files.");

  String source = targets.get(0);
  ParquetMetadata footer = ParquetFileReader.readFooter(
      getConf(), qualifiedPath(source), ParquetMetadataConverter.NO_FILTER);

  console.info("\nFile path: {}", source);
  console.info("Created by: {}", footer.getFileMetaData().getCreatedBy());

  Map<String, String> kv = footer.getFileMetaData().getKeyValueMetaData();
  if (kv != null && !kv.isEmpty()) {
    console.info("Properties:");
    String format = " %" + maxSize(kv.keySet()) + "s: %s";
    for (Map.Entry<String, String> entry : kv.entrySet()) {
      console.info(String.format(format, entry.getKey(), entry.getValue()));
    }
  } else {
    console.info("Properties: (none)");
  }

  MessageType schema = footer.getFileMetaData().getSchema();
  console.info("Schema:\n{}", schema);

  List<BlockMetaData> rowGroups = footer.getBlocks();
  for (int index = 0, n = rowGroups.size(); index < n; index += 1) {
    printRowGroup(console, index, rowGroups.get(index), schema);
  }

  console.info("");

  return 0;
}
Example #26
Source Project: dremio-oss Author: dremio File: LocalDictionariesReader.java License: Apache License 2.0
/**
 * Return dictionary per row group for all binary columns in given parquet file.
 * @param fs filesystem object.
 * @param filePath parquet file to scan
 * @return pair of dictionaries found for binary fields and list of binary fields which are not dictionary encoded.
 * @throws IOException
 */
public static Pair<Map<ColumnDescriptor, Dictionary>, Set<ColumnDescriptor>> readDictionaries(FileSystem fs, Path filePath, CompressionCodecFactory codecFactory) throws IOException {
  // Passing the max footer length is not required in this case as the parquet reader would already have failed.
  final ParquetMetadata parquetMetadata = SingletonParquetFooterCache.readFooter(fs, filePath, ParquetMetadataConverter.NO_FILTER,
      ExecConstants.PARQUET_MAX_FOOTER_LEN_VALIDATOR.getDefault().getNumVal());
  if (parquetMetadata.getBlocks().size() > 1) {
    throw new IOException(
        format("Global dictionaries can only be built on a parquet file with a single row group, found %d row groups for file %s",
            parquetMetadata.getBlocks().size(), filePath));
  }
  final BlockMetaData rowGroupMetadata = parquetMetadata.getBlocks().get(0);
  final Map<ColumnPath, ColumnDescriptor> columnDescriptorMap = Maps.newHashMap();

  for (ColumnDescriptor columnDescriptor : parquetMetadata.getFileMetaData().getSchema().getColumns()) {
    columnDescriptorMap.put(ColumnPath.get(columnDescriptor.getPath()), columnDescriptor);
  }

  final Set<ColumnDescriptor> columnsToSkip = Sets.newHashSet(); // columns which are found in parquet file but are not dictionary encoded
  final Map<ColumnDescriptor, Dictionary> dictionaries = Maps.newHashMap();
  try (final FSInputStream in = fs.open(filePath)) {
    for (ColumnChunkMetaData columnChunkMetaData : rowGroupMetadata.getColumns()) {
      if (isBinaryType(columnChunkMetaData.getType())) {
        final ColumnDescriptor column = columnDescriptorMap.get(columnChunkMetaData.getPath());
        // if first page is dictionary encoded then load dictionary, otherwise skip this column.
        final PageHeaderWithOffset pageHeader = columnChunkMetaData.getPageHeaders().get(0);
        if (PageType.DICTIONARY_PAGE == pageHeader.getPageHeader().getType()) {
          dictionaries.put(column, readDictionary(in, column, pageHeader, codecFactory.getDecompressor(columnChunkMetaData.getCodec())));
        } else {
          columnsToSkip.add(column);
        }
      }
    }
  }
  return new ImmutablePair<>(dictionaries, columnsToSkip);
}
Example #27
Source Project: dremio-oss Author: dremio File: UnifiedParquetReader.java License: Apache License 2.0
private void computeLocality(ParquetMetadata footer) throws ExecutionSetupException {
  try {
    BlockMetaData block = footer.getBlocks().get(readEntry.getRowGroupIndex());

    Iterable<FileBlockLocation> blockLocations = fs.getFileBlockLocations(Path.of(readEntry.getPath()), block.getStartingPos(), block.getCompressedSize());

    String localHost = InetAddress.getLocalHost().getCanonicalHostName();

    List<Range<Long>> intersectingRanges = new ArrayList<>();

    Range<Long> rowGroupRange = Range.openClosed(block.getStartingPos(), block.getStartingPos() + block.getCompressedSize());

    for (FileBlockLocation loc : blockLocations) {
      for (String host : loc.getHosts()) {
        if (host.equals(localHost)) {
          intersectingRanges.add(Range.closedOpen(loc.getOffset(), loc.getOffset() + loc.getSize()).intersection(rowGroupRange));
        }
      }
    }

    long totalIntersect = 0;
    for (Range<Long> range : intersectingRanges) {
      totalIntersect += (range.upperEndpoint() - range.lowerEndpoint());
    }
    if (totalIntersect < block.getCompressedSize()) {
      context.getStats().addLongStat(Metric.NUM_REMOTE_READERS, 1);
    } else {
      context.getStats().addLongStat(Metric.NUM_REMOTE_READERS, 0);
    }
  } catch (IOException e) {
    throw new ExecutionSetupException(e);
  }
}
Example #28
Source Project: dremio-oss Author: dremio File: UnifiedParquetReader.java License: Apache License 2.0
@Override
public List<RecordReader> getReaders(final UnifiedParquetReader unifiedReader) throws ExecutionSetupException {
  final ParquetMetadata footer = unifiedReader.getFooter();
  final List<BlockMetaData> blocks = footer.getBlocks();
  final int rowGroupIdx = unifiedReader.readEntry.getRowGroupIndex();
  if (blocks.size() <= rowGroupIdx) {
    throw new IllegalArgumentException(
        String.format("Invalid rowgroup index in read entry. Given '%d', Max '%d'", rowGroupIdx, blocks.size())
    );
  }

  final long rowCount = blocks.get(rowGroupIdx).getRowCount();

  final RecordReader reader = new AbstractRecordReader(unifiedReader.context, Collections.<SchemaPath>emptyList()) {
    private long remainingRowCount = rowCount;

    @Override
    public void setup(OutputMutator output) throws ExecutionSetupException {
    }

    @Override
    public int next() {
      if (numRowsPerBatch > remainingRowCount) {
        int toReturn = (int) remainingRowCount;
        remainingRowCount = 0;
        return toReturn;
      }

      remainingRowCount -= numRowsPerBatch;
      return (int) numRowsPerBatch;
    }

    @Override
    public void close() throws Exception {
    }
  };

  return Collections.singletonList(reader);
}
Example #29
Source Project: flink Author: apache File: ParquetColumnarRowSplitReader.java License: Apache License 2.0
public ParquetColumnarRowSplitReader(
    boolean utcTimestamp,
    boolean caseSensitive,
    Configuration conf,
    LogicalType[] selectedTypes,
    String[] selectedFieldNames,
    ColumnBatchGenerator generator,
    int batchSize,
    Path path,
    long splitStart,
    long splitLength) throws IOException {
  this.utcTimestamp = utcTimestamp;
  this.selectedTypes = selectedTypes;
  this.batchSize = batchSize;
  // then we need to apply the predicate push down filter
  ParquetMetadata footer = readFooter(conf, path, range(splitStart, splitStart + splitLength));
  MessageType fileSchema = footer.getFileMetaData().getSchema();
  FilterCompat.Filter filter = getFilter(conf);
  List<BlockMetaData> blocks = filterRowGroups(filter, footer.getBlocks(), fileSchema);

  this.fileSchema = footer.getFileMetaData().getSchema();
  this.requestedSchema = clipParquetSchema(fileSchema, selectedFieldNames, caseSensitive);
  this.reader = new ParquetFileReader(
      conf, footer.getFileMetaData(), path, blocks, requestedSchema.getColumns());

  long totalRowCount = 0;
  for (BlockMetaData block : blocks) {
    totalRowCount += block.getRowCount();
  }
  this.totalRowCount = totalRowCount;
  this.nextRow = 0;
  this.rowsInBatch = 0;
  this.rowsReturned = 0;

  checkSchema();

  this.writableVectors = createWritableVectors();
  this.columnarBatch = generator.generate(createReadableVectors());
  this.row = new ColumnarRowData(columnarBatch);
}