org.apache.parquet.hadoop.metadata.BlockMetaData Java Examples
The following examples show how to use
org.apache.parquet.hadoop.metadata.BlockMetaData.
Each example notes the project and source file it was taken from.
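Before the individual examples, here is a minimal sketch of the most common way these projects obtain BlockMetaData instances in the first place: read the file footer and iterate its row groups. The method name and input path below are illustrative only; the footer is read with the same deprecated ParquetFileReader.readFooter call that several of the examples below also use.

// Minimal sketch, not taken from any single example below; the caller supplies the file path.
static void printRowGroups(Configuration conf, Path inputFile) throws IOException {
  ParquetMetadata footer = ParquetFileReader.readFooter(conf, inputFile, ParquetMetadataConverter.NO_FILTER);
  for (BlockMetaData block : footer.getBlocks()) {
    System.out.println("row group starting at " + block.getStartingPos()
        + ": " + block.getRowCount() + " rows, "
        + block.getColumns().size() + " column chunks");
  }
}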
Example #1
Source File: ParquetRecordReader.java From parquet-mr with Apache License 2.0
private void checkDeltaByteArrayProblem(FileMetaData meta, Configuration conf, BlockMetaData block) {
  // splitting files?
  if (conf.getBoolean(ParquetInputFormat.SPLIT_FILES, true)) {
    // this is okay if not using DELTA_BYTE_ARRAY with the bug
    Set<Encoding> encodings = new HashSet<Encoding>();
    for (ColumnChunkMetaData column : block.getColumns()) {
      encodings.addAll(column.getEncodings());
    }
    for (Encoding encoding : encodings) {
      if (CorruptDeltaByteArrays.requiresSequentialReads(meta.getCreatedBy(), encoding)) {
        throw new ParquetDecodingException("Cannot read data due to "
            + "PARQUET-246: to read safely, set " + SPLIT_FILES + " to false");
      }
    }
  }
}
Example #2
Source File: ParquetReaderUtility.java From dremio-oss with Apache License 2.0
/**
 * Get the list of row group numbers for a given file input split. The logic used here is the same as how Hive's
 * parquet input format finds the row group numbers for an input split.
 */
public static List<Integer> getRowGroupNumbersFromFileSplit(final long splitStart, final long splitLength,
                                                            final ParquetMetadata footer) throws IOException {
  final List<BlockMetaData> blocks = footer.getBlocks();

  final List<Integer> rowGroupNums = Lists.newArrayList();
  int i = 0;
  for (final BlockMetaData block : blocks) {
    final long firstDataPage = block.getColumns().get(0).getFirstDataPageOffset();
    if (firstDataPage >= splitStart && firstDataPage < splitStart + splitLength) {
      rowGroupNums.add(i);
    }
    i++;
  }

  return rowGroupNums;
}
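A hedged usage sketch for the helper above: the input path and the 128 MB split boundaries are invented values, and ParquetReaderUtility is assumed to be the class this example comes from.

// Hypothetical usage; the path and split boundaries are made-up values.
Configuration conf = new Configuration();
Path inputFile = new Path("/tmp/example.parquet");
ParquetMetadata footer = ParquetFileReader.readFooter(conf, inputFile, ParquetMetadataConverter.NO_FILTER);
long splitStart = 0L;
long splitLength = 128L * 1024 * 1024;
List<Integer> rowGroupNums = ParquetReaderUtility.getRowGroupNumbersFromFileSplit(splitStart, splitLength, footer);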
Example #3
Source File: ParquetFileReader.java From parquet-mr with Apache License 2.0
/**
 * @param configuration the Hadoop conf
 * @param fileMetaData fileMetaData for parquet file
 * @param filePath Path for the parquet file
 * @param blocks the blocks to read
 * @param columns the columns to read (their path)
 * @throws IOException if the file can not be opened
 * @deprecated will be removed in 2.0.0.
 */
@Deprecated
public ParquetFileReader(
    Configuration configuration, FileMetaData fileMetaData,
    Path filePath, List<BlockMetaData> blocks, List<ColumnDescriptor> columns) throws IOException {
  this.converter = new ParquetMetadataConverter(configuration);
  this.file = HadoopInputFile.fromPath(filePath, configuration);
  this.fileMetaData = fileMetaData;
  this.f = file.newStream();
  this.options = HadoopReadOptions.builder(configuration).build();
  this.blocks = filterRowGroups(blocks);
  this.blockIndexStores = listWithNulls(this.blocks.size());
  this.blockRowRanges = listWithNulls(this.blocks.size());
  for (ColumnDescriptor col : columns) {
    paths.put(ColumnPath.get(col.getPath()), col);
  }
  this.crc = options.usePageChecksumVerification() ? new CRC32() : null;
}
Example #4
Source File: TestMetricsRowGroupFilter.java From iceberg with Apache License 2.0
@Test
public void testZeroRecordFile() {
  BlockMetaData emptyBlock = new BlockMetaData();
  emptyBlock.setRowCount(0);

  Expression[] exprs = new Expression[] {
      lessThan("id", 5), lessThanOrEqual("id", 30), equal("id", 70), greaterThan("id", 78),
      greaterThanOrEqual("id", 90), notEqual("id", 101), isNull("some_nulls"),
      notNull("some_nulls")
  };

  for (Expression expr : exprs) {
    boolean shouldRead = new ParquetMetricsRowGroupFilter(SCHEMA, expr)
        .shouldRead(PARQUET_SCHEMA, emptyBlock);
    Assert.assertFalse("Should never read 0-record file: " + expr, shouldRead);
  }
}
Example #5
Source File: ParquetMetricsRowGroupFilter.java From iceberg with Apache License 2.0
private boolean eval(MessageType fileSchema, BlockMetaData rowGroup) {
  if (rowGroup.getRowCount() <= 0) {
    return ROWS_CANNOT_MATCH;
  }

  this.stats = Maps.newHashMap();
  this.valueCounts = Maps.newHashMap();
  this.conversions = Maps.newHashMap();
  for (ColumnChunkMetaData col : rowGroup.getColumns()) {
    PrimitiveType colType = fileSchema.getType(col.getPath().toArray()).asPrimitiveType();
    if (colType.getId() != null) {
      int id = colType.getId().intValue();
      stats.put(id, col.getStatistics());
      valueCounts.put(id, col.getValueCount());
      conversions.put(id, converterFromParquet(colType));
    }
  }

  return ExpressionVisitors.visit(expr, this);
}
Example #6
Source File: ParquetMetadataConverter.java From parquet-mr with Apache License 2.0
public FileMetaData toParquetMetadata(int currentVersion, ParquetMetadata parquetMetadata) {
  List<BlockMetaData> blocks = parquetMetadata.getBlocks();
  List<RowGroup> rowGroups = new ArrayList<RowGroup>();
  long numRows = 0;
  for (BlockMetaData block : blocks) {
    numRows += block.getRowCount();
    addRowGroup(parquetMetadata, rowGroups, block);
  }
  FileMetaData fileMetaData = new FileMetaData(
      currentVersion,
      toParquetSchema(parquetMetadata.getFileMetaData().getSchema()),
      numRows,
      rowGroups);

  Set<Entry<String, String>> keyValues = parquetMetadata.getFileMetaData().getKeyValueMetaData().entrySet();
  for (Entry<String, String> keyValue : keyValues) {
    addKeyValue(fileMetaData, keyValue.getKey(), keyValue.getValue());
  }

  fileMetaData.setCreated_by(parquetMetadata.getFileMetaData().getCreatedBy());
  fileMetaData.setColumn_orders(getColumnOrders(parquetMetadata.getFileMetaData().getSchema()));
  return fileMetaData;
}
Example #7
Source File: InternalParquetRecordReader.java From tajo with Apache License 2.0
public void initialize(FileMetaData parquetFileMetadata,
                       Path file, List<BlockMetaData> blocks, Configuration configuration)
    throws IOException {
  // initialize a ReadContext for this file
  Map<String, String> fileMetadata = parquetFileMetadata.getKeyValueMetaData();
  ReadSupport.ReadContext readContext = readSupport.init(new InitContext(
      configuration, toSetMultiMap(fileMetadata), fileSchema));
  this.columnIOFactory = new ColumnIOFactory(parquetFileMetadata.getCreatedBy());
  this.requestedSchema = readContext.getRequestedSchema();
  this.fileSchema = parquetFileMetadata.getSchema();
  this.file = file;
  this.columnCount = requestedSchema.getPaths().size();
  this.recordConverter = readSupport.prepareForRead(
      configuration, fileMetadata, fileSchema, readContext);
  this.strictTypeChecking = configuration.getBoolean(STRICT_TYPE_CHECKING, true);
  List<ColumnDescriptor> columns = requestedSchema.getColumns();
  reader = new ParquetFileReader(configuration, parquetFileMetadata, file, blocks, columns);
  for (BlockMetaData block : blocks) {
    total += block.getRowCount();
  }
  this.unmaterializableRecordCounter = new UnmaterializableRecordCounter(configuration, total);
  LOG.info("RecordReader initialized will read a total of " + total + " records.");
}
Example #8
Source File: ParquetFileWriter.java From parquet-mr with Apache License 2.0
private static void serializeColumnIndexes(
    List<List<ColumnIndex>> columnIndexes,
    List<BlockMetaData> blocks,
    PositionOutputStream out) throws IOException {
  LOG.debug("{}: column indexes", out.getPos());
  for (int bIndex = 0, bSize = blocks.size(); bIndex < bSize; ++bIndex) {
    List<ColumnChunkMetaData> columns = blocks.get(bIndex).getColumns();
    List<ColumnIndex> blockColumnIndexes = columnIndexes.get(bIndex);
    for (int cIndex = 0, cSize = columns.size(); cIndex < cSize; ++cIndex) {
      ColumnChunkMetaData column = columns.get(cIndex);
      org.apache.parquet.format.ColumnIndex columnIndex = ParquetMetadataConverter
          .toParquetColumnIndex(column.getPrimitiveType(), blockColumnIndexes.get(cIndex));
      if (columnIndex == null) {
        continue;
      }
      long offset = out.getPos();
      Util.writeColumnIndex(columnIndex, out);
      column.setColumnIndexReference(new IndexReference(offset, (int) (out.getPos() - offset)));
    }
  }
}
Example #9
Source File: ParquetFileWriter.java From parquet-mr with Apache License 2.0
private static void serializeBloomFilters(
    List<Map<String, BloomFilter>> bloomFilters,
    List<BlockMetaData> blocks,
    PositionOutputStream out) throws IOException {
  LOG.debug("{}: bloom filters", out.getPos());
  for (int bIndex = 0, bSize = blocks.size(); bIndex < bSize; ++bIndex) {
    List<ColumnChunkMetaData> columns = blocks.get(bIndex).getColumns();
    Map<String, BloomFilter> blockBloomFilters = bloomFilters.get(bIndex);
    if (blockBloomFilters.isEmpty()) continue;
    for (int cIndex = 0, cSize = columns.size(); cIndex < cSize; ++cIndex) {
      ColumnChunkMetaData column = columns.get(cIndex);
      BloomFilter bloomFilter = blockBloomFilters.get(column.getPath().toDotString());
      if (bloomFilter == null) {
        continue;
      }
      long offset = out.getPos();
      column.setBloomFilterOffset(offset);
      Util.writeBloomFilterHeader(ParquetMetadataConverter.toBloomFilterHeader(bloomFilter), out);
      bloomFilter.writeTo(out);
    }
  }
}
Example #10
Source File: ParquetFileWriter.java From parquet-mr with Apache License 2.0
/**
 * Given a list of metadata files, merge them into a single ParquetMetadata.
 * Requires that the schemas be compatible, and the extraMetadata be exactly equal.
 * @param files a list of files to merge metadata from
 * @param conf a configuration
 * @return merged parquet metadata for the files
 * @throws IOException if there is an error while writing
 * @deprecated metadata files are not recommended and will be removed in 2.0.0
 */
@Deprecated
public static ParquetMetadata mergeMetadataFiles(List<Path> files, Configuration conf) throws IOException {
  Preconditions.checkArgument(!files.isEmpty(), "Cannot merge an empty list of metadata");

  GlobalMetaData globalMetaData = null;
  List<BlockMetaData> blocks = new ArrayList<BlockMetaData>();

  for (Path p : files) {
    ParquetMetadata pmd = ParquetFileReader.readFooter(conf, p, ParquetMetadataConverter.NO_FILTER);
    FileMetaData fmd = pmd.getFileMetaData();
    globalMetaData = mergeInto(fmd, globalMetaData, true);
    blocks.addAll(pmd.getBlocks());
  }

  // collapse GlobalMetaData into a single FileMetaData, which will throw if they are not compatible
  return new ParquetMetadata(globalMetaData.merge(), blocks);
}
Example #11
Source File: ParquetReader.java From tajo with Apache License 2.0
private ParquetReader(Configuration conf,
                      Path file,
                      ReadSupport<T> readSupport,
                      Filter filter) throws IOException {
  this.readSupport = readSupport;
  this.filter = checkNotNull(filter, "filter");
  this.conf = conf;

  FileSystem fs = file.getFileSystem(conf);
  List<FileStatus> statuses = Arrays.asList(fs.listStatus(file, HiddenFileFilter.INSTANCE));
  List<Footer> footers = ParquetFileReader.readAllFootersInParallelUsingSummaryFiles(conf, statuses, false);
  this.footersIterator = footers.iterator();

  for (Footer footer : footers) {
    for (BlockMetaData block : footer.getParquetMetadata().getBlocks()) {
      totalRowCount += block.getRowCount();
    }
  }
}
Example #12
Source File: ReadConf.java From iceberg with Apache License 2.0
private List<Map<ColumnPath, ColumnChunkMetaData>> getColumnChunkMetadataForRowGroups() {
  Set<ColumnPath> projectedColumns = projection.getColumns().stream()
      .map(columnDescriptor -> ColumnPath.get(columnDescriptor.getPath())).collect(Collectors.toSet());
  ImmutableList.Builder<Map<ColumnPath, ColumnChunkMetaData>> listBuilder = ImmutableList.builder();

  for (int i = 0; i < rowGroups.size(); i++) {
    if (!shouldSkip[i]) {
      BlockMetaData blockMetaData = rowGroups.get(i);
      ImmutableMap.Builder<ColumnPath, ColumnChunkMetaData> mapBuilder = ImmutableMap.builder();
      blockMetaData.getColumns().stream()
          .filter(columnChunkMetaData -> projectedColumns.contains(columnChunkMetaData.getPath()))
          .forEach(columnChunkMetaData -> mapBuilder.put(columnChunkMetaData.getPath(), columnChunkMetaData));
      listBuilder.add(mapBuilder.build());
    } else {
      listBuilder.add(ImmutableMap.of());
    }
  }

  return listBuilder.build();
}
Example #13
Source File: PrintFooter.java From parquet-mr with Apache License 2.0
private static void add(ParquetMetadata footer) {
  for (BlockMetaData blockMetaData : footer.getBlocks()) {
    ++blockCount;
    MessageType schema = footer.getFileMetaData().getSchema();
    recordCount += blockMetaData.getRowCount();
    List<ColumnChunkMetaData> columns = blockMetaData.getColumns();
    for (ColumnChunkMetaData columnMetaData : columns) {
      ColumnDescriptor desc = schema.getColumnDescription(columnMetaData.getPath().toArray());
      add(
          desc,
          columnMetaData.getValueCount(),
          columnMetaData.getTotalSize(),
          columnMetaData.getTotalUncompressedSize(),
          columnMetaData.getEncodings(),
          columnMetaData.getStatistics());
    }
  }
}
Example #14
Source File: TestMetricsRowGroupFilter.java From iceberg with Apache License 2.0
@Test
public void testZeroRecordFileParquet() {
  Assume.assumeTrue(format == FileFormat.PARQUET);
  BlockMetaData emptyBlock = new BlockMetaData();
  emptyBlock.setRowCount(0);

  Expression[] exprs = new Expression[] {
      lessThan("id", 5), lessThanOrEqual("id", 30), equal("id", 70), greaterThan("id", 78),
      greaterThanOrEqual("id", 90), notEqual("id", 101), isNull("some_nulls"),
      notNull("some_nulls")
  };

  for (Expression expr : exprs) {
    boolean shouldRead = shouldReadParquet(expr, true, parquetSchema, emptyBlock);
    Assert.assertFalse("Should never read 0-record file: " + expr, shouldRead);
  }
}
Example #15
Source File: ParquetInputFormat.java From parquet-mr with Apache License 2.0
/**
 * groups together all the data blocks for the same HDFS block
 *
 * @param rowGroupBlocks data blocks (row groups)
 * @param hdfsBlocksArray hdfs blocks
 * @param fileStatus the containing file
 * @param requestedSchema the schema requested by the user
 * @param readSupportMetadata the metadata provided by the readSupport implementation in init
 * @param minSplitSize the mapred.min.split.size
 * @param maxSplitSize the mapred.max.split.size
 * @return the splits (one per HDFS block)
 * @throws IOException If hosts can't be retrieved for the HDFS block
 */
static <T> List<ParquetInputSplit> generateSplits(
    List<BlockMetaData> rowGroupBlocks,
    BlockLocation[] hdfsBlocksArray,
    FileStatus fileStatus,
    String requestedSchema,
    Map<String, String> readSupportMetadata,
    long minSplitSize,
    long maxSplitSize) throws IOException {

  List<SplitInfo> splitRowGroups =
      generateSplitInfo(rowGroupBlocks, hdfsBlocksArray, minSplitSize, maxSplitSize);

  // generate splits from rowGroups of each split
  List<ParquetInputSplit> resultSplits = new ArrayList<ParquetInputSplit>();
  for (SplitInfo splitInfo : splitRowGroups) {
    ParquetInputSplit split = splitInfo.getParquetInputSplit(fileStatus, requestedSchema, readSupportMetadata);
    resultSplits.add(split);
  }
  return resultSplits;
}
Example #16
Source File: PredicateUtils.java From presto with Apache License 2.0
private static boolean dictionaryPredicatesMatch(Predicate parquetPredicate, BlockMetaData blockMetadata,
    ParquetDataSource dataSource, Map<List<String>, RichColumnDescriptor> descriptorsByPath,
    TupleDomain<ColumnDescriptor> parquetTupleDomain) {
  for (ColumnChunkMetaData columnMetaData : blockMetadata.getColumns()) {
    RichColumnDescriptor descriptor = descriptorsByPath.get(Arrays.asList(columnMetaData.getPath().toArray()));
    if (descriptor != null) {
      if (isOnlyDictionaryEncodingPages(columnMetaData) && isColumnPredicate(descriptor, parquetTupleDomain)) {
        byte[] buffer = new byte[toIntExact(columnMetaData.getTotalSize())];
        dataSource.readFully(columnMetaData.getStartingPos(), buffer);
        // Early abort, predicate already filters block so no more dictionaries need be read
        if (!parquetPredicate.matches(new DictionaryDescriptor(descriptor,
            readDictionaryPage(buffer, columnMetaData.getCodec())))) {
          return false;
        }
      }
    }
  }
  return true;
}
Example #17
Source File: CompressionConverter.java From parquet-mr with Apache License 2.0
public void processBlocks(TransParquetFileReader reader, ParquetFileWriter writer, ParquetMetadata meta,
                          MessageType schema, String createdBy, CompressionCodecName codecName) throws IOException {
  int blockIndex = 0;
  PageReadStore store = reader.readNextRowGroup();
  while (store != null) {
    writer.startBlock(store.getRowCount());
    BlockMetaData blockMetaData = meta.getBlocks().get(blockIndex);
    List<ColumnChunkMetaData> columnsInOrder = blockMetaData.getColumns();
    Map<ColumnPath, ColumnDescriptor> descriptorsMap = schema.getColumns().stream().collect(
        Collectors.toMap(x -> ColumnPath.get(x.getPath()), x -> x));
    for (int i = 0; i < columnsInOrder.size(); i += 1) {
      ColumnChunkMetaData chunk = columnsInOrder.get(i);
      ColumnReadStoreImpl crstore = new ColumnReadStoreImpl(store, new DummyGroupConverter(), schema, createdBy);
      ColumnDescriptor columnDescriptor = descriptorsMap.get(chunk.getPath());
      writer.startColumn(columnDescriptor, crstore.getColumnReader(columnDescriptor).getTotalValueCount(), codecName);
      processChunk(reader, writer, chunk, createdBy, codecName);
      writer.endColumn();
    }
    writer.endBlock();
    store = reader.readNextRowGroup();
    blockIndex++;
  }
}
Example #18
Source File: ParquetInputSplit.java From parquet-mr with Apache License 2.0
private static long end(List<BlockMetaData> blocks, String requestedSchema) {
  MessageType requested = MessageTypeParser.parseMessageType(requestedSchema);
  long length = 0;

  for (BlockMetaData block : blocks) {
    List<ColumnChunkMetaData> columns = block.getColumns();
    for (ColumnChunkMetaData column : columns) {
      if (requested.containsPath(column.getPath().toArray())) {
        length += column.getTotalSize();
      }
    }
  }
  return length;
}
Example #19
Source File: ColumnIndexStoreImpl.java From parquet-mr with Apache License 2.0
static ColumnIndexStore create(ParquetFileReader reader, BlockMetaData block, Set<ColumnPath> paths) {
  try {
    return new ColumnIndexStoreImpl(reader, block, paths);
  } catch (MissingOffsetIndexException e) {
    return EMPTY;
  }
}
Example #20
Source File: ParquetInputFormat.java From parquet-mr with Apache License 2.0
public ParquetInputSplit getParquetInputSplit(FileStatus fileStatus, String requestedSchema,
    Map<String, String> readSupportMetadata) throws IOException {
  MessageType requested = MessageTypeParser.parseMessageType(requestedSchema);
  long length = 0;

  for (BlockMetaData block : this.getRowGroups()) {
    List<ColumnChunkMetaData> columns = block.getColumns();
    for (ColumnChunkMetaData column : columns) {
      if (requested.containsPath(column.getPath().toArray())) {
        length += column.getTotalSize();
      }
    }
  }

  BlockMetaData lastRowGroup = this.getRowGroups().get(this.getRowGroupCount() - 1);
  long end = lastRowGroup.getStartingPos() + lastRowGroup.getTotalByteSize();

  long[] rowGroupOffsets = new long[this.getRowGroupCount()];
  for (int i = 0; i < rowGroupOffsets.length; i++) {
    rowGroupOffsets[i] = this.getRowGroups().get(i).getStartingPos();
  }

  return new ParquetInputSplit(
      fileStatus.getPath(),
      hdfsBlock.getOffset(),
      end,
      length,
      hdfsBlock.getHosts(),
      rowGroupOffsets
  );
}
Example #21
Source File: ParquetInputFormat.java From parquet-mr with Apache License 2.0
private static void checkSorted(List<BlockMetaData> rowGroupBlocks) {
  long previousOffset = 0L;
  for (BlockMetaData rowGroup : rowGroupBlocks) {
    long currentOffset = rowGroup.getStartingPos();
    if (currentOffset < previousOffset) {
      throw new ParquetDecodingException("row groups are not sorted: previous row groups starts at "
          + previousOffset + ", current row group starts at " + currentOffset);
    }
  }
}
Example #22
Source File: BloomFilterReader.java From parquet-mr with Apache License 2.0
public BloomFilterReader(ParquetFileReader fileReader, BlockMetaData block) {
  this.reader = fileReader;
  this.columns = new HashMap<>();
  for (ColumnChunkMetaData column : block.getColumns()) {
    columns.put(column.getPath(), column);
  }
}
Example #23
Source File: ParquetInputSplit.java From parquet-mr with Apache License 2.0
/**
 * For compatibility only
 * use {@link ParquetInputSplit#ParquetInputSplit(Path, long, long, long, String[], long[])}
 * @param path a Path
 * @param start split start location
 * @param length split length
 * @param hosts locality information for this split
 * @param blocks Parquet blocks in this split
 * @param requestedSchema the requested schema
 * @param fileSchema the file schema
 * @param extraMetadata string map of file metadata
 * @param readSupportMetadata string map of metadata from read support
 */
@Deprecated
public ParquetInputSplit(
    Path path,
    long start,
    long length,
    String[] hosts,
    List<BlockMetaData> blocks,
    String requestedSchema,
    String fileSchema,
    Map<String, String> extraMetadata,
    Map<String, String> readSupportMetadata) {
  this(path, start, end(blocks, requestedSchema), length, hosts, offsets(blocks));
}
Example #24
Source File: MetadataUtils.java From parquet-mr with Apache License 2.0
static void showDetails(PrettyPrintWriter out, ParquetMetadata meta, boolean showOriginalTypes) {
  showDetails(out, meta.getFileMetaData(), showOriginalTypes);

  long i = 1;
  for (BlockMetaData bmeta : meta.getBlocks()) {
    out.println();
    showDetails(out, bmeta, i++);
  }
}
Example #25
Source File: ColumnSizeCommand.java From parquet-mr with Apache License 2.0
public Map<String, Long> getColumnSizeInBytes(Path inputFile) throws IOException {
  Map<String, Long> colSizes = new HashMap<>();
  ParquetMetadata pmd = ParquetFileReader.readFooter(new Configuration(), inputFile, ParquetMetadataConverter.NO_FILTER);

  for (BlockMetaData block : pmd.getBlocks()) {
    for (ColumnChunkMetaData column : block.getColumns()) {
      String colName = column.getPath().toDotString();
      colSizes.put(colName, column.getTotalSize() + colSizes.getOrDefault(colName, 0L));
    }
  }

  return colSizes;
}
Example #26
Source File: RowCountCommand.java From parquet-mr with Apache License 2.0
@Override
public void execute(CommandLine options) throws Exception {
  super.execute(options);

  String[] args = options.getArgs();
  String input = args[0];
  out = new PrintWriter(Main.out, true);
  inputPath = new Path(input);
  conf = new Configuration();
  inputFileStatuses = inputPath.getFileSystem(conf).globStatus(inputPath);
  long rowCount = 0;

  for (FileStatus fs : inputFileStatuses) {
    long fileRowCount = 0;
    for (Footer f : ParquetFileReader.readFooters(conf, fs, false)) {
      for (BlockMetaData b : f.getParquetMetadata().getBlocks()) {
        rowCount += b.getRowCount();
        fileRowCount += b.getRowCount();
      }
    }
    if (options.hasOption('d')) {
      out.format("%s row count: %d\n", fs.getPath().getName(), fileRowCount);
    }
  }

  out.format("Total RowCount: %d", rowCount);
  out.println();
}
Example #27
Source File: ReadState.java From Bats with Apache License 2.0
/**
 * Create the readers needed to read columns: fixed-length or variable length.
 *
 * @param reader
 * @param output
 * @throws Exception
 */
@SuppressWarnings("unchecked")
public void buildReader(ParquetRecordReader reader, OutputMutator output) throws Exception {
  final ArrayList<VarLengthColumn<? extends ValueVector>> varLengthColumns = new ArrayList<>();
  // initialize all of the column read status objects
  BlockMetaData rowGroupMetadata = schema.getRowGroupMetadata();
  Map<String, Integer> columnChunkMetadataPositionsInList = schema.buildChunkMap(rowGroupMetadata);
  for (ParquetColumnMetadata columnMetadata : schema.getColumnMetadata()) {
    ColumnDescriptor column = columnMetadata.column;
    columnMetadata.columnChunkMetaData = rowGroupMetadata.getColumns().get(
        columnChunkMetadataPositionsInList.get(Arrays.toString(column.getPath())));
    columnMetadata.buildVector(output);
    if (!columnMetadata.isFixedLength()) {
      // create a reader and add it to the appropriate list
      varLengthColumns.add(columnMetadata.makeVariableWidthReader(reader));
    } else if (columnMetadata.isRepeated()) {
      varLengthColumns.add(columnMetadata.makeRepeatedFixedWidthReader(reader));
    } else {
      fixedLenColumnReaders.add(columnMetadata.makeFixedWidthReader(reader));
    }
  }
  varLengthReader = new VarLenBinaryReader(reader, varLengthColumns);
  if (!schema.isStarQuery()) {
    schema.createNonExistentColumns(output, nullFilledVectors);
  }
}
Example #28
Source File: SizeCommand.java From parquet-mr with Apache License 2.0
@Override
public void execute(CommandLine options) throws Exception {
  super.execute(options);

  String[] args = options.getArgs();
  String input = args[0];
  out = new PrintWriter(Main.out, true);
  inputPath = new Path(input);
  conf = new Configuration();
  inputFileStatuses = inputPath.getFileSystem(conf).globStatus(inputPath);
  long size = 0;

  for (FileStatus fs : inputFileStatuses) {
    long fileSize = 0;
    for (Footer f : ParquetFileReader.readFooters(conf, fs, false)) {
      for (BlockMetaData b : f.getParquetMetadata().getBlocks()) {
        size += (options.hasOption('u') ? b.getTotalByteSize() : b.getCompressedSize());
        fileSize += (options.hasOption('u') ? b.getTotalByteSize() : b.getCompressedSize());
      }
    }
    if (options.hasOption('d')) {
      if (options.hasOption('p')) {
        out.format("%s: %s\n", fs.getPath().getName(), getPrettySize(fileSize));
      } else {
        out.format("%s: %d bytes\n", fs.getPath().getName(), fileSize);
      }
    }
  }

  if (options.hasOption('p')) {
    out.format("Total Size: %s", getPrettySize(size));
  } else {
    out.format("Total Size: %d bytes", size);
  }
  out.println();
}
Example #29
Source File: DictionaryPageReader.java From parquet-mr with Apache License 2.0
/**
 * Instantiate a new DictionaryPageReader.
 *
 * @param reader The target ParquetFileReader
 * @param block The target BlockMetaData
 *
 * @throws NullPointerException if {@code reader} or {@code block} is {@code null}
 */
DictionaryPageReader(ParquetFileReader reader, BlockMetaData block) {
  this.reader = Objects.requireNonNull(reader);
  this.columns = new HashMap<>();
  this.dictionaryPageCache = new ConcurrentHashMap<>();

  for (ColumnChunkMetaData column : block.getColumns()) {
    columns.put(column.getPath().toDotString(), column);
  }
}