Java Code Examples for org.apache.parquet.hadoop.metadata.ColumnChunkMetaData
The following examples show how to use org.apache.parquet.hadoop.metadata.ColumnChunkMetaData.
These examples are extracted from open source projects; the originating project, author, source file, and license are noted above each example.
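Before the individual examples, here is a minimal, self-contained sketch of how ColumnChunkMetaData instances are typically obtained: read a file footer with ParquetFileReader and walk its row groups. The class name, input path, and printed fields are illustrative assumptions; readFooter is deprecated in newer parquet-mr releases but is the same form used in several examples below (for instance Example #26).

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.format.converter.ParquetMetadataConverter;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.metadata.BlockMetaData;
import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;

public class ColumnChunkMetaDataExample {
  public static void main(String[] args) throws Exception {
    // Hypothetical input: path to an existing Parquet file supplied on the command line.
    Path file = new Path(args[0]);
    // Read only the footer (metadata), not the data pages.
    ParquetMetadata footer = ParquetFileReader.readFooter(
        new Configuration(), file, ParquetMetadataConverter.NO_FILTER);
    for (BlockMetaData block : footer.getBlocks()) {            // one BlockMetaData per row group
      for (ColumnChunkMetaData column : block.getColumns()) {   // one ColumnChunkMetaData per column chunk
        System.out.printf("%s: %d values, %d bytes (compressed), codec=%s%n",
            column.getPath().toDotString(),
            column.getValueCount(),
            column.getTotalSize(),
            column.getCodec());
      }
    }
  }
}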
Example #1
Source Project: parquet-mr | Author: apache | File: PrintFooter.java | License: Apache License 2.0

private static void add(ParquetMetadata footer) {
  for (BlockMetaData blockMetaData : footer.getBlocks()) {
    ++ blockCount;
    MessageType schema = footer.getFileMetaData().getSchema();
    recordCount += blockMetaData.getRowCount();
    List<ColumnChunkMetaData> columns = blockMetaData.getColumns();
    for (ColumnChunkMetaData columnMetaData : columns) {
      ColumnDescriptor desc = schema.getColumnDescription(columnMetaData.getPath().toArray());
      add(
          desc,
          columnMetaData.getValueCount(),
          columnMetaData.getTotalSize(),
          columnMetaData.getTotalUncompressedSize(),
          columnMetaData.getEncodings(),
          columnMetaData.getStatistics());
    }
  }
}
Example #2
Source Project: parquet-mr | Author: apache | File: ColumnIndexFilterUtils.java | License: Apache License 2.0

static List<OffsetRange> calculateOffsetRanges(OffsetIndex offsetIndex, ColumnChunkMetaData cm,
    long firstPageOffset) {
  List<OffsetRange> ranges = new ArrayList<>();
  int n = offsetIndex.getPageCount();
  if (n > 0) {
    OffsetRange currentRange = null;

    // Add a range for the dictionary page if required
    long rowGroupOffset = cm.getStartingPos();
    if (rowGroupOffset < firstPageOffset) {
      currentRange = new OffsetRange(rowGroupOffset, (int) (firstPageOffset - rowGroupOffset));
      ranges.add(currentRange);
    }

    for (int i = 0; i < n; ++i) {
      long offset = offsetIndex.getOffset(i);
      int length = offsetIndex.getCompressedPageSize(i);
      if (currentRange == null || !currentRange.extend(offset, length)) {
        currentRange = new OffsetRange(offset, length);
        ranges.add(currentRange);
      }
    }
  }
  return ranges;
}
Example #3
Source Project: iceberg | Author: Netflix | File: ParquetMetricsRowGroupFilter.java | License: Apache License 2.0

private boolean eval(MessageType fileSchema, BlockMetaData rowGroup) {
  if (rowGroup.getRowCount() <= 0) {
    return ROWS_CANNOT_MATCH;
  }

  this.stats = Maps.newHashMap();
  this.valueCounts = Maps.newHashMap();
  this.conversions = Maps.newHashMap();
  for (ColumnChunkMetaData col : rowGroup.getColumns()) {
    PrimitiveType colType = fileSchema.getType(col.getPath().toArray()).asPrimitiveType();
    if (colType.getId() != null) {
      int id = colType.getId().intValue();
      stats.put(id, col.getStatistics());
      valueCounts.put(id, col.getValueCount());
      conversions.put(id, converterFromParquet(colType));
    }
  }

  return ExpressionVisitors.visit(expr, this);
}
Example #4
Source Project: parquet-mr | Author: apache | File: ParquetFileWriter.java | License: Apache License 2.0

private static void serializeBloomFilters(
    List<Map<String, BloomFilter>> bloomFilters,
    List<BlockMetaData> blocks,
    PositionOutputStream out) throws IOException {
  LOG.debug("{}: bloom filters", out.getPos());
  for (int bIndex = 0, bSize = blocks.size(); bIndex < bSize; ++bIndex) {
    List<ColumnChunkMetaData> columns = blocks.get(bIndex).getColumns();
    Map<String, BloomFilter> blockBloomFilters = bloomFilters.get(bIndex);
    if (blockBloomFilters.isEmpty()) continue;
    for (int cIndex = 0, cSize = columns.size(); cIndex < cSize; ++cIndex) {
      ColumnChunkMetaData column = columns.get(cIndex);
      BloomFilter bloomFilter = blockBloomFilters.get(column.getPath().toDotString());
      if (bloomFilter == null) {
        continue;
      }

      long offset = out.getPos();
      column.setBloomFilterOffset(offset);
      Util.writeBloomFilterHeader(ParquetMetadataConverter.toBloomFilterHeader(bloomFilter), out);
      bloomFilter.writeTo(out);
    }
  }
}
Example #5
Source Project: Bats | Author: lealone | File: VarLengthValuesColumn.java | License: Apache License 2.0

VarLengthValuesColumn(ParquetRecordReader parentReader, ColumnDescriptor descriptor,
                      ColumnChunkMetaData columnChunkMetaData, boolean fixedLength, V v,
                      SchemaElement schemaElement) throws ExecutionSetupException {

  super(parentReader, descriptor, columnChunkMetaData, fixedLength, v, schemaElement);

  variableWidthVector = (VariableWidthVector) valueVec;

  if (columnChunkMetaData.getEncodings().contains(Encoding.PLAIN_DICTIONARY)) {
    usingDictionary = true;
    // We didn't implement the fixed length optimization when a Parquet Dictionary is used; as there are
    // no data point about this use-case. Will also enable bulk processing by default since early data
    // profiling (for detecting the best processing strategy to use) is disabled when the column precision
    // is already set.
    bulkReaderState.columnPrecInfo.columnPrecisionType = ColumnPrecisionType.DT_PRECISION_IS_VARIABLE;
    bulkReaderState.columnPrecInfo.bulkProcess = true;
  } else {
    usingDictionary = false;
  }
}
Example #6
Source Project: parquet-mr | Author: apache | File: DictionaryFilterTest.java | License: Apache License 2.0

@SuppressWarnings("deprecation")
private void testDictionaryEncodedColumnsV1() throws Exception {
  Set<String> dictionaryEncodedColumns = new HashSet<String>(Arrays.asList(
      "binary_field", "single_value_field", "optional_single_value_field", "int32_field",
      "int64_field", "double_field", "float_field", "int96_field"));
  for (ColumnChunkMetaData column : ccmd) {
    String name = column.getPath().toDotString();
    if (dictionaryEncodedColumns.contains(name)) {
      assertTrue("Column should be dictionary encoded: " + name,
          column.getEncodings().contains(Encoding.PLAIN_DICTIONARY));
      assertFalse("Column should not have plain data pages" + name,
          column.getEncodings().contains(Encoding.PLAIN));
    } else {
      assertTrue("Column should have plain encoding: " + name,
          column.getEncodings().contains(Encoding.PLAIN));
      if (name.startsWith("fallback")) {
        assertTrue("Column should have some dictionary encoding: " + name,
            column.getEncodings().contains(Encoding.PLAIN_DICTIONARY));
      } else {
        assertFalse("Column should have no dictionary encoding: " + name,
            column.getEncodings().contains(Encoding.PLAIN_DICTIONARY));
      }
    }
  }
}
Example #7
Source Project: parquet-mr | Author: apache | File: TestParquetWriterAppendBlocks.java | License: Apache License 2.0

public void assertColumnsEquivalent(List<ColumnChunkMetaData> expected,
                                    List<ColumnChunkMetaData> actual) {
  Assert.assertEquals("Should have the expected columns",
      expected.size(), actual.size());
  for (int i = 0; i < actual.size(); i += 1) {
    ColumnChunkMetaData current = actual.get(i);
    if (i != 0) {
      ColumnChunkMetaData previous = actual.get(i - 1);
      long expectedStart = previous.getStartingPos() + previous.getTotalSize();
      Assert.assertEquals("Should start after the previous column",
          expectedStart, current.getStartingPos());
    }

    assertColumnMetadataEquivalent(expected.get(i), current);
  }
}
Example #8
Source Project: parquet-mr | Author: apache | File: DictionaryFilterTest.java | License: Apache License 2.0

private void testDictionaryEncodedColumnsV2() throws Exception {
  Set<String> dictionaryEncodedColumns = new HashSet<String>(Arrays.asList(
      "binary_field", "single_value_field", "optional_single_value_field", "fixed_field",
      "int32_field", "int64_field", "double_field", "float_field", "int96_field"));
  for (ColumnChunkMetaData column : ccmd) {
    EncodingStats encStats = column.getEncodingStats();
    String name = column.getPath().toDotString();
    if (dictionaryEncodedColumns.contains(name)) {
      assertTrue("Column should have dictionary pages: " + name, encStats.hasDictionaryPages());
      assertTrue("Column should have dictionary encoded pages: " + name, encStats.hasDictionaryEncodedPages());
      assertFalse("Column should not have non-dictionary encoded pages: " + name,
          encStats.hasNonDictionaryEncodedPages());
    } else {
      assertTrue("Column should have non-dictionary encoded pages: " + name,
          encStats.hasNonDictionaryEncodedPages());
      if (name.startsWith("fallback")) {
        assertTrue("Column should have dictionary pages: " + name, encStats.hasDictionaryPages());
        assertTrue("Column should have dictionary encoded pages: " + name, encStats.hasDictionaryEncodedPages());
      } else {
        assertFalse("Column should not have dictionary pages: " + name, encStats.hasDictionaryPages());
        assertFalse("Column should not have dictionary encoded pages: " + name,
            encStats.hasDictionaryEncodedPages());
      }
    }
  }
}
Example #9
Source Project: Bats | Author: lealone | File: VarLengthColumnReaders.java | License: Apache License 2.0

VarDecimalColumn(ParquetRecordReader parentReader, ColumnDescriptor descriptor,
                 ColumnChunkMetaData columnChunkMetaData, boolean fixedLength,
                 VarDecimalVector v, SchemaElement schemaElement) throws ExecutionSetupException {
  super(parentReader, descriptor, columnChunkMetaData, fixedLength, v, schemaElement);
  this.varDecimalVector = v;
  this.mutator = v.getMutator();
}
Example #10
Source Project: Bats | Author: lealone | File: VarLengthColumnReaders.java | License: Apache License 2.0

NullableVarDecimalColumn(ParquetRecordReader parentReader, ColumnDescriptor descriptor,
                         ColumnChunkMetaData columnChunkMetaData, boolean fixedLength,
                         NullableVarDecimalVector v, SchemaElement schemaElement) throws ExecutionSetupException {
  super(parentReader, descriptor, columnChunkMetaData, fixedLength, v, schemaElement);
  nullableVarDecimalVector = v;
  this.mutator = v.getMutator();
}
Example #11
Source Project: Bats | Author: lealone | File: VarLengthColumnReaders.java | License: Apache License 2.0

VarCharColumn(ParquetRecordReader parentReader, ColumnDescriptor descriptor,
              ColumnChunkMetaData columnChunkMetaData, boolean fixedLength,
              VarCharVector v, SchemaElement schemaElement) throws ExecutionSetupException {
  super(parentReader, descriptor, columnChunkMetaData, fixedLength, v, schemaElement);
  this.varCharVector = v;
  this.mutator = v.getMutator();
}
Example #12
Source Project: Bats | Author: lealone | File: VarLengthColumnReaders.java | License: Apache License 2.0

VarBinaryColumn(ParquetRecordReader parentReader, ColumnDescriptor descriptor,
                ColumnChunkMetaData columnChunkMetaData, boolean fixedLength,
                VarBinaryVector v, SchemaElement schemaElement) throws ExecutionSetupException {
  super(parentReader, descriptor, columnChunkMetaData, fixedLength, v, schemaElement);
  this.varBinaryVector = v;
  this.mutator = v.getMutator();
}
Example #13
Source Project: Bats | Author: lealone | File: VarLengthColumnReaders.java | License: Apache License 2.0

NullableVarBinaryColumn(ParquetRecordReader parentReader, ColumnDescriptor descriptor,
                        ColumnChunkMetaData columnChunkMetaData, boolean fixedLength,
                        NullableVarBinaryVector v, SchemaElement schemaElement) throws ExecutionSetupException {
  super(parentReader, descriptor, columnChunkMetaData, fixedLength, v, schemaElement);
  this.nullableVarBinaryVector = v;
  this.mutator = v.getMutator();
}
Example #14
Source Project: parquet-mr | Author: apache | File: ParquetFileReader.java | License: Apache License 2.0

/**
 * @param column
 *          the column chunk which the offset index is to be returned for
 * @return the offset index for the specified column chunk or {@code null} if there is no index
 * @throws IOException
 *           if any I/O error occurs during reading the file
 */
@Private
public OffsetIndex readOffsetIndex(ColumnChunkMetaData column) throws IOException {
  IndexReference ref = column.getOffsetIndexReference();
  if (ref == null) {
    return null;
  }
  f.seek(ref.getOffset());
  return ParquetMetadataConverter.fromParquetOffsetIndex(Util.readOffsetIndex(f));
}
Example #15
Source Project: parquet-mr | Author: apache | File: TestInputFormat.java | License: Apache License 2.0

public static BlockMetaData makeBlockFromStats(IntStatistics stats, long valueCount) {
  BlockMetaData blockMetaData = new BlockMetaData();
  ColumnChunkMetaData column = ColumnChunkMetaData.get(ColumnPath.get("foo"),
      PrimitiveTypeName.INT32,
      CompressionCodecName.GZIP,
      new HashSet<Encoding>(Arrays.asList(Encoding.PLAIN)),
      stats,
      100l, 100l, valueCount, 100l, 100l);
  blockMetaData.addColumn(column);
  blockMetaData.setTotalByteSize(200l);
  blockMetaData.setRowCount(valueCount);
  return blockMetaData;
}
Example #16
Source Project: dremio-oss | Author: dremio | File: ColumnReaderFactory.java | License: Apache License 2.0

static VarLengthValuesColumn<?> getReader(DeprecatedParquetVectorizedReader parentReader, int allocateSize,
                                          ColumnDescriptor descriptor, ColumnChunkMetaData columnChunkMetaData,
                                          boolean fixedLength, ValueVector v, SchemaElement schemaElement
) throws ExecutionSetupException {
  ConvertedType convertedType = schemaElement.getConverted_type();
  switch (descriptor.getMaxDefinitionLevel()) {
    case 0:
      if (convertedType == null) {
        return new VarLengthColumnReaders.VarBinaryColumn(parentReader, allocateSize, descriptor, columnChunkMetaData, fixedLength, (VarBinaryVector) v, schemaElement);
      }
      switch (convertedType) {
        case UTF8:
          return new VarLengthColumnReaders.VarCharColumn(parentReader, allocateSize, descriptor, columnChunkMetaData, fixedLength, (VarCharVector) v, schemaElement);
        case DECIMAL:
          return new VarLengthColumnReaders.Decimal28Column(parentReader, allocateSize, descriptor, columnChunkMetaData, fixedLength, (DecimalVector) v, schemaElement);
        default:
          return new VarLengthColumnReaders.VarBinaryColumn(parentReader, allocateSize, descriptor, columnChunkMetaData, fixedLength, (VarBinaryVector) v, schemaElement);
      }
    default:
      if (convertedType == null) {
        return new VarLengthColumnReaders.NullableVarBinaryColumn(parentReader, allocateSize, descriptor, columnChunkMetaData, fixedLength, (VarBinaryVector) v, schemaElement);
      }
      switch (convertedType) {
        case UTF8:
          return new VarLengthColumnReaders.NullableVarCharColumn(parentReader, allocateSize, descriptor, columnChunkMetaData, fixedLength, (VarCharVector) v, schemaElement);
        case DECIMAL:
          return new NullableDecimalColumn(parentReader, allocateSize, descriptor, columnChunkMetaData, fixedLength, (DecimalVector) v, schemaElement);
        default:
          return new VarLengthColumnReaders.NullableVarBinaryColumn(parentReader, allocateSize, descriptor, columnChunkMetaData, fixedLength, (VarBinaryVector) v, schemaElement);
      }
  }
}
Example #17
Source Project: parquet-mr | Author: apache | File: TestInputFormat.java | License: Apache License 2.0

private BlockMetaData newBlock(long start, long compressedBlockSize) {
  BlockMetaData blockMetaData = new BlockMetaData();
  long uncompressedSize = compressedBlockSize * 2; // assuming the compression ratio is 2
  ColumnChunkMetaData column = ColumnChunkMetaData.get(ColumnPath.get("foo"),
      PrimitiveTypeName.BINARY,
      CompressionCodecName.GZIP,
      new HashSet<Encoding>(Arrays.asList(Encoding.PLAIN)),
      new BinaryStatistics(),
      start, 0l, 0l, compressedBlockSize, uncompressedSize);
  blockMetaData.addColumn(column);
  blockMetaData.setTotalByteSize(uncompressedSize);
  return blockMetaData;
}
Example #18
Source Project: Bats | Author: lealone | File: PageReader.java | License: Apache License 2.0

PageReader(org.apache.drill.exec.store.parquet.columnreaders.ColumnReader<?> parentStatus, FileSystem fs,
           Path path, ColumnChunkMetaData columnChunkMetaData) throws ExecutionSetupException {
  this.parentColumnReader = parentStatus;
  allocatedDictionaryBuffers = new ArrayList<ByteBuf>();
  codecFactory = parentColumnReader.parentReader.getCodecFactory();
  this.stats = parentColumnReader.parentReader.parquetReaderStats;
  this.fileName = path.toString();
  debugName = new StringBuilder()
      .append(this.parentColumnReader.parentReader.getFragmentContext().getFragIdString())
      .append(":")
      .append(this.parentColumnReader.parentReader.getOperatorContext().getStats().getId())
      .append(this.parentColumnReader.columnChunkMetaData.toString())
      .toString();
  try {
    inputStream = fs.open(path);
    BufferAllocator allocator = parentColumnReader.parentReader.getOperatorContext().getAllocator();
    columnChunkMetaData.getTotalUncompressedSize();
    useBufferedReader = parentColumnReader.parentReader.useBufferedReader;
    scanBufferSize = parentColumnReader.parentReader.bufferedReadSize;
    useFadvise = parentColumnReader.parentReader.useFadvise;
    enforceTotalSize = parentColumnReader.parentReader.enforceTotalSize;
    if (useBufferedReader) {
      this.dataReader = new BufferedDirectBufInputStream(inputStream, allocator, path.getName(),
          columnChunkMetaData.getStartingPos(), columnChunkMetaData.getTotalSize(), scanBufferSize,
          enforceTotalSize, useFadvise);
    } else {
      this.dataReader = new DirectBufInputStream(inputStream, allocator, path.getName(),
          columnChunkMetaData.getStartingPos(), columnChunkMetaData.getTotalSize(),
          enforceTotalSize, useFadvise);
    }
  } catch (IOException e) {
    throw new ExecutionSetupException("Error opening or reading metadata for parquet file at location: "
        + path.getName(), e);
  }
}
Example #19
Source Project: Bats | Author: lealone | File: PageReader.java | License: Apache License 2.0

protected void loadDictionaryIfExists(final org.apache.drill.exec.store.parquet.columnreaders.ColumnReader<?> parentStatus,
    final ColumnChunkMetaData columnChunkMetaData, final DirectBufInputStream f) throws IOException {
  Stopwatch timer = Stopwatch.createUnstarted();
  if (columnChunkMetaData.getDictionaryPageOffset() > 0) {
    long bytesToSkip = columnChunkMetaData.getDictionaryPageOffset() - dataReader.getPos();
    while (bytesToSkip > 0) {
      long skipped = dataReader.skip(bytesToSkip);
      if (skipped > 0) {
        bytesToSkip -= skipped;
      } else {
        // no good way to handle this. Guava uses InputStream.available to check
        // if EOF is reached and because available is not reliable,
        // tries to read the rest of the data.
        DrillBuf skipBuf = dataReader.getNext((int) bytesToSkip);
        if (skipBuf != null) {
          skipBuf.release();
        } else {
          throw new EOFException("End of File reachecd.");
        }
      }
    }

    long start = dataReader.getPos();
    timer.start();
    final PageHeader pageHeader = Util.readPageHeader(f);
    long timeToRead = timer.elapsed(TimeUnit.NANOSECONDS);
    long pageHeaderBytes = dataReader.getPos() - start;
    this.updateStats(pageHeader, "Page Header", start, timeToRead, pageHeaderBytes, pageHeaderBytes);
    assert pageHeader.type == PageType.DICTIONARY_PAGE;
    readDictionaryPage(pageHeader, parentStatus);
  }
}
Example #20
Source Project: parquet-mr | Author: apache | File: DictionaryPageReader.java | License: Apache License 2.0

private boolean hasDictionaryPage(ColumnChunkMetaData column) {
  EncodingStats stats = column.getEncodingStats();
  if (stats != null) {
    // ensure there is a dictionary page and that it is used to encode data pages
    return stats.hasDictionaryPages() && stats.hasDictionaryEncodedPages();
  }

  Set<Encoding> encodings = column.getEncodings();
  return (encodings.contains(PLAIN_DICTIONARY) || encodings.contains(RLE_DICTIONARY));
}
Example #21
Source Project: parquet-mr | Author: apache | File: BloomFilterImpl.java | License: Apache License 2.0

@Override
public <T extends Comparable<T>> Boolean visit(Operators.Eq<T> eq) {
  T value = eq.getValue();

  if (value == null) {
    // the bloom filter bitset contains only non-null values so isn't helpful. this
    // could check the column stats, but the StatisticsFilter is responsible
    return BLOCK_MIGHT_MATCH;
  }

  Operators.Column<T> filterColumn = eq.getColumn();
  ColumnChunkMetaData meta = getColumnChunk(filterColumn.getColumnPath());
  if (meta == null) {
    // the column isn't in this file so all values are null, but the value
    // must be non-null because of the above check.
    return BLOCK_CANNOT_MATCH;
  }

  try {
    BloomFilter bloomFilter = bloomFilterReader.readBloomFilter(meta);
    if (bloomFilter != null && !bloomFilter.findHash(bloomFilter.hash(value))) {
      return BLOCK_CANNOT_MATCH;
    }
  } catch (RuntimeException e) {
    LOG.warn(e.getMessage());
    return BLOCK_MIGHT_MATCH;
  }
  return BLOCK_MIGHT_MATCH;
}
Example #22
Source Project: parquet-mr | Author: apache | File: DictionaryFilter.java | License: Apache License 2.0

@Override
public <T extends Comparable<T>> Boolean visit(Gt<T> gt) {
  Column<T> filterColumn = gt.getColumn();
  ColumnChunkMetaData meta = getColumnChunk(filterColumn.getColumnPath());

  if (meta == null) {
    // the column is missing and always null, which is never greater than a
    // value. for all x, null is never > x.
    return BLOCK_CANNOT_MATCH;
  }

  // if the chunk has non-dictionary pages, don't bother decoding the
  // dictionary because the row group can't be eliminated.
  if (hasNonDictionaryPages(meta)) {
    return BLOCK_MIGHT_MATCH;
  }

  T value = gt.getValue();

  try {
    Set<T> dictSet = expandDictionary(meta);
    if (dictSet == null) {
      return BLOCK_MIGHT_MATCH;
    }

    Comparator<T> comparator = meta.getPrimitiveType().comparator();
    for (T entry : dictSet) {
      if (comparator.compare(value, entry) < 0) {
        return BLOCK_MIGHT_MATCH;
      }
    }

    return BLOCK_CANNOT_MATCH;
  } catch (IOException e) {
    LOG.warn("Failed to process dictionary for filter evaluation.", e);
  }

  return BLOCK_MIGHT_MATCH;
}
Example #23
Source Project: parquet-mr | Author: apache | File: ColumnIndexStoreImpl.java | License: Apache License 2.0

private ColumnIndexStoreImpl(ParquetFileReader reader, BlockMetaData block, Set<ColumnPath> paths) {
  // TODO[GS]: Offset index for every paths will be required; pre-read the consecutive ones at once?
  // TODO[GS]: Pre-read column index based on filter?
  this.reader = reader;
  Map<ColumnPath, IndexStore> store = new HashMap<>();
  for (ColumnChunkMetaData column : block.getColumns()) {
    ColumnPath path = column.getPath();
    if (paths.contains(path)) {
      store.put(path, new IndexStoreImpl(column));
    }
  }
  this.store = store;
}
Example #24
Source Project: Bats | Author: lealone | File: ParquetSchema.java | License: Apache License 2.0

Map<String, Integer> buildChunkMap(BlockMetaData rowGroupMetadata) {
  // the column chunk meta-data is not guaranteed to be in the same order as the columns in the schema
  // a map is constructed for fast access to the correct columnChunkMetadata to correspond
  // to an element in the schema
  Map<String, Integer> columnChunkMetadataPositionsInList = new HashMap<>();
  int colChunkIndex = 0;
  for (ColumnChunkMetaData colChunk : rowGroupMetadata.getColumns()) {
    columnChunkMetadataPositionsInList.put(Arrays.toString(colChunk.getPath().toArray()), colChunkIndex);
    colChunkIndex++;
  }
  return columnChunkMetadataPositionsInList;
}
Example #25
Source Project: dremio-oss | Author: dremio | File: NullableFixedByteAlignedReaders.java | License: Apache License 2.0

CorruptionDetectingNullableDateReader(DeprecatedParquetVectorizedReader parentReader, int allocateSize,
                                      ColumnDescriptor descriptor, ColumnChunkMetaData columnChunkMetaData,
                                      boolean fixedLength, DateMilliVector v, SchemaElement schemaElement)
    throws ExecutionSetupException {
  super(parentReader, allocateSize, descriptor, columnChunkMetaData, fixedLength, v, schemaElement);
  dateVector = v;
}
Example #26
Source Project: parquet-mr | Author: apache | File: ColumnSizeCommand.java | License: Apache License 2.0

public Map<String, Long> getColumnSizeInBytes(Path inputFile) throws IOException {
  Map<String, Long> colSizes = new HashMap<>();
  ParquetMetadata pmd = ParquetFileReader.readFooter(new Configuration(), inputFile, ParquetMetadataConverter.NO_FILTER);

  for (BlockMetaData block : pmd.getBlocks()) {
    for (ColumnChunkMetaData column : block.getColumns()) {
      String colName = column.getPath().toDotString();
      colSizes.put(colName, column.getTotalSize() + colSizes.getOrDefault(colName, 0L));
    }
  }

  return colSizes;
}
Example #27
Source Project: parquet-mr | Author: apache | File: CompressionConveterTest.java | License: Apache License 2.0

private List<Long> getOffsets(TransParquetFileReader reader, ColumnChunkMetaData chunk) throws IOException {
  List<Long> offsets = new ArrayList<>();
  reader.setStreamPosition(chunk.getStartingPos());
  long readValues = 0;
  long totalChunkValues = chunk.getValueCount();
  while (readValues < totalChunkValues) {
    long curOffset = reader.getPos();
    PageHeader pageHeader = reader.readPageHeader();
    switch (pageHeader.type) {
      case DICTIONARY_PAGE:
        compressionConverter.readBlock(pageHeader.getCompressed_page_size(), reader);
        break;
      case DATA_PAGE:
        DataPageHeader headerV1 = pageHeader.data_page_header;
        offsets.add(curOffset);
        compressionConverter.readBlock(pageHeader.getCompressed_page_size(), reader);
        readValues += headerV1.getNum_values();
        break;
      case DATA_PAGE_V2:
        DataPageHeaderV2 headerV2 = pageHeader.data_page_header_v2;
        offsets.add(curOffset);
        int rlLength = headerV2.getRepetition_levels_byte_length();
        compressionConverter.readBlock(rlLength, reader);
        int dlLength = headerV2.getDefinition_levels_byte_length();
        compressionConverter.readBlock(dlLength, reader);
        int payLoadLength = pageHeader.getCompressed_page_size() - rlLength - dlLength;
        compressionConverter.readBlock(payLoadLength, reader);
        readValues += headerV2.getNum_values();
        break;
      default:
        throw new IOException("Not recognized page type");
    }
  }
  return offsets;
}
Example #28
Source Project: parquet-mr | Author: apache | File: MetadataUtils.java | License: Apache License 2.0

private static void showColumnChunkDetails(PrettyPrintWriter out, Map<String,Object> current, int depth) {
  for (Map.Entry<String,Object> entry : current.entrySet()) {
    String name = Strings.repeat(".", depth) + entry.getKey();
    Object value = entry.getValue();

    if (value instanceof Map) {
      out.println(name + ": ");
      showColumnChunkDetails(out, (Map<String,Object>)value, depth + 1);
    } else {
      out.print(name + ": ");
      showDetails(out, (ColumnChunkMetaData)value, false);
    }
  }
}
Example #29
Source Project: Bats | Author: lealone | File: ColumnReaderFactory.java | License: Apache License 2.0

static VarLengthValuesColumn<?> getReader(ParquetRecordReader parentReader, ColumnDescriptor descriptor,
                                          ColumnChunkMetaData columnChunkMetaData, boolean fixedLength,
                                          ValueVector v, SchemaElement schemaElement
) throws ExecutionSetupException {
  ConvertedType convertedType = schemaElement.getConverted_type();
  switch (descriptor.getMaxDefinitionLevel()) {
    case 0:
      if (convertedType == null) {
        return new VarLengthColumnReaders.VarBinaryColumn(parentReader, descriptor, columnChunkMetaData, fixedLength, (VarBinaryVector) v, schemaElement);
      }
      switch (convertedType) {
        case UTF8:
        case ENUM:
          return new VarLengthColumnReaders.VarCharColumn(parentReader, descriptor, columnChunkMetaData, fixedLength, (VarCharVector) v, schemaElement);
        case DECIMAL:
          if (v instanceof VarDecimalVector) {
            return new VarLengthColumnReaders.VarDecimalColumn(parentReader, descriptor, columnChunkMetaData, fixedLength, (VarDecimalVector) v, schemaElement);
          }
        default:
          return new VarLengthColumnReaders.VarBinaryColumn(parentReader, descriptor, columnChunkMetaData, fixedLength, (VarBinaryVector) v, schemaElement);
      }
    default:
      if (convertedType == null) {
        return new VarLengthColumnReaders.NullableVarBinaryColumn(parentReader, descriptor, columnChunkMetaData, fixedLength, (NullableVarBinaryVector) v, schemaElement);
      }
      switch (convertedType) {
        case UTF8:
        case ENUM:
          return new VarLengthColumnReaders.NullableVarCharColumn(parentReader, descriptor, columnChunkMetaData, fixedLength, (NullableVarCharVector) v, schemaElement);
        case DECIMAL:
          if (v instanceof NullableVarDecimalVector) {
            return new VarLengthColumnReaders.NullableVarDecimalColumn(parentReader, descriptor, columnChunkMetaData, fixedLength, (NullableVarDecimalVector) v, schemaElement);
          }
        default:
          return new VarLengthColumnReaders.NullableVarBinaryColumn(parentReader, descriptor, columnChunkMetaData, fixedLength, (NullableVarBinaryVector) v, schemaElement);
      }
  }
}
Example #30
Source Project: dremio-oss | Author: dremio | File: ColumnChunkIncReadStore.java | License: Apache License 2.0

public ColumnChunkIncPageReader(ColumnChunkMetaData metaData, ColumnDescriptor columnDescriptor,
                                BulkInputStream in) throws IOException {
  this.metaData = metaData;
  this.columnDescriptor = columnDescriptor;
  this.size = metaData.getTotalSize();
  this.fileOffset = metaData.getStartingPos();
  this.in = in;
  this.decompressor = codecFactory.getDecompressor(metaData.getCodec());
}