Java Code Examples for org.apache.parquet.hadoop.metadata.ColumnChunkMetaData

The following examples show how to use org.apache.parquet.hadoop.metadata.ColumnChunkMetaData. They are extracted from open source projects; the source project, author, file, and license for each example are listed above it.
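Before the project-specific examples, here is a minimal, self-contained sketch of the typical way ColumnChunkMetaData instances are obtained: read the file footer, then walk the columns of each row group (block). The class name ColumnChunkMetaDataDemo and the input path are hypothetical; the footer-reading call mirrors Example #26 below (ParquetFileReader.readFooter is deprecated in recent parquet-mr releases in favor of ParquetFileReader.open, but still works), and the getters are the ones exercised throughout the examples.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.format.converter.ParquetMetadataConverter;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.metadata.BlockMetaData;
import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;

public class ColumnChunkMetaDataDemo {
  public static void main(String[] args) throws IOException {
    // Hypothetical input file; substitute a real Parquet file path.
    Path inputFile = new Path("/tmp/example.parquet");

    // Read only the footer (no data pages), as in Example #26 below.
    ParquetMetadata footer = ParquetFileReader.readFooter(
        new Configuration(), inputFile, ParquetMetadataConverter.NO_FILTER);

    // Each block (row group) exposes one ColumnChunkMetaData per column.
    for (BlockMetaData block : footer.getBlocks()) {
      for (ColumnChunkMetaData column : block.getColumns()) {
        System.out.printf("%s: codec=%s, values=%d, compressed=%d bytes, starts at %d%n",
            column.getPath().toDotString(),
            column.getCodec(),
            column.getValueCount(),
            column.getTotalSize(),
            column.getStartingPos());
      }
    }
  }
}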
Example #1
Source Project: parquet-mr   Author: apache   File: PrintFooter.java    License: Apache License 2.0
private static void add(ParquetMetadata footer) {
  for (BlockMetaData blockMetaData : footer.getBlocks()) {
    ++ blockCount;
    MessageType schema = footer.getFileMetaData().getSchema();
    recordCount += blockMetaData.getRowCount();
    List<ColumnChunkMetaData> columns = blockMetaData.getColumns();
    for (ColumnChunkMetaData columnMetaData : columns) {
      ColumnDescriptor desc = schema.getColumnDescription(columnMetaData.getPath().toArray());
      add(
          desc,
          columnMetaData.getValueCount(),
          columnMetaData.getTotalSize(),
          columnMetaData.getTotalUncompressedSize(),
          columnMetaData.getEncodings(),
          columnMetaData.getStatistics());
    }
  }
}
 
Example #2
Source Project: parquet-mr   Author: apache   File: ColumnIndexFilterUtils.java    License: Apache License 2.0
static List<OffsetRange> calculateOffsetRanges(OffsetIndex offsetIndex, ColumnChunkMetaData cm,
    long firstPageOffset) {
  List<OffsetRange> ranges = new ArrayList<>();
  int n = offsetIndex.getPageCount();
  if (n > 0) {
    OffsetRange currentRange = null;

    // Add a range for the dictionary page if required
    long rowGroupOffset = cm.getStartingPos();
    if (rowGroupOffset < firstPageOffset) {
      currentRange = new OffsetRange(rowGroupOffset, (int) (firstPageOffset - rowGroupOffset));
      ranges.add(currentRange);
    }

    for (int i = 0; i < n; ++i) {
      long offset = offsetIndex.getOffset(i);
      int length = offsetIndex.getCompressedPageSize(i);
      if (currentRange == null || !currentRange.extend(offset, length)) {
        currentRange = new OffsetRange(offset, length);
        ranges.add(currentRange);
      }
    }
  }
  return ranges;
}
 
Example #3
Source Project: iceberg   Author: Netflix   File: ParquetMetricsRowGroupFilter.java    License: Apache License 2.0
private boolean eval(MessageType fileSchema, BlockMetaData rowGroup) {
  if (rowGroup.getRowCount() <= 0) {
    return ROWS_CANNOT_MATCH;
  }

  this.stats = Maps.newHashMap();
  this.valueCounts = Maps.newHashMap();
  this.conversions = Maps.newHashMap();
  for (ColumnChunkMetaData col : rowGroup.getColumns()) {
    PrimitiveType colType = fileSchema.getType(col.getPath().toArray()).asPrimitiveType();
    if (colType.getId() != null) {
      int id = colType.getId().intValue();
      stats.put(id, col.getStatistics());
      valueCounts.put(id, col.getValueCount());
      conversions.put(id, converterFromParquet(colType));
    }
  }

  return ExpressionVisitors.visit(expr, this);
}
 
Example #4
Source Project: parquet-mr   Author: apache   File: ParquetFileWriter.java    License: Apache License 2.0
private static void serializeBloomFilters(
  List<Map<String, BloomFilter>> bloomFilters,
  List<BlockMetaData> blocks,
  PositionOutputStream out) throws IOException {
  LOG.debug("{}: bloom filters", out.getPos());
  for (int bIndex = 0, bSize = blocks.size(); bIndex < bSize; ++bIndex) {
    List<ColumnChunkMetaData> columns = blocks.get(bIndex).getColumns();
    Map<String, BloomFilter> blockBloomFilters = bloomFilters.get(bIndex);
    if (blockBloomFilters.isEmpty()) continue;
    for (int cIndex = 0, cSize = columns.size(); cIndex < cSize; ++cIndex) {
      ColumnChunkMetaData column = columns.get(cIndex);
      BloomFilter bloomFilter = blockBloomFilters.get(column.getPath().toDotString());
      if (bloomFilter == null) {
        continue;
      }

      long offset = out.getPos();
      column.setBloomFilterOffset(offset);
      Util.writeBloomFilterHeader(ParquetMetadataConverter.toBloomFilterHeader(bloomFilter), out);
      bloomFilter.writeTo(out);
    }
  }
}
 
Example #5
Source Project: Bats   Author: lealone   File: VarLengthValuesColumn.java    License: Apache License 2.0
VarLengthValuesColumn(ParquetRecordReader parentReader, ColumnDescriptor descriptor,
                      ColumnChunkMetaData columnChunkMetaData, boolean fixedLength, V v,
                      SchemaElement schemaElement) throws ExecutionSetupException {

  super(parentReader, descriptor, columnChunkMetaData, fixedLength, v, schemaElement);
  variableWidthVector = (VariableWidthVector) valueVec;

  if (columnChunkMetaData.getEncodings().contains(Encoding.PLAIN_DICTIONARY)) {
    usingDictionary = true;
    // The fixed-length optimization is not implemented when a Parquet dictionary is used, as there are
    // no data points about this use case. Bulk processing is also enabled by default, since early data
    // profiling (for detecting the best processing strategy to use) is disabled when the column precision
    // is already set.
    bulkReaderState.columnPrecInfo.columnPrecisionType = ColumnPrecisionType.DT_PRECISION_IS_VARIABLE;
    bulkReaderState.columnPrecInfo.bulkProcess         = true;
  }
  else {
    usingDictionary = false;
  }
}
 
Example #6
Source Project: parquet-mr   Author: apache   File: DictionaryFilterTest.java    License: Apache License 2.0
@SuppressWarnings("deprecation")
private void testDictionaryEncodedColumnsV1() throws Exception {
  Set<String> dictionaryEncodedColumns = new HashSet<String>(Arrays.asList(
      "binary_field", "single_value_field", "optional_single_value_field", "int32_field", "int64_field",
      "double_field", "float_field", "int96_field"));
  for (ColumnChunkMetaData column : ccmd) {
    String name = column.getPath().toDotString();
    if (dictionaryEncodedColumns.contains(name)) {
      assertTrue("Column should be dictionary encoded: " + name,
          column.getEncodings().contains(Encoding.PLAIN_DICTIONARY));
      assertFalse("Column should not have plain data pages" + name,
          column.getEncodings().contains(Encoding.PLAIN));
    } else {
      assertTrue("Column should have plain encoding: " + name,
          column.getEncodings().contains(Encoding.PLAIN));
      if (name.startsWith("fallback")) {
        assertTrue("Column should have some dictionary encoding: " + name,
            column.getEncodings().contains(Encoding.PLAIN_DICTIONARY));
      } else {
        assertFalse("Column should have no dictionary encoding: " + name,
            column.getEncodings().contains(Encoding.PLAIN_DICTIONARY));
      }
    }
  }
}
 
Example #7
Source Project: parquet-mr   Author: apache   File: TestParquetWriterAppendBlocks.java    License: Apache License 2.0
public void assertColumnsEquivalent(List<ColumnChunkMetaData> expected,
                                    List<ColumnChunkMetaData> actual) {
  Assert.assertEquals("Should have the expected columns",
      expected.size(), actual.size());
  for (int i = 0; i < actual.size(); i += 1) {
    ColumnChunkMetaData current = actual.get(i);
    if (i != 0) {
      ColumnChunkMetaData previous = actual.get(i - 1);
      long expectedStart = previous.getStartingPos() + previous.getTotalSize();
      Assert.assertEquals("Should start after the previous column",
          expectedStart, current.getStartingPos());
    }

    assertColumnMetadataEquivalent(expected.get(i), current);
  }
}
 
Example #8
Source Project: parquet-mr   Author: apache   File: DictionaryFilterTest.java    License: Apache License 2.0
private void testDictionaryEncodedColumnsV2() throws Exception {
  Set<String> dictionaryEncodedColumns = new HashSet<String>(Arrays.asList(
      "binary_field", "single_value_field", "optional_single_value_field", "fixed_field", "int32_field",
      "int64_field", "double_field", "float_field", "int96_field"));
  for (ColumnChunkMetaData column : ccmd) {
    EncodingStats encStats = column.getEncodingStats();
    String name = column.getPath().toDotString();
    if (dictionaryEncodedColumns.contains(name)) {
      assertTrue("Column should have dictionary pages: " + name, encStats.hasDictionaryPages());
      assertTrue("Column should have dictionary encoded pages: " + name, encStats.hasDictionaryEncodedPages());
      assertFalse("Column should not have non-dictionary encoded pages: " + name,
          encStats.hasNonDictionaryEncodedPages());
    } else {
      assertTrue("Column should have non-dictionary encoded pages: " + name,
          encStats.hasNonDictionaryEncodedPages());
      if (name.startsWith("fallback")) {
        assertTrue("Column should have dictionary pages: " + name, encStats.hasDictionaryPages());
        assertTrue("Column should have dictionary encoded pages: " + name, encStats.hasDictionaryEncodedPages());
      } else {
        assertFalse("Column should not have dictionary pages: " + name, encStats.hasDictionaryPages());
        assertFalse("Column should not have dictionary encoded pages: " + name, encStats.hasDictionaryEncodedPages());
      }
    }
  }
}
 
Example #9
Source Project: Bats   Author: lealone   File: VarLengthColumnReaders.java    License: Apache License 2.0
VarDecimalColumn(ParquetRecordReader parentReader, ColumnDescriptor descriptor,
                ColumnChunkMetaData columnChunkMetaData, boolean fixedLength, VarDecimalVector v,
                SchemaElement schemaElement) throws ExecutionSetupException {
  super(parentReader, descriptor, columnChunkMetaData, fixedLength, v, schemaElement);
  this.varDecimalVector = v;
  this.mutator = v.getMutator();
}
 
Example #10
Source Project: Bats   Author: lealone   File: VarLengthColumnReaders.java    License: Apache License 2.0
NullableVarDecimalColumn(ParquetRecordReader parentReader, ColumnDescriptor descriptor,
                        ColumnChunkMetaData columnChunkMetaData, boolean fixedLength, NullableVarDecimalVector v,
                        SchemaElement schemaElement) throws ExecutionSetupException {
  super(parentReader, descriptor, columnChunkMetaData, fixedLength, v, schemaElement);
  nullableVarDecimalVector = v;
  this.mutator = v.getMutator();
}
 
Example #11
Source Project: Bats   Author: lealone   File: VarLengthColumnReaders.java    License: Apache License 2.0
VarCharColumn(ParquetRecordReader parentReader, ColumnDescriptor descriptor,
              ColumnChunkMetaData columnChunkMetaData, boolean fixedLength, VarCharVector v,
              SchemaElement schemaElement) throws ExecutionSetupException {
  super(parentReader, descriptor, columnChunkMetaData, fixedLength, v, schemaElement);
  this.varCharVector = v;
  this.mutator       = v.getMutator();
}
 
Example #12
Source Project: Bats   Author: lealone   File: VarLengthColumnReaders.java    License: Apache License 2.0
VarBinaryColumn(ParquetRecordReader parentReader, ColumnDescriptor descriptor,
                ColumnChunkMetaData columnChunkMetaData, boolean fixedLength, VarBinaryVector v,
                SchemaElement schemaElement) throws ExecutionSetupException {
  super(parentReader, descriptor, columnChunkMetaData, fixedLength, v, schemaElement);

  this.varBinaryVector = v;
  this.mutator         = v.getMutator();
}
 
Example #13
Source Project: Bats   Author: lealone   File: VarLengthColumnReaders.java    License: Apache License 2.0
NullableVarBinaryColumn(ParquetRecordReader parentReader, ColumnDescriptor descriptor,
                        ColumnChunkMetaData columnChunkMetaData, boolean fixedLength, NullableVarBinaryVector v,
                        SchemaElement schemaElement) throws ExecutionSetupException {
  super(parentReader, descriptor, columnChunkMetaData, fixedLength, v, schemaElement);
  this.nullableVarBinaryVector = v;
  this.mutator                 = v.getMutator();
}
 
Example #14
Source Project: parquet-mr   Author: apache   File: ParquetFileReader.java    License: Apache License 2.0
/**
 * @param column
 *          the column chunk which the offset index is to be returned for
 * @return the offset index for the specified column chunk or {@code null} if there is no index
 * @throws IOException
 *           if any I/O error occurs while reading the file
 */
@Private
public OffsetIndex readOffsetIndex(ColumnChunkMetaData column) throws IOException {
  IndexReference ref = column.getOffsetIndexReference();
  if (ref == null) {
    return null;
  }
  f.seek(ref.getOffset());
  return ParquetMetadataConverter.fromParquetOffsetIndex(Util.readOffsetIndex(f));
}
 
Example #15
Source Project: parquet-mr   Author: apache   File: TestInputFormat.java    License: Apache License 2.0
public static BlockMetaData makeBlockFromStats(IntStatistics stats, long valueCount) {
  BlockMetaData blockMetaData = new BlockMetaData();

  ColumnChunkMetaData column = ColumnChunkMetaData.get(ColumnPath.get("foo"),
      PrimitiveTypeName.INT32,
      CompressionCodecName.GZIP,
      new HashSet<Encoding>(Arrays.asList(Encoding.PLAIN)),
      stats,
      100L, 100L, valueCount, 100L, 100L);
  blockMetaData.addColumn(column);
  blockMetaData.setTotalByteSize(200L);
  blockMetaData.setRowCount(valueCount);
  return blockMetaData;
}
 
Example #16
Source Project: dremio-oss   Author: dremio   File: ColumnReaderFactory.java    License: Apache License 2.0
static VarLengthValuesColumn<?> getReader(DeprecatedParquetVectorizedReader parentReader, int allocateSize, ColumnDescriptor descriptor,
                                          ColumnChunkMetaData columnChunkMetaData, boolean fixedLength, ValueVector v,
                                          SchemaElement schemaElement
) throws ExecutionSetupException {
  ConvertedType convertedType = schemaElement.getConverted_type();
  switch (descriptor.getMaxDefinitionLevel()) {
    case 0:
      if (convertedType == null) {
        return new VarLengthColumnReaders.VarBinaryColumn(parentReader, allocateSize, descriptor, columnChunkMetaData, fixedLength, (VarBinaryVector) v, schemaElement);
      }
      switch (convertedType) {
        case UTF8:
          return new VarLengthColumnReaders.VarCharColumn(parentReader, allocateSize, descriptor, columnChunkMetaData, fixedLength, (VarCharVector) v, schemaElement);
        case DECIMAL:
          return new VarLengthColumnReaders.Decimal28Column(parentReader, allocateSize, descriptor, columnChunkMetaData, fixedLength, (DecimalVector) v, schemaElement);
        default:
          return new VarLengthColumnReaders.VarBinaryColumn(parentReader, allocateSize, descriptor, columnChunkMetaData, fixedLength, (VarBinaryVector) v, schemaElement);
      }
    default:
      if (convertedType == null) {
        return new VarLengthColumnReaders.NullableVarBinaryColumn(parentReader, allocateSize, descriptor, columnChunkMetaData, fixedLength, (VarBinaryVector) v, schemaElement);
      }

      switch (convertedType) {
        case UTF8:
          return new VarLengthColumnReaders.NullableVarCharColumn(parentReader, allocateSize, descriptor, columnChunkMetaData, fixedLength, (VarCharVector) v, schemaElement);
        case DECIMAL:
          return new NullableDecimalColumn(parentReader, allocateSize, descriptor, columnChunkMetaData, fixedLength, (DecimalVector) v, schemaElement);
        default:
          return new VarLengthColumnReaders.NullableVarBinaryColumn(parentReader, allocateSize, descriptor, columnChunkMetaData, fixedLength, (VarBinaryVector) v, schemaElement);
      }
  }
}
 
Example #17
Source Project: parquet-mr   Author: apache   File: TestInputFormat.java    License: Apache License 2.0
private BlockMetaData newBlock(long start, long compressedBlockSize) {
  BlockMetaData blockMetaData = new BlockMetaData();
  long uncompressedSize = compressedBlockSize * 2; // assuming a compression ratio of 2
  ColumnChunkMetaData column = ColumnChunkMetaData.get(ColumnPath.get("foo"),
                                                       PrimitiveTypeName.BINARY,
                                                       CompressionCodecName.GZIP,
                                                       new HashSet<Encoding>(Arrays.asList(Encoding.PLAIN)),
                                                       new BinaryStatistics(),
                                                       start, 0L, 0L, compressedBlockSize, uncompressedSize);
  blockMetaData.addColumn(column);
  blockMetaData.setTotalByteSize(uncompressedSize);
  return blockMetaData;
}
 
Example #18
Source Project: Bats   Author: lealone   File: PageReader.java    License: Apache License 2.0
PageReader(org.apache.drill.exec.store.parquet.columnreaders.ColumnReader<?> parentStatus, FileSystem fs, Path path, ColumnChunkMetaData columnChunkMetaData)
  throws ExecutionSetupException {
  this.parentColumnReader = parentStatus;
  allocatedDictionaryBuffers = new ArrayList<ByteBuf>();
  codecFactory = parentColumnReader.parentReader.getCodecFactory();
  this.stats = parentColumnReader.parentReader.parquetReaderStats;
  this.fileName = path.toString();
  debugName = new StringBuilder()
     .append(this.parentColumnReader.parentReader.getFragmentContext().getFragIdString())
     .append(":")
     .append(this.parentColumnReader.parentReader.getOperatorContext().getStats().getId() )
     .append(this.parentColumnReader.columnChunkMetaData.toString() )
     .toString();
  try {
    inputStream  = fs.open(path);
    BufferAllocator allocator =  parentColumnReader.parentReader.getOperatorContext().getAllocator();
    columnChunkMetaData.getTotalUncompressedSize();
    useBufferedReader  = parentColumnReader.parentReader.useBufferedReader;
    scanBufferSize = parentColumnReader.parentReader.bufferedReadSize;
    useFadvise = parentColumnReader.parentReader.useFadvise;
    enforceTotalSize = parentColumnReader.parentReader.enforceTotalSize;
    if (useBufferedReader) {
      this.dataReader = new BufferedDirectBufInputStream(inputStream, allocator, path.getName(),
          columnChunkMetaData.getStartingPos(), columnChunkMetaData.getTotalSize(), scanBufferSize,
          enforceTotalSize, useFadvise);
    } else {
      this.dataReader = new DirectBufInputStream(inputStream, allocator, path.getName(),
          columnChunkMetaData.getStartingPos(), columnChunkMetaData.getTotalSize(), enforceTotalSize,
          useFadvise);
    }
  } catch (IOException e) {
    throw new ExecutionSetupException("Error opening or reading metadata for parquet file at location: "
        + path.getName(), e);
  }

}
 
Example #19
Source Project: Bats   Author: lealone   File: PageReader.java    License: Apache License 2.0
protected void loadDictionaryIfExists(final org.apache.drill.exec.store.parquet.columnreaders.ColumnReader<?> parentStatus,
    final ColumnChunkMetaData columnChunkMetaData, final DirectBufInputStream f) throws IOException {
  Stopwatch timer = Stopwatch.createUnstarted();
  if (columnChunkMetaData.getDictionaryPageOffset() > 0) {
    long bytesToSkip = columnChunkMetaData.getDictionaryPageOffset() - dataReader.getPos();
    while (bytesToSkip > 0) {
      long skipped = dataReader.skip(bytesToSkip);
      if (skipped > 0) {
        bytesToSkip -= skipped;
      } else {
        // No good way to handle this: Guava uses InputStream.available() to check
        // whether EOF is reached, and because available() is not reliable,
        // try to read the rest of the data instead.
        DrillBuf skipBuf = dataReader.getNext((int) bytesToSkip);
        if (skipBuf != null) {
          skipBuf.release();
        } else {
          throw new EOFException("End of File reachecd.");
        }
      }
    }

    long start = dataReader.getPos();
    timer.start();
    final PageHeader pageHeader = Util.readPageHeader(f);
    long timeToRead = timer.elapsed(TimeUnit.NANOSECONDS);
    long pageHeaderBytes = dataReader.getPos() - start;
    this.updateStats(pageHeader, "Page Header", start, timeToRead, pageHeaderBytes, pageHeaderBytes);
    assert pageHeader.type == PageType.DICTIONARY_PAGE;
    readDictionaryPage(pageHeader, parentStatus);
  }
}
 
Example #20
Source Project: parquet-mr   Author: apache   File: DictionaryPageReader.java    License: Apache License 2.0
private boolean hasDictionaryPage(ColumnChunkMetaData column) {
  EncodingStats stats = column.getEncodingStats();
  if (stats != null) {
    // ensure there is a dictionary page and that it is used to encode data pages
    return stats.hasDictionaryPages() && stats.hasDictionaryEncodedPages();
  }

  Set<Encoding> encodings = column.getEncodings();
  return (encodings.contains(PLAIN_DICTIONARY) || encodings.contains(RLE_DICTIONARY));
}
 
Example #21
Source Project: parquet-mr   Author: apache   File: BloomFilterImpl.java    License: Apache License 2.0
@Override
public <T extends Comparable<T>> Boolean visit(Operators.Eq<T> eq) {
  T value = eq.getValue();

  if (value == null) {
    // The bloom filter bitset contains only non-null values, so it isn't helpful here. This
    // could check the column stats, but the StatisticsFilter is responsible for that.
    return BLOCK_MIGHT_MATCH;
  }

  Operators.Column<T> filterColumn = eq.getColumn();
  ColumnChunkMetaData meta = getColumnChunk(filterColumn.getColumnPath());
  if (meta == null) {
    // the column isn't in this file so all values are null, but the value
    // must be non-null because of the above check.
    return BLOCK_CANNOT_MATCH;
  }

  try {
    BloomFilter bloomFilter = bloomFilterReader.readBloomFilter(meta);
    if (bloomFilter != null && !bloomFilter.findHash(bloomFilter.hash(value))) {
      return BLOCK_CANNOT_MATCH;
    }
  } catch (RuntimeException e) {
    LOG.warn(e.getMessage());
    return BLOCK_MIGHT_MATCH;
  }

  return BLOCK_MIGHT_MATCH;
}
 
Example #22
Source Project: parquet-mr   Author: apache   File: DictionaryFilter.java    License: Apache License 2.0
@Override
public <T extends Comparable<T>> Boolean visit(Gt<T> gt) {
  Column<T> filterColumn = gt.getColumn();
  ColumnChunkMetaData meta = getColumnChunk(filterColumn.getColumnPath());

  if (meta == null) {
    // the column is missing and always null, which is never greater than a
    // value. for all x, null is never > x.
    return BLOCK_CANNOT_MATCH;
  }

  // if the chunk has non-dictionary pages, don't bother decoding the
  // dictionary because the row group can't be eliminated.
  if (hasNonDictionaryPages(meta)) {
    return BLOCK_MIGHT_MATCH;
  }

  T value = gt.getValue();

  try {
    Set<T> dictSet = expandDictionary(meta);
    if (dictSet == null) {
      return BLOCK_MIGHT_MATCH;
    }

    Comparator<T> comparator = meta.getPrimitiveType().comparator();
    for (T entry : dictSet) {
      if (comparator.compare(value, entry) < 0) {
        return BLOCK_MIGHT_MATCH;
      }
    }

    return BLOCK_CANNOT_MATCH;
  } catch (IOException e) {
    LOG.warn("Failed to process dictionary for filter evaluation.", e);
  }

  return BLOCK_MIGHT_MATCH;
}
 
Example #23
Source Project: parquet-mr   Author: apache   File: ColumnIndexStoreImpl.java    License: Apache License 2.0
private ColumnIndexStoreImpl(ParquetFileReader reader, BlockMetaData block, Set<ColumnPath> paths) {
  // TODO[GS]: Offset index for every path will be required; pre-read the consecutive ones at once?
  // TODO[GS]: Pre-read column index based on filter?
  this.reader = reader;
  Map<ColumnPath, IndexStore> store = new HashMap<>();
  for (ColumnChunkMetaData column : block.getColumns()) {
    ColumnPath path = column.getPath();
    if (paths.contains(path)) {
      store.put(path, new IndexStoreImpl(column));
    }
  }
  this.store = store;
}
 
Example #24
Source Project: Bats   Author: lealone   File: ParquetSchema.java    License: Apache License 2.0
Map<String, Integer> buildChunkMap(BlockMetaData rowGroupMetadata) {
  // The column chunk metadata is not guaranteed to be in the same order as the columns in the schema,
  // so a map is constructed for fast access to the columnChunkMetadata corresponding
  // to an element in the schema.
  Map<String, Integer> columnChunkMetadataPositionsInList = new HashMap<>();

  int colChunkIndex = 0;
  for (ColumnChunkMetaData colChunk : rowGroupMetadata.getColumns()) {
    columnChunkMetadataPositionsInList.put(Arrays.toString(colChunk.getPath().toArray()), colChunkIndex);
    colChunkIndex++;
  }
  return columnChunkMetadataPositionsInList;
}
 
Example #25
Source Project: dremio-oss   Author: dremio   File: NullableFixedByteAlignedReaders.java    License: Apache License 2.0
CorruptionDetectingNullableDateReader(DeprecatedParquetVectorizedReader parentReader, int allocateSize,
    ColumnDescriptor descriptor, ColumnChunkMetaData columnChunkMetaData,
    boolean fixedLength, DateMilliVector v, SchemaElement schemaElement)
        throws ExecutionSetupException {
  super(parentReader, allocateSize, descriptor, columnChunkMetaData, fixedLength, v, schemaElement);
  dateVector = v;
}
 
Example #26
Source Project: parquet-mr   Author: apache   File: ColumnSizeCommand.java    License: Apache License 2.0
public Map<String, Long> getColumnSizeInBytes(Path inputFile) throws IOException {
  Map<String, Long> colSizes = new HashMap<>();
  ParquetMetadata pmd = ParquetFileReader.readFooter(new Configuration(), inputFile, ParquetMetadataConverter.NO_FILTER);

  for (BlockMetaData block : pmd.getBlocks()) {
    for (ColumnChunkMetaData column : block.getColumns()) {
      String colName = column.getPath().toDotString();
      colSizes.put(colName, column.getTotalSize() + colSizes.getOrDefault(colName, 0L));
    }
  }

  return colSizes;
}
 
Example #27
Source Project: parquet-mr   Author: apache   File: CompressionConveterTest.java    License: Apache License 2.0
private List<Long> getOffsets(TransParquetFileReader reader, ColumnChunkMetaData chunk) throws IOException {
  List<Long> offsets = new ArrayList<>();
  reader.setStreamPosition(chunk.getStartingPos());
  long readValues = 0;
  long totalChunkValues = chunk.getValueCount();
  while (readValues < totalChunkValues) {
    long curOffset = reader.getPos();
    PageHeader pageHeader = reader.readPageHeader();
    switch (pageHeader.type) {
      case DICTIONARY_PAGE:
        compressionConverter.readBlock(pageHeader.getCompressed_page_size(), reader);
        break;
      case DATA_PAGE:
        DataPageHeader headerV1 = pageHeader.data_page_header;
        offsets.add(curOffset);
        compressionConverter.readBlock(pageHeader.getCompressed_page_size(), reader);
        readValues += headerV1.getNum_values();
        break;
      case DATA_PAGE_V2:
        DataPageHeaderV2 headerV2 = pageHeader.data_page_header_v2;
        offsets.add(curOffset);
        int rlLength = headerV2.getRepetition_levels_byte_length();
        compressionConverter.readBlock(rlLength, reader);
        int dlLength = headerV2.getDefinition_levels_byte_length();
        compressionConverter.readBlock(dlLength, reader);
        int payLoadLength = pageHeader.getCompressed_page_size() - rlLength - dlLength;
        compressionConverter.readBlock(payLoadLength, reader);
        readValues += headerV2.getNum_values();
        break;
      default:
        throw new IOException("Not recognized page type");
    }
  }
  return offsets;
}
 
Example #28
Source Project: parquet-mr   Author: apache   File: MetadataUtils.java    License: Apache License 2.0
private static void showColumnChunkDetails(PrettyPrintWriter out, Map<String,Object> current, int depth) {
  for (Map.Entry<String,Object> entry : current.entrySet()) {
    String name = Strings.repeat(".", depth) + entry.getKey();
    Object value = entry.getValue();

    if (value instanceof Map) {
      out.println(name + ": ");
      showColumnChunkDetails(out, (Map<String,Object>)value, depth + 1);
    } else {
      out.print(name + ": ");
      showDetails(out, (ColumnChunkMetaData)value, false);
    }
  }
}
 
Example #29
Source Project: Bats   Author: lealone   File: ColumnReaderFactory.java    License: Apache License 2.0
static VarLengthValuesColumn<?> getReader(ParquetRecordReader parentReader, ColumnDescriptor descriptor,
                                        ColumnChunkMetaData columnChunkMetaData, boolean fixedLength, ValueVector v,
                                        SchemaElement schemaElement
) throws ExecutionSetupException {
  ConvertedType convertedType = schemaElement.getConverted_type();
  switch (descriptor.getMaxDefinitionLevel()) {
    case 0:
      if (convertedType == null) {
        return new VarLengthColumnReaders.VarBinaryColumn(parentReader, descriptor, columnChunkMetaData, fixedLength, (VarBinaryVector) v, schemaElement);
      }
      switch (convertedType) {
        case UTF8:
        case ENUM:
          return new VarLengthColumnReaders.VarCharColumn(parentReader, descriptor, columnChunkMetaData, fixedLength, (VarCharVector) v, schemaElement);
        case DECIMAL:
          if (v instanceof VarDecimalVector) {
            return new VarLengthColumnReaders.VarDecimalColumn(parentReader, descriptor, columnChunkMetaData, fixedLength, (VarDecimalVector) v, schemaElement);
          }
        default:
          return new VarLengthColumnReaders.VarBinaryColumn(parentReader, descriptor, columnChunkMetaData, fixedLength, (VarBinaryVector) v, schemaElement);
      }
    default:
      if (convertedType == null) {
        return new VarLengthColumnReaders.NullableVarBinaryColumn(parentReader, descriptor, columnChunkMetaData, fixedLength, (NullableVarBinaryVector) v, schemaElement);
      }

      switch (convertedType) {
        case UTF8:
        case ENUM:
          return new VarLengthColumnReaders.NullableVarCharColumn(parentReader, descriptor, columnChunkMetaData, fixedLength, (NullableVarCharVector) v, schemaElement);
        case DECIMAL:
          if (v instanceof NullableVarDecimalVector) {
            return new VarLengthColumnReaders.NullableVarDecimalColumn(parentReader, descriptor, columnChunkMetaData, fixedLength, (NullableVarDecimalVector) v, schemaElement);
          }
        default:
          return new VarLengthColumnReaders.NullableVarBinaryColumn(parentReader, descriptor, columnChunkMetaData, fixedLength, (NullableVarBinaryVector) v, schemaElement);
      }
  }
}
 
Example #30
Source Project: dremio-oss   Author: dremio   File: ColumnChunkIncReadStore.java    License: Apache License 2.0
public ColumnChunkIncPageReader(ColumnChunkMetaData metaData, ColumnDescriptor columnDescriptor, BulkInputStream in) throws IOException {
  this.metaData = metaData;
  this.columnDescriptor = columnDescriptor;
  this.size = metaData.getTotalSize();
  this.fileOffset = metaData.getStartingPos();
  this.in = in;
  this.decompressor = codecFactory.getDecompressor(metaData.getCodec());
}