Java Code Examples for org.apache.parquet.column.ColumnDescriptor#getPrimitiveType()

The following examples show how to use org.apache.parquet.column.ColumnDescriptor#getPrimitiveType(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also check out the related API usage on the sidebar.

Example 1

Source File: ParquetFileWriter.java From parquet-mr with Apache License 2.0

6 votes

/**
 * Begins a new column chunk within the current block.
 *
 * <p>Advances the writer state, then resets the per-chunk bookkeeping
 * (encodings, path, type, codec, value count, byte counters, statistics and
 * the column/offset index builders) and records the current output position
 * as the position of the chunk's first data page.
 *
 * @param descriptor the column descriptor; supplies the chunk path and primitive type
 * @param valueCount the value count in this column
 * @param compressionCodecName the compression codec name recorded for this chunk
 * @throws IOException if there is an error while writing
 */
public void startColumn(ColumnDescriptor descriptor,
                        long valueCount,
                        CompressionCodecName compressionCodecName) throws IOException {
  // State transition comes first: nothing below runs if it is rejected.
  this.state = this.state.startColumn();

  // Per-chunk encoding bookkeeping starts empty.
  this.encodingStatsBuilder.clear();
  this.currentEncodings = new HashSet<>();

  // Identity of the chunk being written.
  this.currentChunkPath = ColumnPath.get(descriptor.getPath());
  this.currentChunkType = descriptor.getPrimitiveType();
  this.currentChunkCodec = compressionCodecName;
  this.currentChunkValueCount = valueCount;

  // Current output position is remembered as the first data page of the chunk.
  this.currentChunkFirstDataPage = this.out.getPos();
  this.compressedLength = 0;
  this.uncompressedLength = 0;

  // Left null on purpose: the statistics will be copied from the first one added at
  // writeDataPage(s), so that we end up with the correctly typed instance.
  this.currentStatistics = null;

  // Fresh index builders for this chunk; -1 marks "no page offset recorded yet".
  this.columnIndexBuilder =
      ColumnIndexBuilder.getBuilder(this.currentChunkType, this.columnIndexTruncateLength);
  this.offsetIndexBuilder = OffsetIndexBuilder.getBuilder();
  this.firstPageOffset = -1;
}

Example 2

Source File: ArrowVectorAccessors.java From iceberg with Apache License 2.0

5 votes

/**
 * Picks the accessor for the vector carried by {@code holder}.
 *
 * <p>All holder properties are read up front; a dictionary-encoded vector is
 * routed to {@code getDictionaryVectorAccessor}, anything else to
 * {@code getPlainVectorAccessor}.
 *
 * @param holder the vector holder providing the vector, its descriptor and dictionary
 * @return the accessor matching the holder's encoding
 */
static ArrowVectorAccessor getVectorAccessor(VectorHolder holder) {
  final Dictionary dict = holder.dictionary();
  final boolean dictionaryEncoded = holder.isDictionaryEncoded();
  final ColumnDescriptor columnDescriptor = holder.descriptor();
  final FieldVector fieldVector = holder.vector();
  // Read unconditionally, even though only the dictionary path consumes it.
  final PrimitiveType primitiveType = columnDescriptor.getPrimitiveType();

  if (!dictionaryEncoded) {
    return getPlainVectorAccessor(fieldVector);
  }
  return getDictionaryVectorAccessor(dict, columnDescriptor, fieldVector, primitiveType);
}

Example 3

Source File: ColumnIndexValidator.java From parquet-mr with Apache License 2.0

4 votes

/**
 * Walks every row group of {@code file} and runs a {@code PageValidator} over each
 * page of each column chunk that has a column index.
 *
 * <p>Column chunks without a column index are skipped. Each validator receives the
 * shared violations list, the page's min/max values, the min/max of the previous
 * non-null page, the boundary order and the page's null count.
 *
 * @param file the Parquet file to check
 * @return the list handed to every page validator (empty if nothing was reported)
 * @throws IOException if opening or reading the file fails
 */
public static List<ContractViolation> checkContractViolations(InputFile file) throws IOException {
  final List<ContractViolation> found = new ArrayList<>();
  try (ParquetFileReader reader = ParquetFileReader.open(file)) {
    final MessageType schema = reader.getFooter().getFileMetaData().getSchema();
    final List<ColumnDescriptor> descriptors = schema.getColumns();
    final List<BlockMetaData> blocks = reader.getFooter().getBlocks();

    int rowGroupNumber = 0;
    for (PageReadStore rowGroup = reader.readNextRowGroup();
         rowGroup != null;
         rowGroup = reader.readNextRowGroup(), rowGroupNumber++) {
      final ColumnReadStore readStore = new ColumnReadStoreImpl(
          rowGroup, new DummyRecordConverter(schema).getRootConverter(), schema, null);
      final List<ColumnChunkMetaData> chunks = blocks.get(rowGroupNumber).getColumns();
      assert chunks.size() == descriptors.size();

      for (int columnNumber = 0; columnNumber < descriptors.size(); ++columnNumber) {
        final ColumnDescriptor descriptor = descriptors.get(columnNumber);
        final ColumnChunkMetaData chunk = chunks.get(columnNumber);
        final ColumnIndex columnIndex = reader.readColumnIndex(chunk);
        if (columnIndex == null) {
          // Nothing to validate for a chunk that carries no column index.
          continue;
        }

        final ColumnPath path = chunk.getPath();
        final OffsetIndex offsetIndex = reader.readOffsetIndex(chunk);
        final List<ByteBuffer> mins = columnIndex.getMinValues();
        final List<ByteBuffer> maxs = columnIndex.getMaxValues();
        final BoundaryOrder order = columnIndex.getBoundaryOrder();
        final List<Long> nullCounts = columnIndex.getNullCounts();
        final List<Boolean> nullPages = columnIndex.getNullPages();
        final ColumnReader valueReader = readStore.getColumnReader(descriptor);

        // Row cursor and previous-page bounds carry over from page to page within the chunk.
        long nextRow = 0;
        ByteBuffer previousMin = null;
        ByteBuffer previousMax = null;

        for (int pageNumber = 0; pageNumber < offsetIndex.getPageCount(); ++pageNumber) {
          final boolean nullPage = nullPages.get(pageNumber);
          final ByteBuffer pageMin = mins.get(pageNumber);
          final ByteBuffer pageMax = maxs.get(pageNumber);
          final PageValidator validator = new PageValidator(
              descriptor.getPrimitiveType(),
              rowGroupNumber, columnNumber, path, pageNumber,
              found, valueReader,
              pageMin, pageMax,
              previousMin, previousMax,
              order,
              nullCounts.get(pageNumber),
              nullPage);

          // Only non-null pages update the "previous" bounds seen by the next page.
          if (!nullPage) {
            previousMin = pageMin;
            previousMax = pageMax;
          }

          // Validate one row at a time up to and including the page's last row.
          final long lastRowOfPage = offsetIndex.getLastRowIndex(pageNumber, rowGroup.getRowCount());
          for (; nextRow <= lastRowOfPage; ++nextRow) {
            validator.validateValuesBelongingToRow();
          }
          validator.finishPage();
        }
      }
    }
  }
  return found;
}