Java Code Examples for org.apache.parquet.column.ColumnDescriptor#getType()

The following examples show how to use org.apache.parquet.column.ColumnDescriptor#getType(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: DefaultV2ValuesWriterFactory.java    From parquet-mr with Apache License 2.0 6 votes vote down vote up
/**
 * Selects the values writer for a column by dispatching on its primitive type.
 *
 * <p>Every type except BOOLEAN hands the descriptor on to its type-specific
 * factory method; the BOOLEAN factory method takes no arguments.
 *
 * @param descriptor the column whose primitive type selects the writer
 * @return the writer returned by the matching type-specific factory method
 * @throws IllegalArgumentException if the column's type matches none of the cases below
 */
@Override
public ValuesWriter newValuesWriter(ColumnDescriptor descriptor) {
  switch (descriptor.getType()) {
    // Integer-like types.
    case INT32:
      return getInt32ValuesWriter(descriptor);
    case INT64:
      return getInt64ValuesWriter(descriptor);
    case INT96:
      return getInt96ValuesWriter(descriptor);

    // Floating-point types.
    case FLOAT:
      return getFloatValuesWriter(descriptor);
    case DOUBLE:
      return getDoubleValuesWriter(descriptor);

    // Byte-array types.
    case BINARY:
      return getBinaryValuesWriter(descriptor);
    case FIXED_LEN_BYTE_ARRAY:
      return getFixedLenByteArrayValuesWriter(descriptor);

    case BOOLEAN:
      return getBooleanValuesWriter();

    default:
      throw new IllegalArgumentException("Unknown type " + descriptor.getType());
  }
}
 
Example 2
Source File: DefaultV1ValuesWriterFactory.java    From parquet-mr with Apache License 2.0 6 votes vote down vote up
/**
 * Returns the values writer appropriate for the given column, chosen by the
 * column's primitive type.
 *
 * @param descriptor the column to create a writer for; passed on to every
 *                   type-specific factory method except the BOOLEAN one
 * @return the writer built by the factory method for the column's type
 * @throws IllegalArgumentException if no case below matches the column's type
 */
@Override
public ValuesWriter newValuesWriter(ColumnDescriptor descriptor) {
  switch (descriptor.getType()) {
    case BINARY:
      return getBinaryValuesWriter(descriptor);
    case FIXED_LEN_BYTE_ARRAY:
      return getFixedLenByteArrayValuesWriter(descriptor);
    case BOOLEAN:
      // The boolean writer does not depend on the descriptor.
      return getBooleanValuesWriter();
    case DOUBLE:
      return getDoubleValuesWriter(descriptor);
    case FLOAT:
      return getFloatValuesWriter(descriptor);
    case INT32:
      return getInt32ValuesWriter(descriptor);
    case INT64:
      return getInt64ValuesWriter(descriptor);
    case INT96:
      return getInt96ValuesWriter(descriptor);
    default:
      throw new IllegalArgumentException("Unknown type " + descriptor.getType());
  }
}
 
Example 3
Source File: DefaultValuesWriterFactory.java    From parquet-mr with Apache License 2.0 6 votes vote down vote up
/**
 * Builds the dictionary values writer that corresponds to a column's primitive type.
 *
 * <p>Each writer is constructed with the dictionary page size threshold and the
 * allocator taken from {@code properties}, plus the two encodings supplied by
 * the caller. Fixed-width byte-array writers additionally receive the entry
 * length: 12 for INT96 (96 bits), and the column's declared type length for
 * FIXED_LEN_BYTE_ARRAY.
 *
 * @param path             the column descriptor; its type selects the writer
 * @param properties       source of the dictionary page size threshold and the allocator
 * @param dictPageEncoding encoding handed to the writer for the dictionary page
 * @param dataPageEncoding encoding handed to the writer for the data pages
 * @return a new dictionary values writer for the column's type
 * @throws IllegalArgumentException for BOOLEAN columns (no dictionary encoding)
 *                                  and for any type not listed below
 */
static DictionaryValuesWriter dictionaryWriter(ColumnDescriptor path, ParquetProperties properties, Encoding dictPageEncoding, Encoding dataPageEncoding) {
  switch (path.getType()) {
    case INT32:
      return new DictionaryValuesWriter.PlainIntegerDictionaryValuesWriter(
          properties.getDictionaryPageSizeThreshold(), dataPageEncoding, dictPageEncoding, properties.getAllocator());
    case INT64:
      return new DictionaryValuesWriter.PlainLongDictionaryValuesWriter(
          properties.getDictionaryPageSizeThreshold(), dataPageEncoding, dictPageEncoding, properties.getAllocator());
    case FLOAT:
      return new DictionaryValuesWriter.PlainFloatDictionaryValuesWriter(
          properties.getDictionaryPageSizeThreshold(), dataPageEncoding, dictPageEncoding, properties.getAllocator());
    case DOUBLE:
      return new DictionaryValuesWriter.PlainDoubleDictionaryValuesWriter(
          properties.getDictionaryPageSizeThreshold(), dataPageEncoding, dictPageEncoding, properties.getAllocator());
    case BINARY:
      return new DictionaryValuesWriter.PlainBinaryDictionaryValuesWriter(
          properties.getDictionaryPageSizeThreshold(), dataPageEncoding, dictPageEncoding, properties.getAllocator());
    case INT96:
      // INT96 values are stored as fixed 12-byte arrays.
      return new DictionaryValuesWriter.PlainFixedLenArrayDictionaryValuesWriter(
          properties.getDictionaryPageSizeThreshold(), 12, dataPageEncoding, dictPageEncoding, properties.getAllocator());
    case FIXED_LEN_BYTE_ARRAY:
      // Entry width comes from the column's own declared type length.
      return new DictionaryValuesWriter.PlainFixedLenArrayDictionaryValuesWriter(
          properties.getDictionaryPageSizeThreshold(), path.getTypeLength(), dataPageEncoding, dictPageEncoding, properties.getAllocator());
    case BOOLEAN:
      throw new IllegalArgumentException("no dictionary encoding for BOOLEAN");
    default:
      throw new IllegalArgumentException("Unknown type " + path.getType());
  }
}
 
Example 4
Source File: LocalDictionariesReader.java    From dremio-oss with Apache License 2.0 5 votes vote down vote up
/**
 * Prints every entry of a column's dictionary to standard output.
 *
 * <p>A header line naming the column is printed first, followed by one
 * {@code "id: value"} line per dictionary id. The value is decoded according to
 * the column's primitive type; INT96, BINARY and FIXED_LEN_BYTE_ARRAY entries
 * are rendered by interpreting their raw bytes as UTF-8 text. Types without a
 * case below are skipped silently.
 *
 * @param columnDescriptor the column the dictionary belongs to; its type selects the decoder
 * @param localDictionary  the dictionary whose entries are printed
 */
public static void printDictionary(ColumnDescriptor columnDescriptor, Dictionary localDictionary) {
  System.out.println("Dictionary for column " + columnDescriptor.toString());
  // getMaxId() is the highest valid id, not the entry count, so the bound is
  // inclusive; a strict '<' would drop the last dictionary entry.
  final int maxId = localDictionary.getMaxId();
  for (int i = 0; i <= maxId; ++i) {
    switch (columnDescriptor.getType()) {
      case INT32:
        System.out.println(format("%d: %d", i, localDictionary.decodeToInt(i)));
        break;
      case INT64:
        System.out.println(format("%d: %d", i, localDictionary.decodeToLong(i)));
        break;
      case INT96:
      case BINARY:
      case FIXED_LEN_BYTE_ARRAY:
        // Decode with an explicit charset so the output does not depend on the
        // platform default encoding.
        System.out.println(format("%d: %s", i,
            new String(localDictionary.decodeToBinary(i).getBytesUnsafe(), java.nio.charset.StandardCharsets.UTF_8)));
        break;
      case FLOAT:
        System.out.println(format("%d: %f", i, localDictionary.decodeToFloat(i)));
        break;
      case DOUBLE:
        System.out.println(format("%d: %f", i, localDictionary.decodeToDouble(i)));
        break;
      case BOOLEAN:
        System.out.println(format("%d: %b", i, localDictionary.decodeToBoolean(i)));
        break;
      default:
        break;
    }
  }
}
 
Example 5
Source File: DeprecatedParquetVectorizedReader.java    From dremio-oss with Apache License 2.0 5 votes vote down vote up
/**
 * Returns the data type length, in bits, for a given {@link ColumnDescriptor} and its
 * corresponding {@link SchemaElement}. Neither is enough on its own: the max
 * repetition level (indicating whether the column is an array type) lives in the
 * ColumnDescriptor, while the length of a fixed width field is stored at the
 * schema level.
 *
 * <p>BINARY columns and repeated columns (max repetition level above zero) have
 * no fixed width and yield -1.
 *
 * @param column the column descriptor supplying the primitive type and max repetition level
 * @param se     the schema element supplying the type length of FIXED_LEN_BYTE_ARRAY columns
 * @return the length if fixed width, else -1
 */
private int getDataTypeLength(ColumnDescriptor column, SchemaElement se) {
  // Variable-width binary never has a fixed length.
  if (column.getType() == PrimitiveType.PrimitiveTypeName.BINARY) {
    return -1;
  }
  // Repeated columns are treated as variable width as well.
  if (column.getMaxRepetitionLevel() > 0) {
    return -1;
  }
  if (column.getType() != PrimitiveType.PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY) {
    return getTypeLengthInBits(column.getType());
  }
  // The schema-level type length is multiplied by 8 to express it in bits.
  return se.getType_length() * 8;
}
 
Example 6
Source File: MetadataUtils.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Writes a one-line summary of a column descriptor: its dot-joined path, its
 * primitive type, and its max repetition and definition levels.
 *
 * @param out  the writer receiving the formatted line
 * @param desc the column descriptor to describe
 */
public static void showDetails(PrettyPrintWriter out, ColumnDescriptor desc) {
  // Null path segments are dropped before joining with '.'.
  final String dottedPath = Joiner.on(".").skipNulls().join(desc.getPath());
  final PrimitiveTypeName primitiveType = desc.getType();
  final int maxDefinitionLevel = desc.getMaxDefinitionLevel();
  final int maxRepetitionLevel = desc.getMaxRepetitionLevel();

  // Output order is repetition level (R) first, then definition level (D).
  out.format("column desc: %s T:%s R:%d D:%d%n",
      dottedPath, primitiveType, maxRepetitionLevel, maxDefinitionLevel);
}
 
Example 7
Source File: ColumnReaderFactory.java    From Bats with Apache License 2.0 4 votes vote down vote up
/**
 * Creates the nullable column reader for a column chunk.
 *
 * <p>When the chunk's encodings do not include {@code PLAIN_DICTIONARY}, INT96
 * columns get an INT96 reader, DECIMAL-annotated columns get a var-decimal
 * reader, and everything else gets the generic fixed-byte-aligned reader.
 * Otherwise a dictionary reader is chosen from the column's primitive type and,
 * for INT32/INT64, the schema element's converted type.
 *
 * @param parentReader        the record reader that owns the new column reader
 * @param columnDescriptor    descriptor of the column; its primitive type drives the selection
 * @param columnChunkMetaData metadata of the chunk; its encodings decide between the two paths
 * @param fixedLength         forwarded to the created reader, except for INT96 readers,
 *                            which are always created with {@code true}
 * @param valueVec            destination vector; cast to the vector type the chosen reader expects
 * @param schemaElement       schema element supplying the converted type
 * @return the reader matching the column's encoding, primitive type and converted type
 * @throws ExecutionSetupException if the dictionary-encoded column has an unsupported
 *                                 primitive type or converted type
 */
public static NullableColumnReader<?> getNullableColumnReader(ParquetRecordReader parentReader,
                                                           ColumnDescriptor columnDescriptor,
                                                           ColumnChunkMetaData columnChunkMetaData,
                                                           boolean fixedLength,
                                                           ValueVector valueVec,
                                                           SchemaElement schemaElement) throws ExecutionSetupException {
  final ConvertedType convertedType = schemaElement.getConverted_type();
  final boolean dictionaryEncoded = columnChunkMetaData.getEncodings().contains(Encoding.PLAIN_DICTIONARY);

  if (!dictionaryEncoded) {
    if (columnDescriptor.getType() == PrimitiveType.PrimitiveTypeName.INT96) {
      return newNullableInt96Reader(parentReader, columnDescriptor, columnChunkMetaData, valueVec, schemaElement);
    }
    if (convertedType == ConvertedType.DECIMAL) {
      // NullableVarDecimalVector allows storing of values with different width,
      // so every time when the value is added, offset vector should be updated.
      // Therefore NullableVarDecimalReader is used here instead of NullableFixedByteAlignedReader.
      return new NullableFixedByteAlignedReaders.NullableVarDecimalReader(
          parentReader, columnDescriptor, columnChunkMetaData, fixedLength, (NullableVarDecimalVector) valueVec, schemaElement);
    }
    return new NullableFixedByteAlignedReaders.NullableFixedByteAlignedReader<>(
        parentReader, columnDescriptor, columnChunkMetaData, fixedLength, valueVec, schemaElement);
  }

  // Dictionary-encoded chunk: choose the reader from the primitive type.
  switch (columnDescriptor.getType()) {
    case INT32:
      if (convertedType == null) {
        return new NullableFixedByteAlignedReaders.NullableDictionaryIntReader(
            parentReader, columnDescriptor, columnChunkMetaData, fixedLength, (NullableIntVector) valueVec, schemaElement);
      }
      switch (convertedType) {
        case DECIMAL:
          return new NullableFixedByteAlignedReaders.NullableDictionaryVarDecimalReader(
              parentReader, columnDescriptor, columnChunkMetaData, fixedLength, (NullableVarDecimalVector) valueVec, schemaElement);
        case TIME_MILLIS:
          return new NullableFixedByteAlignedReaders.NullableDictionaryTimeReader(
              parentReader, columnDescriptor, columnChunkMetaData, fixedLength, (NullableTimeVector) valueVec, schemaElement);
        default:
          throw new ExecutionSetupException("Unsupported nullable converted type " + convertedType + " for primitive type INT32");
      }
    case INT64:
      if (convertedType == null) {
        return new NullableFixedByteAlignedReaders.NullableDictionaryBigIntReader(
            parentReader, columnDescriptor, columnChunkMetaData, fixedLength, (NullableBigIntVector) valueVec, schemaElement);
      }
      switch (convertedType) {
        case DECIMAL:
          return new NullableFixedByteAlignedReaders.NullableDictionaryVarDecimalReader(
              parentReader, columnDescriptor, columnChunkMetaData, fixedLength, (NullableVarDecimalVector) valueVec, schemaElement);
        case TIMESTAMP_MILLIS:
          return new NullableFixedByteAlignedReaders.NullableDictionaryTimeStampReader(
              parentReader, columnDescriptor, columnChunkMetaData, fixedLength, (NullableTimeStampVector) valueVec, schemaElement);
        // DRILL-6670: handle TIMESTAMP_MICROS as INT64 with no logical type
        case TIMESTAMP_MICROS:
          return new NullableFixedByteAlignedReaders.NullableDictionaryBigIntReader(
              parentReader, columnDescriptor, columnChunkMetaData, fixedLength, (NullableBigIntVector) valueVec, schemaElement);
        default:
          throw new ExecutionSetupException("Unsupported nullable converted type " + convertedType + " for primitive type INT64");
      }
    case INT96:
      return newNullableInt96Reader(parentReader, columnDescriptor, columnChunkMetaData, valueVec, schemaElement);
    case FLOAT:
      return new NullableFixedByteAlignedReaders.NullableDictionaryFloat4Reader(
          parentReader, columnDescriptor, columnChunkMetaData, fixedLength, (NullableFloat4Vector) valueVec, schemaElement);
    case DOUBLE:
      return new NullableFixedByteAlignedReaders.NullableDictionaryFloat8Reader(
          parentReader, columnDescriptor, columnChunkMetaData, fixedLength, (NullableFloat8Vector) valueVec, schemaElement);
    default:
      throw new ExecutionSetupException("Unsupported nullable column type " + columnDescriptor.getType().name() );
  }
}

/**
 * Creates the reader for an INT96 column: a timestamp reader when the
 * {@code PARQUET_READER_INT96_AS_TIMESTAMP} option is set, otherwise a plain
 * fixed-binary reader. Both are always created with {@code fixedLength == true}.
 */
private static NullableColumnReader<?> newNullableInt96Reader(ParquetRecordReader parentReader,
                                                              ColumnDescriptor columnDescriptor,
                                                              ColumnChunkMetaData columnChunkMetaData,
                                                              ValueVector valueVec,
                                                              SchemaElement schemaElement) throws ExecutionSetupException {
  // TODO: check convertedType once parquet support TIMESTAMP_NANOS type annotation.
  if (parentReader.getFragmentContext().getOptions().getOption(ExecConstants.PARQUET_READER_INT96_AS_TIMESTAMP).bool_val) {
    return new NullableFixedByteAlignedReaders.NullableFixedBinaryAsTimeStampReader(
        parentReader, columnDescriptor, columnChunkMetaData, true, (NullableTimeStampVector) valueVec, schemaElement);
  }
  return new NullableFixedByteAlignedReaders.NullableFixedBinaryReader(
      parentReader, columnDescriptor, columnChunkMetaData, true, (NullableVarBinaryVector) valueVec, schemaElement);
}
 
Example 8
Source File: PageIterator.java    From iceberg with Apache License 2.0 4 votes vote down vote up
/**
 * Creates a page iterator whose {@code next()} returns values of the Java type
 * that corresponds to the column's primitive type: Boolean, Integer, Long,
 * Float, Double, or Binary (for both BINARY and FIXED_LEN_BYTE_ARRAY).
 *
 * <p>The cast to {@code PageIterator<T>} is unchecked; the caller is
 * responsible for requesting a {@code T} that matches the column type.
 *
 * @param desc          the column descriptor; its type selects the typed {@code next()}
 * @param writerVersion passed through to the iterator's constructor
 * @param <T>           the value type the caller expects from the iterator
 * @return a new iterator for the column
 * @throws UnsupportedOperationException if the column's type has no case below
 */
@SuppressWarnings("unchecked")
static <T> PageIterator<T> newIterator(ColumnDescriptor desc, String writerVersion) {
  switch (desc.getType()) {
    case BINARY:
    case FIXED_LEN_BYTE_ARRAY:
      // Both byte-array types are surfaced as Binary.
      return (PageIterator<T>) new PageIterator<Binary>(desc, writerVersion) {
        @Override
        public Binary next() {
          return nextBinary();
        }
      };
    case BOOLEAN:
      return (PageIterator<T>) new PageIterator<Boolean>(desc, writerVersion) {
        @Override
        public Boolean next() {
          return nextBoolean();
        }
      };
    case DOUBLE:
      return (PageIterator<T>) new PageIterator<Double>(desc, writerVersion) {
        @Override
        public Double next() {
          return nextDouble();
        }
      };
    case FLOAT:
      return (PageIterator<T>) new PageIterator<Float>(desc, writerVersion) {
        @Override
        public Float next() {
          return nextFloat();
        }
      };
    case INT32:
      return (PageIterator<T>) new PageIterator<Integer>(desc, writerVersion) {
        @Override
        public Integer next() {
          return nextInteger();
        }
      };
    case INT64:
      return (PageIterator<T>) new PageIterator<Long>(desc, writerVersion) {
        @Override
        public Long next() {
          return nextLong();
        }
      };
    default:
      throw new UnsupportedOperationException("Unsupported primitive type: " + desc.getType());
  }
}
 
Example 9
Source File: ColumnIterator.java    From iceberg with Apache License 2.0 4 votes vote down vote up
/**
 * Creates a column iterator typed after the column's primitive type. The
 * returned iterator's {@code next()} delegates to the matching typed accessor
 * (nextBoolean, nextInteger, nextLong, nextFloat, nextDouble or nextBinary).
 *
 * <p>BINARY and FIXED_LEN_BYTE_ARRAY columns both produce Binary values. The
 * cast to {@code ColumnIterator<T>} is unchecked, so the caller must ask for a
 * {@code T} consistent with the column type.
 *
 * @param desc          the column descriptor; its type selects the typed {@code next()}
 * @param writerVersion passed through to the iterator's constructor
 * @param <T>           the value type the caller expects from the iterator
 * @return a new iterator for the column
 * @throws UnsupportedOperationException if the column's type has no case below
 */
@SuppressWarnings("unchecked")
static <T> ColumnIterator<T> newIterator(ColumnDescriptor desc, String writerVersion) {
  switch (desc.getType()) {
    case INT32:
      return (ColumnIterator<T>) new ColumnIterator<Integer>(desc, writerVersion) {
        @Override
        public Integer next() {
          return nextInteger();
        }
      };
    case INT64:
      return (ColumnIterator<T>) new ColumnIterator<Long>(desc, writerVersion) {
        @Override
        public Long next() {
          return nextLong();
        }
      };
    case FLOAT:
      return (ColumnIterator<T>) new ColumnIterator<Float>(desc, writerVersion) {
        @Override
        public Float next() {
          return nextFloat();
        }
      };
    case DOUBLE:
      return (ColumnIterator<T>) new ColumnIterator<Double>(desc, writerVersion) {
        @Override
        public Double next() {
          return nextDouble();
        }
      };
    case BOOLEAN:
      return (ColumnIterator<T>) new ColumnIterator<Boolean>(desc, writerVersion) {
        @Override
        public Boolean next() {
          return nextBoolean();
        }
      };
    case BINARY:
    case FIXED_LEN_BYTE_ARRAY:
      return (ColumnIterator<T>) new ColumnIterator<Binary>(desc, writerVersion) {
        @Override
        public Binary next() {
          return nextBinary();
        }
      };
    default:
      throw new UnsupportedOperationException("Unsupported primitive type: " + desc.getType());
  }
}
 
Example 10
Source File: GlobalDictionaryBuilder.java    From dremio-oss with Apache License 2.0 4 votes vote down vote up
/**
 * Builds the global dictionary for a column and writes it to a dictionary file.
 *
 * <p>The output file is created first (passing {@code true} to
 * {@code fs.create}; presumably "overwrite" - confirm against the FileSystem
 * API in use), then the dictionary is built for the column's primitive type and
 * written. The built container is closed before the stream.
 *
 * @param fs               file system used to create the dictionary file
 * @param dictionaryFile   path of the file to create
 * @param columnDescriptor column whose primitive type selects the dictionary builder
 * @param dictionaries     dictionaries handed to the builder
 * @param existingDict     existing dictionary container handed to the builder
 * @param bufferAllocator  allocator handed to the builder and to the writer
 * @throws IOException if the column type is not supported, or if creating or
 *                     writing the file fails
 */
private static void createDictionaryFile(FileSystem fs, Path dictionaryFile, ColumnDescriptor columnDescriptor, List<Dictionary> dictionaries,
                                         VectorContainer existingDict, BufferAllocator bufferAllocator) throws IOException {
  // Resources are closed in reverse order: the dictionary container, then the stream.
  try (final OutputStream out = fs.create(dictionaryFile, true);
       final VectorContainer dict = buildGlobalDictionaryForType(columnDescriptor, dictionaries, existingDict, bufferAllocator)) {
    writeDictionary(out, dict, dict.getRecordCount(), bufferAllocator);
  }
}

/**
 * Dispatches to the global-dictionary builder that matches the column's
 * primitive type; INT96, BINARY and FIXED_LEN_BYTE_ARRAY share the binary builder.
 */
private static VectorContainer buildGlobalDictionaryForType(ColumnDescriptor columnDescriptor, List<Dictionary> dictionaries,
                                                            VectorContainer existingDict, BufferAllocator bufferAllocator) throws IOException {
  switch (columnDescriptor.getType()) {
    case INT32:
      return buildIntegerGlobalDictionary(dictionaries, existingDict, columnDescriptor, bufferAllocator);
    case INT64:
      return buildLongGlobalDictionary(dictionaries, existingDict, columnDescriptor, bufferAllocator);
    case INT96:
    case BINARY:
    case FIXED_LEN_BYTE_ARRAY:
      return buildBinaryGlobalDictionary(dictionaries, existingDict, columnDescriptor, bufferAllocator);
    case FLOAT:
      return buildFloatGlobalDictionary(dictionaries, existingDict, columnDescriptor, bufferAllocator);
    case DOUBLE:
      return buildDoubleGlobalDictionary(dictionaries, existingDict, columnDescriptor, bufferAllocator);
    default:
      throw new IOException("Invalid data type " + columnDescriptor.getType());
  }
}
 
Example 11
Source File: ColumnReaderFactory.java    From dremio-oss with Apache License 2.0 4 votes vote down vote up
/**
 * Creates the nullable column reader for a column chunk.
 *
 * <p>When the chunk's encodings do not include {@code PLAIN_DICTIONARY}, INT96
 * columns get an INT96 reader and everything else gets the generic
 * fixed-byte-aligned reader. Otherwise a dictionary reader is chosen from the
 * column's primitive type and, for INT32/INT64, the schema element's converted type.
 *
 * @param parentReader        the vectorized reader that owns the new column reader
 * @param allocateSize        forwarded unchanged to the created reader
 * @param columnDescriptor    descriptor of the column; its primitive type drives the selection
 * @param columnChunkMetaData metadata of the chunk; its encodings decide between the two paths
 * @param fixedLength         forwarded to the created reader, except for INT96 readers,
 *                            which are always created with {@code true}
 * @param valueVec            destination vector; cast to the vector type the chosen reader expects
 * @param schemaElement       schema element supplying the converted type
 * @return the reader matching the column's encoding, primitive type and converted type
 * @throws ExecutionSetupException if the dictionary-encoded column has an unsupported
 *                                 primitive type or converted type
 */
public static NullableColumnReader<?> getNullableColumnReader(DeprecatedParquetVectorizedReader parentReader, int allocateSize,
                                                              ColumnDescriptor columnDescriptor,
                                                              ColumnChunkMetaData columnChunkMetaData,
                                                              boolean fixedLength,
                                                              ValueVector valueVec,
                                                              SchemaElement schemaElement) throws ExecutionSetupException {
  final ConvertedType convertedType = schemaElement.getConverted_type();
  final boolean dictionaryEncoded = columnChunkMetaData.getEncodings().contains(Encoding.PLAIN_DICTIONARY);

  if (!dictionaryEncoded) {
    if (columnDescriptor.getType() == PrimitiveType.PrimitiveTypeName.INT96) {
      return newNullableInt96Reader(parentReader, allocateSize, columnDescriptor, columnChunkMetaData, valueVec, schemaElement);
    }
    return new NullableFixedByteAlignedReaders.NullableFixedByteAlignedReader<>(
        parentReader, allocateSize, columnDescriptor, columnChunkMetaData, fixedLength, valueVec, schemaElement);
  }

  // Dictionary-encoded chunk: choose the reader from the primitive type.
  switch (columnDescriptor.getType()) {
    case INT32:
      if (convertedType == null) {
        return new NullableFixedByteAlignedReaders.NullableDictionaryIntReader(
            parentReader, allocateSize, columnDescriptor, columnChunkMetaData, fixedLength, (IntVector) valueVec, schemaElement);
      }
      switch (convertedType) {
        case DECIMAL:
          return new NullableFixedByteAlignedReaders.NullableDictionaryDecimal9Reader(
              parentReader, allocateSize, columnDescriptor, columnChunkMetaData, fixedLength, (DecimalVector) valueVec, schemaElement);
        case TIME_MILLIS:
          return new NullableFixedByteAlignedReaders.NullableDictionaryTimeReader(
              parentReader, allocateSize, columnDescriptor, columnChunkMetaData, fixedLength, (TimeMilliVector) valueVec, schemaElement);
        default:
          throw new ExecutionSetupException("Unsupported nullable converted type " + convertedType + " for primitive type INT32");
      }
    case INT64:
      if (convertedType == null) {
        return new NullableFixedByteAlignedReaders.NullableDictionaryBigIntReader(
            parentReader, allocateSize, columnDescriptor, columnChunkMetaData, fixedLength, (BigIntVector) valueVec, schemaElement);
      }
      switch (convertedType) {
        case DECIMAL:
          return new NullableFixedByteAlignedReaders.NullableDictionaryDecimal18Reader(
              parentReader, allocateSize, columnDescriptor, columnChunkMetaData, fixedLength, (DecimalVector) valueVec, schemaElement);
        case TIMESTAMP_MILLIS:
          return new NullableFixedByteAlignedReaders.NullableDictionaryTimeStampReader(
              parentReader, allocateSize, columnDescriptor, columnChunkMetaData, fixedLength, (TimeStampMilliVector) valueVec, schemaElement);
        default:
          throw new ExecutionSetupException("Unsupported nullable converted type " + convertedType + " for primitive type INT64");
      }
    case INT96:
      return newNullableInt96Reader(parentReader, allocateSize, columnDescriptor, columnChunkMetaData, valueVec, schemaElement);
    case FLOAT:
      return new NullableFixedByteAlignedReaders.NullableDictionaryFloat4Reader(
          parentReader, allocateSize, columnDescriptor, columnChunkMetaData, fixedLength, (Float4Vector) valueVec, schemaElement);
    case DOUBLE:
      return new NullableFixedByteAlignedReaders.NullableDictionaryFloat8Reader(
          parentReader, allocateSize, columnDescriptor, columnChunkMetaData, fixedLength, (Float8Vector) valueVec, schemaElement);
    default:
      throw new ExecutionSetupException("Unsupported nullable column type " + columnDescriptor.getType().name() );
  }
}

/**
 * Creates the reader for an INT96 column: a timestamp reader when the parent
 * reader reports {@code readInt96AsTimeStamp()}, otherwise a plain fixed-binary
 * reader. Both are always created with {@code fixedLength == true}.
 */
private static NullableColumnReader<?> newNullableInt96Reader(DeprecatedParquetVectorizedReader parentReader, int allocateSize,
                                                              ColumnDescriptor columnDescriptor,
                                                              ColumnChunkMetaData columnChunkMetaData,
                                                              ValueVector valueVec,
                                                              SchemaElement schemaElement) throws ExecutionSetupException {
  // TODO: check convertedType once parquet support TIMESTAMP_NANOS type annotation.
  if (parentReader.readInt96AsTimeStamp()) {
    return new NullableFixedByteAlignedReaders.NullableFixedBinaryAsTimeStampReader(
        parentReader, allocateSize, columnDescriptor, columnChunkMetaData, true, (TimeStampMilliVector) valueVec, schemaElement);
  }
  return new NullableFixedByteAlignedReaders.NullableFixedBinaryReader(
      parentReader, allocateSize, columnDescriptor, columnChunkMetaData, true, (VarBinaryVector) valueVec, schemaElement);
}
 
Example 12
Source File: DumpCommand.java    From parquet-mr with Apache License 2.0 4 votes vote down vote up
/**
 * Dumps every value of one column to the given writer.
 *
 * <p>A header line with the {@code page}/{@code total} numbers and the value
 * index range is printed first. Then, for each value, one line shows its
 * absolute index ({@code offset} plus position), its repetition level (R), its
 * definition level (D) and the value itself (V). A value whose definition level
 * is below the column's maximum is printed as {@code <null>}; otherwise it is
 * read with the accessor for the column's primitive type and rendered through
 * the primitive type's stringifier.
 *
 * @param out     writer receiving the dump
 * @param crstore store from which the column reader is obtained
 * @param column  the column to dump
 * @param page    number shown as the current row group in the header
 * @param total   number shown as the row group count in the header
 * @param offset  index of the first value, added to each printed value index
 * @throws IOException declared by the signature; no statement visible here throws it directly
 */
public static void dump(PrettyPrintWriter out, ColumnReadStoreImpl crstore, ColumnDescriptor column, long page, long total, long offset) throws IOException {
  final int maxDefinitionLevel = column.getMaxDefinitionLevel();
  final ColumnReader reader = crstore.getColumnReader(column);
  out.format("*** row group %d of %d, values %d to %d ***%n", page, total, offset, offset + reader.getTotalValueCount() - 1);

  final long valueCount = reader.getTotalValueCount();
  for (long index = 0; index < valueCount; ++index) {
    final int repetitionLevel = reader.getCurrentRepetitionLevel();
    final int definitionLevel = reader.getCurrentDefinitionLevel();

    out.format("value %d: R:%d D:%d V:", offset + index, repetitionLevel, definitionLevel);
    if (definitionLevel != maxDefinitionLevel) {
      // Not defined at the maximum level: there is no value to read.
      out.format("<null>");
    } else {
      final PrimitiveStringifier stringifier = column.getPrimitiveType().stringifier();
      switch (column.getType()) {
        case BINARY:
        case FIXED_LEN_BYTE_ARRAY:
        case INT96:
          out.print(stringifier.stringify(reader.getBinary()));
          break;
        case BOOLEAN:
          out.print(stringifier.stringify(reader.getBoolean()));
          break;
        case INT32:
          out.print(stringifier.stringify(reader.getInteger()));
          break;
        case INT64:
          out.print(stringifier.stringify(reader.getLong()));
          break;
        case FLOAT:
          out.print(stringifier.stringify(reader.getFloat()));
          break;
        case DOUBLE:
          out.print(stringifier.stringify(reader.getDouble()));
          break;
      }
    }

    out.println();
    // Advance the reader to the next value.
    reader.consume();
  }
}