Java Code Examples for org.apache.parquet.schema.PrimitiveType#getPrimitiveTypeName()

The following examples show how to use org.apache.parquet.schema.PrimitiveType#getPrimitiveTypeName(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: ColumnIndexValidator.java    From parquet-mr with Apache License 2.0 6 votes vote down vote up
/**
 * Selects the {@code StatValue.Builder} implementation for a column based on
 * its physical (primitive) type.
 *
 * @param type primitive type of the column; also handed to the created builder
 * @return a new builder constructed with {@code type}
 * @throws IllegalArgumentException if no builder exists for the physical type
 */
static StatValue.Builder getBuilder(PrimitiveType type) {
  final PrimitiveType.PrimitiveTypeName physicalType = type.getPrimitiveTypeName();
  switch (physicalType) {
  case BOOLEAN:
    return new BooleanStatValueBuilder(type);
  case INT32:
    return new IntStatValueBuilder(type);
  case INT64:
    return new LongStatValueBuilder(type);
  case FLOAT:
    return new FloatStatValueBuilder(type);
  case DOUBLE:
    return new DoubleStatValueBuilder(type);
  // All byte-oriented physical types share the binary builder.
  case INT96:
  case BINARY:
  case FIXED_LEN_BYTE_ARRAY:
    return new BinaryStatValueBuilder(type);
  default:
    throw new IllegalArgumentException("Unsupported type: " + type);
  }
}
 
Example 2
Source File: ParquetRecordWriter.java    From dremio-oss with Apache License 2.0 6 votes vote down vote up
/**
 * Changes the list inner '$data$' vector name to 'element' in the schema.
 *
 * <p>A primitive child is rebuilt with the same repetition, physical type,
 * type length, original type, decimal metadata and id. A group child is
 * rebuilt with the same repetition, original type and fields, and its id is
 * re-attached when it has one.
 *
 * @param childType the list's child type to rename
 * @return a copy of {@code childType} whose name is "element"
 */
private Type renameChildTypeToElement(Type childType) {
  if (childType.isPrimitive()) {
    PrimitiveType childPrimitiveType = childType.asPrimitiveType();
    return new PrimitiveType(childType.getRepetition(),
      childPrimitiveType.getPrimitiveTypeName(),
      childPrimitiveType.getTypeLength(),
      "element",
      childPrimitiveType.getOriginalType(),
      childPrimitiveType.getDecimalMetadata(),
      childPrimitiveType.getId());
  }

  GroupType childGroupType = childType.asGroupType();
  GroupType renamed = new GroupType(childType.getRepetition(),
    "element",
    childType.getOriginalType(),
    childGroupType.getFields());

  // The GroupType constructor used above takes no id, so carry it over explicitly.
  Type.ID id = childGroupType.getId();
  if (id == null) {
    return renamed;
  }
  // Read the numeric id through its accessor rather than relying on hashCode().
  return renamed.withId(id.intValue());
}
 
Example 3
Source File: MetadataUtils.java    From parquet-mr with Apache License 2.0 6 votes vote down vote up
/**
 * Prints one line describing a primitive field: its dot-indented name,
 * repetition and physical type, then the original type when present, and the
 * max repetition/definition levels when a containing message type is given.
 *
 * @param out       writer receiving the formatted line
 * @param type      primitive field to describe
 * @param depth     number of "." characters prefixed to the field name
 * @param container message type used to look up the column descriptor; may be null
 * @param cpath     path of the parent fields; temporarily extended with this
 *                  field's name and restored before returning
 */
private static void showDetails(PrettyPrintWriter out, PrimitiveType type, int depth, MessageType container, List<String> cpath) {
  final String fieldName = type.getName();
  final String indentedName = Strings.repeat(".", depth) + fieldName;
  final OriginalType originalType = type.getOriginalType();

  out.format("%s: %s %s", indentedName, type.getRepetition(), type.getPrimitiveTypeName());
  if (originalType != null) {
    out.format(" O:%s", originalType);
  }

  if (container != null) {
    // Push this field, snapshot the full path, then pop so the caller's list is unchanged.
    cpath.add(fieldName);
    final String[] fullPath = cpath.toArray(new String[0]);
    cpath.remove(cpath.size() - 1);

    final ColumnDescriptor column = container.getColumnDescription(fullPath);
    final int maxDefinitionLevel = column.getMaxDefinitionLevel();
    final int maxRepetitionLevel = column.getMaxRepetitionLevel();
    out.format(" R:%d D:%d", maxRepetitionLevel, maxDefinitionLevel);
  }
  out.println();
}
 
Example 4
Source File: ColumnIndexBuilder.java    From parquet-mr with Apache License 2.0 6 votes vote down vote up
/**
 * Creates the column index builder matching the physical type of a column.
 *
 * @param type           primitive type of the column
 * @param truncateLength passed to the binary builder only; the other builders
 *                       are created without arguments
 * @return a new builder for the physical type
 * @throws IllegalArgumentException if the physical type has no column index builder
 */
private static ColumnIndexBuilder createNewBuilder(PrimitiveType type, int truncateLength) {
  final PrimitiveType.PrimitiveTypeName physicalType = type.getPrimitiveTypeName();
  switch (physicalType) {
    case BOOLEAN:
      return new BooleanColumnIndexBuilder();
    case INT32:
      return new IntColumnIndexBuilder();
    case INT64:
      return new LongColumnIndexBuilder();
    case FLOAT:
      return new FloatColumnIndexBuilder();
    case DOUBLE:
      return new DoubleColumnIndexBuilder();
    // Byte-oriented physical types all use the binary builder.
    case INT96:
    case BINARY:
    case FIXED_LEN_BYTE_ARRAY:
      return new BinaryColumnIndexBuilder(type, truncateLength);
    default:
      throw new IllegalArgumentException("Unsupported type for column index: " + type);
  }
}
 
Example 5
Source File: Statistics.java    From parquet-mr with Apache License 2.0 6 votes vote down vote up
/**
 * Builds an empty, typed {@code Statistics} object for a column, for use when
 * reading or writing the min/max statistics of the V2 format.
 *
 * @param type
 *          type of the column; converted with {@code asPrimitiveType()}
 * @return a new statistics instance matching the column's physical type
 * @throws UnknownColumnTypeException
 *          if the physical type has no statistics implementation
 */
public static Statistics<?> createStats(Type type) {
  final PrimitiveType primitive = type.asPrimitiveType();
  final PrimitiveType.PrimitiveTypeName physicalType = primitive.getPrimitiveTypeName();
  switch (physicalType) {
    case BOOLEAN:
      return new BooleanStatistics(primitive);
    case INT32:
      return new IntStatistics(primitive);
    case INT64:
      return new LongStatistics(primitive);
    case FLOAT:
      return new FloatStatistics(primitive);
    case DOUBLE:
      return new DoubleStatistics(primitive);
    // Byte-oriented physical types share the binary statistics.
    case INT96:
    case BINARY:
    case FIXED_LEN_BYTE_ARRAY:
      return new BinaryStatistics(primitive);
    default:
      throw new UnknownColumnTypeException(physicalType);
  }
}
 
Example 6
Source File: ParquetUtil.java    From iceberg with Apache License 2.0 5 votes vote down vote up
/**
 * Reports whether a Parquet primitive type is treated as an int.
 *
 * <p>Without an original type annotation the physical type must be INT32.
 * With an annotation, only INT_8, INT_16, INT_32 and DATE qualify; the
 * physical type is not consulted in that case.
 *
 * @param primitiveType type to inspect
 * @return true if the type counts as an int under the rules above
 */
public static boolean isIntType(PrimitiveType primitiveType) {
  // Un-annotated types are decided by their physical type alone.
  if (primitiveType.getOriginalType() == null) {
    return primitiveType.getPrimitiveTypeName() == PrimitiveType.PrimitiveTypeName.INT32;
  }

  switch (primitiveType.getOriginalType()) {
    case DATE:
    case INT_8:
    case INT_16:
    case INT_32:
      return true;
    default:
      return false;
  }
}
 
Example 7
Source File: ParquetMetadataCommand.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Logs a single formatted summary line for a column chunk: name, type, codec,
 * encodings, value count, average size per value, null count and min/max.
 *
 * @param console logger receiving the line
 * @param width   width of the left-aligned column-name field
 * @param column  metadata of the column chunk to describe
 * @param schema  file schema used to resolve the column's type and descriptor
 */
private void printColumnChunk(Logger console, int width, ColumnChunkMetaData column, MessageType schema) {
  String[] path = column.getPath().toArray();
  PrimitiveType type = primitive(schema, path);
  Preconditions.checkNotNull(type);

  ColumnDescriptor desc = schema.getColumnDescription(path);
  long size = column.getTotalSize();
  long count = column.getValueCount();
  float perValue = ((float) size) / count;
  CompressionCodecName codec = column.getCodec();
  Set<Encoding> encodings = column.getEncodings();
  EncodingStats encodingStats = column.getEncodingStats();

  // Prefer the detailed encoding stats; fall back to the plain encoding set.
  String encodingSummary;
  if (encodingStats == null) {
    encodingSummary = encodingsAsString(encodings, desc);
  } else {
    encodingSummary = encodingStatsAsString(encodingStats);
  }
  Statistics stats = column.getStatistics();

  String name = column.getPath().toDotString();

  // Values shared by both output layouts.
  String codecLabel = shortCodec(codec);
  String sizePerValue = humanReadable(perValue);
  String nullCount;
  if (stats != null && stats.isNumNullsSet()) {
    nullCount = String.valueOf(stats.getNumNulls());
  } else {
    nullCount = "";
  }
  String minMax = minMaxAsString(stats);

  PrimitiveType.PrimitiveTypeName typeName = type.getPrimitiveTypeName();
  String line;
  if (typeName == PrimitiveType.PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY) {
    // Fixed-length columns show their byte length instead of the type name.
    line = String.format("%-" + width + "s  FIXED[%d] %s %-7s %-9d %-8s %-7s %s",
        name, type.getTypeLength(), codecLabel, encodingSummary, count,
        sizePerValue, nullCount, minMax);
  } else {
    line = String.format("%-" + width + "s  %-9s %s %-7s %-9d %-10s %-7s %s",
        name, typeName, codecLabel, encodingSummary, count,
        sizePerValue, nullCount, minMax);
  }
  console.info(line);
}
 
Example 8
Source File: TupleWriteSupport.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Writes one primitive field of a tuple entry to the record consumer, reading
 * the value from {@code record} by the field's name with the accessor that
 * matches the field's physical type.
 *
 * @param record tuple entry holding the value
 * @param field  primitive field describing the name and physical type
 * @throws UnsupportedOperationException for FIXED_LEN_BYTE_ARRAY, INT96 and
 *         any other physical type that is not handled
 */
private void writePrimitive(TupleEntry record, PrimitiveType field) {
  final String fieldName = field.getName();
  switch (field.getPrimitiveTypeName()) {
    case BOOLEAN:
      recordConsumer.addBoolean(record.getBoolean(fieldName));
      break;
    case INT32:
      recordConsumer.addInteger(record.getInteger(fieldName));
      break;
    case INT64:
      recordConsumer.addLong(record.getLong(fieldName));
      break;
    case FLOAT:
      recordConsumer.addFloat(record.getFloat(fieldName));
      break;
    case DOUBLE:
      recordConsumer.addDouble(record.getDouble(fieldName));
      break;
    case BINARY:
      // Binary fields are written from the entry's string value.
      recordConsumer.addBinary(Binary.fromString(record.getString(fieldName)));
      break;
    case INT96:
      throw new UnsupportedOperationException("Int96 type not implemented");
    case FIXED_LEN_BYTE_ARRAY:
      throw new UnsupportedOperationException("Fixed len byte array type not implemented");
    default:
      throw new UnsupportedOperationException(fieldName + " type not implemented");
  }
}
 
Example 9
Source File: TupleConverter.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Starts a new record by allocating a fresh tuple of {@code schemaSize}
 * fields. When {@code elephantBirdCompatible} is set, every optional
 * primitive field of type INT32, INT64, FLOAT, DOUBLE or BOOLEAN is pre-filled
 * with the corresponding zero constant; other fields are left untouched.
 *
 * @throws RuntimeException wrapping any {@code ExecException} raised while
 *         setting a tuple field
 */
@Override
public final void start() {
  currentTuple = TF.newTuple(schemaSize);
  if (!elephantBirdCompatible) {
    return;
  }

  try {
    int position = 0;
    for (Type field : parquetSchema.getFields()) {
      if (field.isPrimitive() && field.isRepetition(Repetition.OPTIONAL)) {
        switch (field.asPrimitiveType().getPrimitiveTypeName()) {
        // BOOLEAN shares the 32-bit integer zero with INT32.
        case INT32:
        case BOOLEAN:
          currentTuple.set(position, I32_ZERO);
          break;
        case INT64:
          currentTuple.set(position, I64_ZERO);
          break;
        case FLOAT:
          currentTuple.set(position, FLOAT_ZERO);
          break;
        case DOUBLE:
          currentTuple.set(position, DOUBLE_ZERO);
          break;
        default:
          // No default value for the remaining physical types.
          break;
        }
      }
      position++;
    }
  } catch (ExecException e) {
    throw new RuntimeException(e);
  }
}
 
Example 10
Source File: ColumnIOFactory.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Visits a primitive type of the file schema: reports an incompatible schema
 * when the currently requested type is not primitive, or when strict type
 * checking is on and the two physical types differ, then registers a new
 * {@code PrimitiveColumnIO} under the current group and in the leaf list.
 *
 * @param primitiveType primitive type being visited
 */
@Override
public void visit(PrimitiveType primitiveType) {
  boolean compatible = currentRequestedType.isPrimitive();
  if (compatible && this.strictTypeChecking) {
    // Only safe to call asPrimitiveType() once we know the requested type is primitive.
    compatible = currentRequestedType.asPrimitiveType().getPrimitiveTypeName()
        == primitiveType.getPrimitiveTypeName();
  }
  if (!compatible) {
    // NOTE(review): presumably this throws; if it returns, the column IO below is still created.
    incompatibleSchema(primitiveType, currentRequestedType);
  }

  PrimitiveColumnIO leaf = new PrimitiveColumnIO(primitiveType, current, currentRequestedIndex, leaves.size());
  current.add(leaf);
  leaves.add(leaf);
}
 
Example 11
Source File: BinaryTruncator.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Chooses the truncator to use for values of the given type.
 *
 * <p>A null type and INT96 get the no-op truncator. BINARY and
 * FIXED_LEN_BYTE_ARRAY get the default UTF-8 truncator when they have no
 * logical type annotation or a string, enum, JSON or BSON annotation; any
 * other annotation falls back to the no-op truncator.
 *
 * @param type primitive type of the column; may be null
 * @return the truncator selected for {@code type}
 * @throws IllegalArgumentException for any other physical type
 */
public static BinaryTruncator getTruncator(PrimitiveType type) {
  if (type == null) {
    return NO_OP_TRUNCATOR;
  }

  switch (type.getPrimitiveTypeName()) {
    case BINARY:
    case FIXED_LEN_BYTE_ARRAY:
      // Decided by the logical type annotation below.
      break;
    case INT96:
      return NO_OP_TRUNCATOR;
    default:
      throw new IllegalArgumentException("No truncator is available for the type: " + type);
  }

  final LogicalTypeAnnotation annotation = type.getLogicalTypeAnnotation();
  if (annotation == null) {
    return DEFAULT_UTF8_TRUNCATOR;
  }

  // Annotations not visited here yield an empty Optional and therefore the no-op truncator.
  final LogicalTypeAnnotation.LogicalTypeAnnotationVisitor<BinaryTruncator> utf8Selector =
      new LogicalTypeAnnotation.LogicalTypeAnnotationVisitor<BinaryTruncator>() {
        @Override
        public Optional<BinaryTruncator> visit(LogicalTypeAnnotation.StringLogicalTypeAnnotation stringLogicalType) {
          return Optional.of(DEFAULT_UTF8_TRUNCATOR);
        }

        @Override
        public Optional<BinaryTruncator> visit(LogicalTypeAnnotation.EnumLogicalTypeAnnotation enumLogicalType) {
          return Optional.of(DEFAULT_UTF8_TRUNCATOR);
        }

        @Override
        public Optional<BinaryTruncator> visit(LogicalTypeAnnotation.JsonLogicalTypeAnnotation jsonLogicalType) {
          return Optional.of(DEFAULT_UTF8_TRUNCATOR);
        }

        @Override
        public Optional<BinaryTruncator> visit(LogicalTypeAnnotation.BsonLogicalTypeAnnotation bsonLogicalType) {
          return Optional.of(DEFAULT_UTF8_TRUNCATOR);
        }
      };
  return annotation.accept(utf8Selector).orElse(NO_OP_TRUNCATOR);
}
 
Example 12
Source File: Statistics.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Provides the builder used to assemble a statistics object while reading it
 * from a parquet file. FLOAT and DOUBLE columns get their dedicated builders;
 * every other physical type uses the generic {@code Builder}.
 *
 * @param type
 *          type of the column
 * @return builder to create new statistics object
 */
public static Builder getBuilderForReading(PrimitiveType type) {
  final PrimitiveType.PrimitiveTypeName physicalType = type.getPrimitiveTypeName();
  switch (physicalType) {
    case DOUBLE:
      return new DoubleBuilder(type);
    case FLOAT:
      return new FloatBuilder(type);
    default:
      return new Builder(type);
  }
}
 
Example 13
Source File: MetadataUtils.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Prints one line describing a primitive field: its dot-indented name,
 * repetition and physical type, followed by either the original type
 * ({@code O:}) or the logical type annotation ({@code L:}) when present, and
 * the max repetition/definition levels when a containing message type is given.
 *
 * @param out               writer receiving the formatted line
 * @param type              primitive field to describe
 * @param depth             number of "." characters prefixed to the field name
 * @param container         message type used to look up the column descriptor; may be null
 * @param cpath             path of the parent fields; temporarily extended with this
 *                          field's name and restored before returning
 * @param showOriginalTypes true to print the original type, false to print the
 *                          logical type annotation
 */
private static void showDetails(PrettyPrintWriter out, PrimitiveType type, int depth, MessageType container, List<String> cpath, boolean showOriginalTypes) {
  final String fieldName = type.getName();
  final String indentedName = Strings.repeat(".", depth) + fieldName;

  out.format("%s: %s %s", indentedName, type.getRepetition(), type.getPrimitiveTypeName());

  if (showOriginalTypes) {
    OriginalType originalType = null;
    try {
      originalType = type.getOriginalType();
    } catch (Exception ignored) {
      // Best effort: a failure to resolve the original type just omits it from the output.
    }
    if (originalType != null) {
      out.format(" O:%s", originalType);
    }
  } else {
    final LogicalTypeAnnotation logicalType = type.getLogicalTypeAnnotation();
    if (logicalType != null) {
      out.format(" L:%s", logicalType);
    }
  }

  if (container != null) {
    // Push this field, snapshot the full path, then pop so the caller's list is unchanged.
    cpath.add(fieldName);
    final String[] fullPath = cpath.toArray(new String[0]);
    cpath.remove(cpath.size() - 1);

    final ColumnDescriptor column = container.getColumnDescription(fullPath);
    final int maxDefinitionLevel = column.getMaxDefinitionLevel();
    final int maxRepetitionLevel = column.getMaxRepetitionLevel();
    out.format(" R:%d D:%d", maxRepetitionLevel, maxDefinitionLevel);
  }
  out.println();
}
 
Example 14
Source File: ParquetConversions.java    From iceberg with Apache License 2.0 5 votes vote down vote up
/**
 * Returns a function that converts a value read from Parquet for the given
 * primitive type.
 *
 * <p>UTF8 values are decoded to a char sequence; DECIMAL values become
 * {@code BigDecimal} using the type's scale; un-annotated (or otherwise
 * annotated) binary values are wrapped in a {@code ByteBuffer}; everything
 * else is passed through unchanged.
 *
 * @param type Parquet primitive type of the values
 * @return the conversion function for values of {@code type}
 * @throws IllegalArgumentException if a DECIMAL annotation sits on a physical
 *         type other than INT32, INT64, BINARY or FIXED_LEN_BYTE_ARRAY
 */
static Function<Object, Object> converterFromParquet(PrimitiveType type) {
  final PrimitiveType.PrimitiveTypeName physicalType = type.getPrimitiveTypeName();

  if (type.getOriginalType() != null) {
    switch (type.getOriginalType()) {
      case UTF8:
        // decode to CharSequence to avoid copying into a new String
        return value -> Charsets.UTF_8.decode(((Binary) value).toByteBuffer());
      case DECIMAL:
        final int decimalScale = type.getDecimalMetadata().getScale();
        switch (physicalType) {
          case INT32:
          case INT64:
            return value -> BigDecimal.valueOf(((Number) value).longValue(), decimalScale);
          case BINARY:
          case FIXED_LEN_BYTE_ARRAY:
            return value -> new BigDecimal(new BigInteger(((Binary) value).getBytes()), decimalScale);
          default:
            throw new IllegalArgumentException(
                "Unsupported primitive type for decimal: " + physicalType);
        }
      default:
        // Other annotations fall through to the physical-type handling below.
        break;
    }
  }

  switch (physicalType) {
    case BINARY:
    case FIXED_LEN_BYTE_ARRAY:
      return value -> ByteBuffer.wrap(((Binary) value).getBytes());
    default:
      return value -> value;
  }
}
 
Example 15
Source File: ParquetConversions.java    From iceberg with Apache License 2.0 5 votes vote down vote up
/**
 * Returns a function that converts a value read from Parquet for the given
 * primitive type.
 *
 * <p>UTF8 values are decoded to a char sequence; DECIMAL values become
 * {@code BigDecimal} using the type's scale; un-annotated (or otherwise
 * annotated) binary values are wrapped in a {@code ByteBuffer}; everything
 * else is passed through unchanged.
 *
 * @param type Parquet primitive type of the values
 * @return the conversion function for values of {@code type}
 * @throws IllegalArgumentException if a DECIMAL annotation sits on a physical
 *         type other than INT32, INT64, BINARY or FIXED_LEN_BYTE_ARRAY
 */
static Function<Object, Object> converterFromParquet(PrimitiveType type) {
  final PrimitiveType.PrimitiveTypeName physicalType = type.getPrimitiveTypeName();

  if (type.getOriginalType() != null) {
    switch (type.getOriginalType()) {
      case UTF8:
        // decode to CharSequence to avoid copying into a new String
        return value -> StandardCharsets.UTF_8.decode(((Binary) value).toByteBuffer());
      case DECIMAL:
        final int decimalScale = type.getDecimalMetadata().getScale();
        switch (physicalType) {
          case INT32:
          case INT64:
            return value -> BigDecimal.valueOf(((Number) value).longValue(), decimalScale);
          case BINARY:
          case FIXED_LEN_BYTE_ARRAY:
            return value -> new BigDecimal(new BigInteger(((Binary) value).getBytes()), decimalScale);
          default:
            throw new IllegalArgumentException(
                "Unsupported primitive type for decimal: " + physicalType);
        }
      default:
        // Other annotations fall through to the physical-type handling below.
        break;
    }
  }

  switch (physicalType) {
    case BINARY:
    case FIXED_LEN_BYTE_ARRAY:
      return value -> ByteBuffer.wrap(((Binary) value).getBytes());
    default:
      return value -> value;
  }
}
 
Example 16
Source File: ParquetSplitReaderUtil.java    From flink with Apache License 2.0 4 votes vote down vote up
/**
 * Allocates the heap column vector used to hold values of {@code fieldType},
 * after checking that the Parquet physical type is one this logical type is
 * accepted from.
 *
 * @param batchSize     capacity passed to the created vector
 * @param fieldType     logical type of the field
 * @param primitiveType Parquet primitive type of the column
 * @return a new writable vector for the field
 * @throws IllegalArgumentException (via {@code checkArgument}) if the physical
 *         type does not match what the logical type expects
 * @throws UnsupportedOperationException if the logical type root is not handled
 */
public static WritableColumnVector createWritableColumnVector(
		int batchSize,
		LogicalType fieldType,
		PrimitiveType primitiveType) {
	PrimitiveType.PrimitiveTypeName typeName = primitiveType.getPrimitiveTypeName();
	switch (fieldType.getTypeRoot()) {
		case BOOLEAN:
			checkArgument(
					typeName == PrimitiveType.PrimitiveTypeName.BOOLEAN,
					"Unexpected type: %s", typeName);
			return new HeapBooleanVector(batchSize);

		// Logical types stored as INT32, each with its own vector width.
		case TINYINT:
			checkArgument(
					typeName == PrimitiveType.PrimitiveTypeName.INT32,
					"Unexpected type: %s", typeName);
			return new HeapByteVector(batchSize);
		case SMALLINT:
			checkArgument(
					typeName == PrimitiveType.PrimitiveTypeName.INT32,
					"Unexpected type: %s", typeName);
			return new HeapShortVector(batchSize);
		case INTEGER:
		case DATE:
		case TIME_WITHOUT_TIME_ZONE:
			checkArgument(
					typeName == PrimitiveType.PrimitiveTypeName.INT32,
					"Unexpected type: %s", typeName);
			return new HeapIntVector(batchSize);

		case BIGINT:
			checkArgument(
					typeName == PrimitiveType.PrimitiveTypeName.INT64,
					"Unexpected type: %s", typeName);
			return new HeapLongVector(batchSize);
		case FLOAT:
			checkArgument(
					typeName == PrimitiveType.PrimitiveTypeName.FLOAT,
					"Unexpected type: %s", typeName);
			return new HeapFloatVector(batchSize);
		case DOUBLE:
			checkArgument(
					typeName == PrimitiveType.PrimitiveTypeName.DOUBLE,
					"Unexpected type: %s", typeName);
			return new HeapDoubleVector(batchSize);

		case CHAR:
		case VARCHAR:
		case BINARY:
		case VARBINARY:
			checkArgument(
					typeName == PrimitiveType.PrimitiveTypeName.BINARY,
					"Unexpected type: %s", typeName);
			return new HeapBytesVector(batchSize);

		case TIMESTAMP_WITHOUT_TIME_ZONE:
		case TIMESTAMP_WITH_LOCAL_TIME_ZONE:
			checkArgument(
					typeName == PrimitiveType.PrimitiveTypeName.INT96,
					"Unexpected type: %s", typeName);
			return new HeapTimestampVector(batchSize);

		case DECIMAL:
			// The vector width follows the decimal precision; every variant also accepts
			// FIXED_LEN_BYTE_ARRAY and requires the DECIMAL original type.
			int precision = ((DecimalType) fieldType).getPrecision();
			boolean fixedLength = typeName == PrimitiveType.PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY;
			boolean decimalAnnotated = primitiveType.getOriginalType() == OriginalType.DECIMAL;
			if (DecimalDataUtils.is32BitDecimal(precision)) {
				checkArgument(
						(fixedLength || typeName == PrimitiveType.PrimitiveTypeName.INT32) && decimalAnnotated,
						"Unexpected type: %s", typeName);
				return new HeapIntVector(batchSize);
			}
			if (DecimalDataUtils.is64BitDecimal(precision)) {
				checkArgument(
						(fixedLength || typeName == PrimitiveType.PrimitiveTypeName.INT64) && decimalAnnotated,
						"Unexpected type: %s", typeName);
				return new HeapLongVector(batchSize);
			}
			checkArgument(
					(fixedLength || typeName == PrimitiveType.PrimitiveTypeName.BINARY) && decimalAnnotated,
					"Unexpected type: %s", typeName);
			return new HeapBytesVector(batchSize);

		default:
			throw new UnsupportedOperationException(fieldType + " is not supported now.");
	}
}
 
Example 17
Source File: ParquetTypeHelper.java    From dremio-oss with Apache License 2.0 4 votes vote down vote up
/**
 * Maps a parquet primitive field to the arrow vector field of the same column name.
 *
 * <p>The physical type selects the base mapping; the original type refines it
 * (UTF8, DECIMAL, DATE, TIME_MILLIS, TIMESTAMP_MILLIS), and the schema helper
 * decides how un-annotated binary and INT96 columns are exposed.
 *
 * @param colPath       schema path of the column
 * @param primitiveType parquet primitive type
 * @param originalType  parquet original type; may be null
 * @param schemaHelper  schema helper used for type conversions
 * @return arrow vector field
 */
public static Field createField(SchemaPath colPath,
                                PrimitiveType primitiveType,
                                OriginalType originalType,
                                SchemaDerivationHelper schemaHelper) {
  final String colName = colPath.getAsNamePart().getName();
  switch (primitiveType.getPrimitiveTypeName()) {
    case BOOLEAN:
      return CompleteType.BIT.toField(colName);
    case FLOAT:
      return CompleteType.FLOAT.toField(colName);
    case DOUBLE:
      return CompleteType.DOUBLE.toField(colName);
    case INT32:
      if (originalType == OriginalType.DATE) {
        return CompleteType.DATE.toField(colName);
      }
      if (originalType == OriginalType.TIME_MILLIS) {
        return CompleteType.TIME.toField(colName);
      }
      if (originalType == OriginalType.DECIMAL) {
        return CompleteType.fromDecimalPrecisionScale(
          primitiveType.getDecimalMetadata().getPrecision(),
          primitiveType.getDecimalMetadata().getScale()).toField(colName);
      }
      return CompleteType.INT.toField(colName);
    case INT64:
      if (originalType == OriginalType.TIMESTAMP_MILLIS) {
        return CompleteType.TIMESTAMP.toField(colName);
      }
      if (originalType == OriginalType.DECIMAL) {
        return CompleteType.fromDecimalPrecisionScale(
          primitiveType.getDecimalMetadata().getPrecision(),
          primitiveType.getDecimalMetadata().getScale()).toField(colName);
      }
      return CompleteType.BIGINT.toField(colName);
    case INT96:
      if (schemaHelper.readInt96AsTimeStamp()) {
        return CompleteType.TIMESTAMP.toField(colName);
      }
      return CompleteType.VARBINARY.toField(colName);
    case BINARY:
    case FIXED_LEN_BYTE_ARRAY:
      // Order matters: explicit annotations win over the helper's varchar override.
      if (originalType == OriginalType.UTF8) {
        return CompleteType.VARCHAR.toField(colName);
      }
      if (originalType == OriginalType.DECIMAL) {
        return CompleteType.fromDecimalPrecisionScale(
          primitiveType.getDecimalMetadata().getPrecision(),
          primitiveType.getDecimalMetadata().getScale()).toField(colName);
      }
      if (schemaHelper.isVarChar(colPath)) {
        return CompleteType.VARCHAR.toField(colName);
      }
      return CompleteType.VARBINARY.toField(colName);
    default:
      throw UserException.unsupportedError()
        .message("Parquet Primitive Type '%s', Original Type '%s' combination not supported. Column '%s'",
          primitiveType.toString(), originalType != null ? originalType : "Not Available", colName)
        .build();
  }
}
 
Example 18
Source File: ArrowVectorAccessors.java    From iceberg with Apache License 2.0 4 votes vote down vote up
/**
 * Picks the accessor used to read a dictionary-encoded vector, based on the
 * original type of {@code primitive} when it has one and on its physical type
 * otherwise.
 *
 * @param dictionary dictionary used to decode the ids stored in {@code vector}
 * @param desc       column descriptor; not consulted for the decision, which
 *                   relies on {@code primitive} throughout
 * @param vector     vector of dictionary ids; must be an {@code IntVector}
 * @param primitive  Parquet primitive type of the column
 * @return the accessor matching the column's type
 * @throws IllegalStateException if {@code vector} is not an {@code IntVector}
 * @throws UnsupportedOperationException if the original or physical type is not handled
 */
@NotNull
private static ArrowVectorAccessor getDictionaryVectorAccessor(
    Dictionary dictionary,
    ColumnDescriptor desc,
    FieldVector vector, PrimitiveType primitive) {
  Preconditions.checkState(vector instanceof IntVector, "Dictionary ids should be stored in IntVectors only");
  final IntVector ids = (IntVector) vector;

  if (primitive.getOriginalType() != null) {
    // Switch on the same value that was just null-checked; switching on the
    // descriptor's type instead could throw NullPointerException if they differ.
    switch (primitive.getOriginalType()) {
      case ENUM:
      case JSON:
      case UTF8:
      case BSON:
        return new DictionaryStringAccessor(ids, dictionary);
      case INT_64:
      case TIMESTAMP_MILLIS:
      case TIMESTAMP_MICROS:
        return new DictionaryLongAccessor(ids, dictionary);
      case DECIMAL:
        // Decimals are decoded according to their physical storage type.
        switch (primitive.getPrimitiveTypeName()) {
          case BINARY:
          case FIXED_LEN_BYTE_ARRAY:
            return new DictionaryDecimalBinaryAccessor(ids, dictionary);
          case INT64:
            return new DictionaryDecimalLongAccessor(ids, dictionary);
          case INT32:
            return new DictionaryDecimalIntAccessor(ids, dictionary);
          default:
            throw new UnsupportedOperationException(
                "Unsupported base type for decimal: " + primitive.getPrimitiveTypeName());
        }
      default:
        throw new UnsupportedOperationException(
            "Unsupported logical type: " + primitive.getOriginalType());
    }
  }

  switch (primitive.getPrimitiveTypeName()) {
    case FIXED_LEN_BYTE_ARRAY:
    case BINARY:
      return new DictionaryBinaryAccessor(ids, dictionary);
    case FLOAT:
      return new DictionaryFloatAccessor(ids, dictionary);
    case INT64:
      return new DictionaryLongAccessor(ids, dictionary);
    case DOUBLE:
      return new DictionaryDoubleAccessor(ids, dictionary);
    default:
      throw new UnsupportedOperationException("Unsupported type: " + primitive);
  }
}
 
Example 19
Source File: ShowDictionaryCommand.java    From parquet-mr with Apache License 2.0 4 votes vote down vote up
/**
 * Prints the dictionary entries of one column for every row group of a single
 * Parquet file.
 *
 * <p>Exactly one target file is required. For each row group the column's
 * dictionary page is read and each entry is logged with its id; row groups
 * without a dictionary page for the column are reported and skipped. The file
 * reader is closed when the method returns or fails.
 *
 * @return 0 on success
 * @throws IOException if the file cannot be opened or read
 * @throws IllegalArgumentException if no target or more than one target is
 *         given, or the column's physical type cannot be printed
 */
@Override
@SuppressWarnings("unchecked")
public int run() throws IOException {
  Preconditions.checkArgument(targets != null && targets.size() >= 1,
      "A Parquet file is required.");
  Preconditions.checkArgument(targets.size() == 1,
      "Cannot process multiple Parquet files.");

  String source = targets.get(0);

  // try-with-resources so the file handle is released even when decoding fails
  try (ParquetFileReader reader = ParquetFileReader.open(getConf(), qualifiedPath(source))) {
    MessageType schema = reader.getFileMetaData().getSchema();
    ColumnDescriptor descriptor = Util.descriptor(column, schema);
    PrimitiveType type = Util.primitive(column, schema);
    Preconditions.checkNotNull(type);

    DictionaryPageReadStore dictionaryReader;
    int rowGroup = 0;
    while ((dictionaryReader = reader.getNextDictionaryReader()) != null) {
      DictionaryPage page = dictionaryReader.readDictionaryPage(descriptor);

      if (page == null) {
        // The column is not dictionary-encoded in this row group.
        console.info("\nRow group {} has no dictionary for \"{}\"", rowGroup, column);
      } else {
        Dictionary dict = page.getEncoding().initDictionary(descriptor, page);

        console.info("\nRow group {} dictionary for \"{}\":", rowGroup, column);
        for (int i = 0; i <= dict.getMaxId(); i += 1) {
          String id = String.format("%6d", i);
          switch (type.getPrimitiveTypeName()) {
            case BINARY:
              // Only string-annotated binaries are rendered as text; others as raw bytes.
              if (type.getLogicalTypeAnnotation() instanceof LogicalTypeAnnotation.StringLogicalTypeAnnotation) {
                console.info("{}: {}", id,
                    Util.humanReadable(dict.decodeToBinary(i).toStringUsingUTF8(), 70));
              } else {
                console.info("{}: {}", id,
                    Util.humanReadable(dict.decodeToBinary(i).getBytesUnsafe(), 70));
              }
              break;
            case INT32:
              console.info("{}: {}", id, dict.decodeToInt(i));
              break;
            case INT64:
              console.info("{}: {}", id, dict.decodeToLong(i));
              break;
            case FLOAT:
              console.info("{}: {}", id, dict.decodeToFloat(i));
              break;
            case DOUBLE:
              console.info("{}: {}", id, dict.decodeToDouble(i));
              break;
            default:
              throw new IllegalArgumentException(
                  "Unknown dictionary type: " + type.getPrimitiveTypeName());
          }
        }
      }

      reader.skipNextRowGroup();

      rowGroup += 1;
    }
  }

  console.info("");

  return 0;
}
 
Example 20
Source File: ParquetMetadataConverter.java    From parquet-mr with Apache License 2.0 3 votes vote down vote up
/**
 * Decides whether signed-order min and max may be used for a type. Signed
 * order is safe for string types whose values contain only ASCII characters
 * (sign bit 0); {@code useSignedStringMinMax} signals that only such
 * characters were written, and this method checks that the type is string-like.
 *
 * @param type a primitive type with a logical type annotation
 * @return true if signed order min/max can be used with this type
 */
private boolean overrideSortOrderToSigned(PrimitiveType type) {
  final LogicalTypeAnnotation annotation = type.getLogicalTypeAnnotation();

  // The override only ever applies when it is switched on and the column is BINARY.
  if (!useSignedStringMinMax) {
    return false;
  }
  if (PrimitiveTypeName.BINARY != type.getPrimitiveTypeName()) {
    return false;
  }

  // A missing annotation counts as string-ish because some writers
  // failed to use the UTF8 annotation.
  if (annotation == null) {
    return true;
  }
  return STRING_TYPES.contains(annotation.getClass());
}