org.apache.parquet.schema.OriginalType Java Examples
The following examples show how to use org.apache.parquet.schema.OriginalType.
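In parquet-mr, OriginalType is the converted-type annotation attached to a primitive or group type, for example UTF8 for strings, DECIMAL for fixed-point numbers, DATE for day-resolution dates, or LIST for logical lists. As a quick orientation before the examples, here is a minimal, hypothetical sketch (the field and message names are made up for illustration) of building a schema annotated with OriginalType through the Types builder, the same fluent API used in several of the examples below:

import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.OriginalType;
import org.apache.parquet.schema.Types;
import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.BINARY;
import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT32;

public class OriginalTypeExample {
  public static void main(String[] args) {
    // Hypothetical schema: a required string column and an optional date column.
    // .as(OriginalType.X) attaches the converted-type annotation to the primitive type.
    MessageType schema = Types.buildMessage()
        .required(BINARY).as(OriginalType.UTF8).named("name")
        .optional(INT32).as(OriginalType.DATE).named("birth_date")
        .named("example_message");
    System.out.println(schema);
  }
}

Printing the schema produces the textual form (e.g. required binary name (UTF8);), which is the same notation that Example #8 parses back with parseMessageType.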
Example #1
Source File: AvroSchemaConverter190Int96Avro17.java From datacollector with Apache License 2.0 | 7 votes |
private OriginalType convertLogicalTypeStr(String logicalType) {
  if (logicalType == null) {
    return null;
  } else if (AvroTypeUtil.LOGICAL_TYPE_DECIMAL.equals(logicalType)) {
    return OriginalType.DECIMAL;
  } else if (AvroTypeUtil.LOGICAL_TYPE_DATE.equals(logicalType)) {
    return OriginalType.DATE;
  } else if (AvroTypeUtil.LOGICAL_TYPE_TIME_MILLIS.equals(logicalType)) {
    return OriginalType.TIME_MILLIS;
//  } else if (AvroTypeUtil.LOGICAL_TYPE_TIME_MICROS.equals(logicalType)) {
//    return OriginalType.TIME_MICROS;
  } else if (AvroTypeUtil.LOGICAL_TYPE_TIMESTAMP_MILLIS.equals(logicalType)) {
    return OriginalType.TIMESTAMP_MILLIS;
//  } else if (AvroTypeUtil.LOGICAL_TYPE_TIMESTAMP_MICROS.equals(logicalType)) {
//    return OriginalType.TIMESTAMP_MICROS;
  }
  return null;
}
Example #2
Source File: TestParquetMetadataConverter.java From parquet-mr with Apache License 2.0 | 6 votes |
@Test
public void testV2StatsEqualMinMax() {
  testV2StatsEqualMinMax(Types.optional(PrimitiveTypeName.INT32).as(OriginalType.UINT_8).named(""),
      93, 93);
  testV2StatsEqualMinMax(Types.optional(PrimitiveTypeName.INT32).as(OriginalType.UINT_16).named(""),
      -5892, -5892);
  testV2StatsEqualMinMax(Types.optional(PrimitiveTypeName.INT32).as(OriginalType.UINT_32).named(""),
      234998934, 234998934);
  testV2StatsEqualMinMax(Types.optional(PrimitiveTypeName.INT64).as(OriginalType.UINT_64).named(""),
      -2389943895984985L, -2389943895984985L);
  testV2StatsEqualMinMax(Types.optional(PrimitiveTypeName.BINARY).as(OriginalType.DECIMAL).precision(6).named(""),
      new BigInteger("823749"), new BigInteger("823749"));
  testV2StatsEqualMinMax(
      Types.optional(PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY).length(14).as(OriginalType.DECIMAL).precision(7)
          .named(""),
      new BigInteger("-8752832"), new BigInteger("-8752832"));
  testV2StatsEqualMinMax(Types.optional(PrimitiveTypeName.INT96).named(""),
      new BigInteger("81032984"), new BigInteger("81032984"));
}
Example #3
Source File: ParquetRecordWriter.java From Bats with Apache License 2.0 | 6 votes |
protected PrimitiveType getPrimitiveType(MaterializedField field) {
  MinorType minorType = field.getType().getMinorType();
  String name = field.getName();
  int length = ParquetTypeHelper.getLengthForMinorType(minorType);
  PrimitiveTypeName primitiveTypeName = ParquetTypeHelper.getPrimitiveTypeNameForMinorType(minorType);
  if (Types.isDecimalType(minorType)) {
    primitiveTypeName = logicalTypeForDecimals;
    if (usePrimitiveTypesForDecimals) {
      if (field.getPrecision() <= ParquetTypeHelper.getMaxPrecisionForPrimitiveType(PrimitiveTypeName.INT32)) {
        primitiveTypeName = PrimitiveTypeName.INT32;
      } else if (field.getPrecision() <= ParquetTypeHelper.getMaxPrecisionForPrimitiveType(PrimitiveTypeName.INT64)) {
        primitiveTypeName = PrimitiveTypeName.INT64;
      }
    }
    length = DecimalUtility.getMaxBytesSizeForPrecision(field.getPrecision());
  }

  Repetition repetition = ParquetTypeHelper.getRepetitionForDataMode(field.getDataMode());
  OriginalType originalType = ParquetTypeHelper.getOriginalTypeForMinorType(minorType);
  DecimalMetadata decimalMetadata = ParquetTypeHelper.getDecimalMetadataForField(field);
  return new PrimitiveType(repetition, primitiveTypeName, length, name, originalType, decimalMetadata, null);
}
Example #4
Source File: TestParquetMetadataConverter.java From parquet-mr with Apache License 2.0 | 6 votes |
private void testStillUseStatsWithSignedSortOrderIfSingleValue(StatsHelper helper) {
  ParquetMetadataConverter converter = new ParquetMetadataConverter();
  BinaryStatistics stats = new BinaryStatistics();
  stats.incrementNumNulls();
  stats.updateStats(Binary.fromString("A"));
  stats.incrementNumNulls();
  stats.updateStats(Binary.fromString("A"));
  stats.incrementNumNulls();

  PrimitiveType binaryType = Types.required(PrimitiveTypeName.BINARY).as(OriginalType.UTF8).named("b");
  Statistics convertedStats = converter.fromParquetStatistics(
      Version.FULL_VERSION,
      ParquetMetadataConverter.toParquetStatistics(stats),
      binaryType);

  Assert.assertFalse("Stats should not be empty: " + convertedStats, convertedStats.isEmpty());
  Assert.assertArrayEquals("min == max: " + convertedStats,
      convertedStats.getMaxBytes(),
      convertedStats.getMinBytes());
}
Example #5
Source File: ParquetGroupConverter.java From dremio-oss with Apache License 2.0 | 6 votes |
Converter groupConverterFromArrowSchema(String fieldName, String groupTypeName, GroupType groupType,
                                        Collection<SchemaPath> c) {
  final String nameForChild = getNameForChild(fieldName);
  final Field arrowField = Schema.findField(arrowSchema, groupTypeName);
  final ArrowTypeID arrowTypeType = arrowField.getType().getTypeID();
  final List<Field> arrowChildren = arrowField.getChildren();
  if (arrowTypeType == ArrowTypeID.Union) {
    // if it's a union we will add the children directly to the parent
    return new UnionGroupConverter(columnResolver, fieldName, mutator, getWriterProvider(), groupType, c,
        options, arrowChildren, nameForChild, schemaHelper);
  } else if (arrowTypeType == ArrowTypeID.List) {
    // make sure the parquet schema matches the arrow schema and delegate handling the logical list to defaultGroupConverter()
    Preconditions.checkState(groupType.getOriginalType() == OriginalType.LIST,
        "parquet schema doesn't match the arrow schema for LIST " + nameForChild);
  }
  return defaultGroupConverter(fieldName, mutator, groupType, c, arrowChildren);
}
Example #6
Source File: TestPigSchemaConverter.java From parquet-mr with Apache License 2.0 | 6 votes |
@Test
public void testListsOfPrimitive() throws Exception {
  for (Type.Repetition repetition : Type.Repetition.values()) {
    for (Type.Repetition valueRepetition : Type.Repetition.values()) {
      for (PrimitiveType.PrimitiveTypeName primitiveTypeName : PrimitiveType.PrimitiveTypeName.values()) {
        if (primitiveTypeName != PrimitiveType.PrimitiveTypeName.INT96) { // INT96 is NYI
          Types.PrimitiveBuilder<PrimitiveType> value = Types.primitive(primitiveTypeName, valueRepetition);
          if (primitiveTypeName == PrimitiveType.PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY)
            value.length(1);
          GroupType type = Types.buildGroup(repetition)
              .addField(value.named("b"))
              .as(OriginalType.LIST)
              .named("a");
          pigSchemaConverter.convertField(type); // no exceptions, please
        }
      }
    }
  }
}
Example #7
Source File: TestMetadataReader.java From presto with Apache License 2.0 | 6 votes |
@Test(dataProvider = "allCreatedBy")
public void testReadStatsBinaryUtf8(Optional<String> fileCreatedBy) {
  PrimitiveType varchar = new PrimitiveType(OPTIONAL, BINARY, "Test column", OriginalType.UTF8);
  Statistics statistics;

  // Stats written by Parquet after https://issues.apache.org/jira/browse/PARQUET-1025
  statistics = new Statistics();
  statistics.setNull_count(13);
  statistics.setMin_value("a".getBytes(UTF_8));
  statistics.setMax_value("é".getBytes(UTF_8));
  assertThat(MetadataReader.readStats(fileCreatedBy, Optional.of(statistics), varchar))
      .isInstanceOfSatisfying(BinaryStatistics.class, columnStatistics -> {
        assertEquals(columnStatistics.getNumNulls(), 13);

        assertEquals(columnStatistics.getMin().getBytes(), new byte[] {'a'});
        assertEquals(columnStatistics.getMax().getBytes(), new byte[] {(byte) 0xC3, (byte) 0xA9});
        assertEquals(columnStatistics.getMinBytes(), new byte[] {'a'});
        assertEquals(columnStatistics.getMaxBytes(), new byte[] {(byte) 0xC3, (byte) 0xA9});
        assertEquals(columnStatistics.genericGetMin().getBytes(), new byte[] {'a'});
        assertEquals(columnStatistics.genericGetMax().getBytes(), new byte[] {(byte) 0xC3, (byte) 0xA9});
      });
}
Example #8
Source File: TestParquetParser.java From parquet-mr with Apache License 2.0 | 6 votes |
@Test
public void testIDs() {
  String message = "message Message {\n" +
      " required binary string (UTF8) = 6;\n" +
      " required int32 i=1;\n" +
      " required binary s2= 3;\n" +
      " required binary s3 =4;\n" +
      "}\n";
  MessageType parsed = parseMessageType(message);
  MessageType expected = buildMessage()
      .required(BINARY).as(OriginalType.UTF8).id(6).named("string")
      .required(INT32).id(1).named("i")
      .required(BINARY).id(3).named("s2")
      .required(BINARY).id(4).named("s3")
      .named("Message");
  assertEquals(expected, parsed);
  MessageType reparsed = parseMessageType(parsed.toString());
  assertEquals(expected, reparsed);
}
Example #9
Source File: ExaParquetWriterImpl.java From hadoop-etl-udfs with MIT License | 6 votes |
static private List<Type> typeInfoToParquetTypes(final List<ExaParquetTypeInfo> exaParquetTypeInfos) {
  List<Type> types = new ArrayList<>();
  for (ExaParquetTypeInfo exaType : exaParquetTypeInfos) {
    if (exaType.length != 0) {
      types.add(new PrimitiveType(
          Type.Repetition.valueOf(exaType.typeRepitition),
          PrimitiveType.PrimitiveTypeName.valueOf(exaType.primitiveTypeName),
          exaType.length,
          exaType.name));
    } else {
      types.add(new PrimitiveType(
          Type.Repetition.valueOf(exaType.typeRepitition),
          PrimitiveType.PrimitiveTypeName.valueOf(exaType.primitiveTypeName),
          exaType.name,
          exaType.originalType == null ? null : OriginalType.valueOf(exaType.originalType)));
    }
  }
  return types;
}
Example #10
Source File: AvroSchemaConverter190Int96Avro18.java From datacollector with Apache License 2.0 | 6 votes |
private OriginalType convertLogicalType(LogicalType logicalType) {
  if (logicalType == null) {
    return null;
  } else if (logicalType instanceof LogicalTypes.Decimal) {
    return OriginalType.DECIMAL;
  } else if (logicalType instanceof LogicalTypes.Date) {
    return OriginalType.DATE;
  } else if (logicalType instanceof LogicalTypes.TimeMillis) {
    return OriginalType.TIME_MILLIS;
  } else if (logicalType instanceof LogicalTypes.TimeMicros) {
    return OriginalType.TIME_MICROS;
  } else if (logicalType instanceof LogicalTypes.TimestampMillis) {
    return OriginalType.TIMESTAMP_MILLIS;
  } else if (logicalType instanceof LogicalTypes.TimestampMicros) {
    return OriginalType.TIMESTAMP_MICROS;
  }
  return null;
}
Example #11
Source File: TestParquetMetadataConverter.java From parquet-mr with Apache License 2.0 | 6 votes |
@Test
public void testIgnoreStatsWithSignedSortOrder() {
  ParquetMetadataConverter converter = new ParquetMetadataConverter();
  BinaryStatistics stats = new BinaryStatistics();
  stats.incrementNumNulls();
  stats.updateStats(Binary.fromString("A"));
  stats.incrementNumNulls();
  stats.updateStats(Binary.fromString("z"));
  stats.incrementNumNulls();

  PrimitiveType binaryType = Types.required(PrimitiveTypeName.BINARY)
      .as(OriginalType.UTF8).named("b");
  Statistics convertedStats = converter.fromParquetStatistics(
      Version.FULL_VERSION,
      StatsHelper.V1.toParquetStatistics(stats),
      binaryType);

  Assert.assertFalse("Stats should not include min/max: " + convertedStats, convertedStats.hasNonNullValue());
  Assert.assertTrue("Stats should have null count: " + convertedStats, convertedStats.isNumNullsSet());
  Assert.assertEquals("Stats should have 3 nulls: " + convertedStats, 3L, convertedStats.getNumNulls());
}
Example #12
Source File: TestParquetMetadataConverter.java From parquet-mr with Apache License 2.0 | 6 votes |
@Test
public void testV2OnlyStats() {
  testV2OnlyStats(Types.optional(PrimitiveTypeName.INT32).as(OriginalType.UINT_8).named(""),
      0x7F, 0x80);
  testV2OnlyStats(Types.optional(PrimitiveTypeName.INT32).as(OriginalType.UINT_16).named(""),
      0x7FFF, 0x8000);
  testV2OnlyStats(Types.optional(PrimitiveTypeName.INT32).as(OriginalType.UINT_32).named(""),
      0x7FFFFFFF, 0x80000000);
  testV2OnlyStats(Types.optional(PrimitiveTypeName.INT64).as(OriginalType.UINT_64).named(""),
      0x7FFFFFFFFFFFFFFFL, 0x8000000000000000L);
  testV2OnlyStats(Types.optional(PrimitiveTypeName.BINARY).as(OriginalType.DECIMAL).precision(6).named(""),
      new BigInteger("-765875"), new BigInteger("876856"));
  testV2OnlyStats(
      Types.optional(PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY).length(14).as(OriginalType.DECIMAL).precision(7)
          .named(""),
      new BigInteger("-6769643"), new BigInteger("9864675"));
}
Example #13
Source File: AvroSchemaConverter190Int96Avro17.java From datacollector with Apache License 2.0 | 6 votes |
private Schema addLogicalTypeStrToSchema(
    Schema schema,
    OriginalType annotation,
    PrimitiveType asPrimitive,
    PrimitiveType.PrimitiveTypeName parquetPrimitiveTypeName
) {
  Map<String, String> logicalType = convertOriginalTypeToMap(annotation, asPrimitive.getDecimalMetadata());
  if (logicalType != null && (annotation != DECIMAL ||
      parquetPrimitiveTypeName == BINARY ||
      parquetPrimitiveTypeName == FIXED_LEN_BYTE_ARRAY)) {
    for (Map.Entry<String, String> entry : logicalType.entrySet()) {
      schema.addProp(entry.getKey(), entry.getValue());
    }
  }
  return schema;
}
Example #14
Source File: ParquetTableMetadataUtils.java From Bats with Apache License 2.0 | 6 votes |
/**
 * Populates the non-interesting column's statistics
 *
 * @param schemaPaths          columns paths which should be ignored
 * @param parquetTableMetadata the source of column metadata for non-interesting column's statistics
 * @return returns non-interesting column statistics map
 */
@SuppressWarnings("unchecked")
public static Map<SchemaPath, ColumnStatistics> populateNonInterestingColumnsStats(
    Set<SchemaPath> schemaPaths, MetadataBase.ParquetTableMetadataBase parquetTableMetadata) {
  Map<SchemaPath, ColumnStatistics> columnsStatistics = new HashMap<>();
  if (parquetTableMetadata instanceof Metadata_V4.ParquetTableMetadata_v4) {
    for (Metadata_V4.ColumnTypeMetadata_v4 columnTypeMetadata :
        ((Metadata_V4.ParquetTableMetadata_v4) parquetTableMetadata).getColumnTypeInfoMap().values()) {
      SchemaPath schemaPath = SchemaPath.getCompoundPath(columnTypeMetadata.name);
      if (!schemaPaths.contains(schemaPath)) {
        Map<StatisticsKind, Object> statistics = new HashMap<>();
        statistics.put(ColumnStatisticsKind.NULLS_COUNT, GroupScan.NO_COLUMN_STATS);
        PrimitiveType.PrimitiveTypeName primitiveType = columnTypeMetadata.primitiveType;
        OriginalType originalType = columnTypeMetadata.originalType;
        Comparator comparator = getComparator(primitiveType, originalType);
        columnsStatistics.put(schemaPath, new ColumnStatisticsImpl<>(statistics, comparator));
      }
    }
  }
  return columnsStatistics;
}
Example #15
Source File: MapKeyValuesSchemaConverter.java From presto with Apache License 2.0 | 5 votes |
private static GroupType listWrapper(Repetition repetition, String alias, OriginalType originalType, Type nested) {
  if (!nested.isRepetition(Repetition.REPEATED)) {
    throw new IllegalArgumentException("Nested type should be repeated: " + nested);
  }
  return new GroupType(repetition, alias, originalType, nested);
}
Example #16
Source File: ParquetResolverTest.java From pxf with Apache License 2.0 | 5 votes |
private void testSetFields_RightTrimCharHelper(String varchar, String inputChar, String expectedChar) throws IOException {
  List<Type> typeFields = new ArrayList<>();
  typeFields.add(new PrimitiveType(Type.Repetition.OPTIONAL, PrimitiveTypeName.BINARY, "vc1", OriginalType.UTF8));
  typeFields.add(new PrimitiveType(Type.Repetition.OPTIONAL, PrimitiveTypeName.BINARY, "c1", OriginalType.UTF8));
  schema = new MessageType("hive_schema", typeFields);
  context.setMetadata(schema);

  List<ColumnDescriptor> columnDescriptors = new ArrayList<>();
  columnDescriptors.add(new ColumnDescriptor("vc1", DataType.VARCHAR.getOID(), 0, "varchar", null));
  columnDescriptors.add(new ColumnDescriptor("c1", DataType.BPCHAR.getOID(), 1, "char", null));
  context.setTupleDescription(columnDescriptors);
  resolver.initialize(context);

  List<OneField> fields = new ArrayList<>();
  fields.add(new OneField(DataType.TEXT.getOID(), varchar));
  // the whitespace after 'abc ' needs to be trimmed
  fields.add(new OneField(DataType.TEXT.getOID(), inputChar));
  OneRow row = resolver.setFields(fields);
  assertNotNull(row);
  Object data = row.getData();
  assertNotNull(data);
  assertTrue(data instanceof Group);
  Group group = (Group) data;

  // assert column values
  assertEquals(varchar, group.getString(0, 0));
  assertEquals(expectedChar, group.getString(1, 0));

  // assert value repetition count
  for (int i = 0; i < 2; i++) {
    assertEquals(1, group.getFieldRepetitionCount(i));
  }
}
Example #17
Source File: ParquetGroupConverter.java From dremio-oss with Apache License 2.0 | 5 votes |
Converter defaultGroupConverter(String fieldName, OutputMutator mutator, GroupType groupType,
                                Collection<SchemaPath> c, List<Field> arrowSchema) {
  if (groupType.getOriginalType() == OriginalType.LIST && LogicalListL1Converter.isSupportedSchema(groupType)) {
    return new LogicalListL1Converter(
        columnResolver,
        fieldName,
        mutator,
        getWriterProvider(),
        groupType,
        c,
        options,
        arrowSchema,
        schemaHelper
    );
  }

  final String nameForChild = getNameForChild(columnResolver.getBatchSchemaColumnName(fieldName));
  final StructWriter struct;
  if (groupType.isRepetition(REPEATED)) {
    if (arrowSchema != null) {
      //TODO assert this should never occur at this level
      // only parquet writer that writes arrowSchema doesn't write repeated fields except
      // as part of a LOGICAL LIST, thus this scenario (repeated + arrow schema present) can
      // only happen in LogicalList converter
      arrowSchema = handleRepeatedField(arrowSchema, groupType);
    }
    struct = list(nameForChild).struct();
  } else {
    struct = getWriterProvider().struct(nameForChild);
  }

  return new StructGroupConverter(columnResolver, fieldName, mutator, struct, groupType, c, options,
      arrowSchema, schemaHelper);
}
Example #18
Source File: ParquetRowiseReader.java From dremio-oss with Apache License 2.0 | 5 votes |
private void verifyDecimalTypesAreSame(OutputMutator output, ParquetColumnResolver columnResolver) {
  for (ValueVector vector : output.getVectors()) {
    Field fieldInSchema = vector.getField();
    if (fieldInSchema.getType().getTypeID() == ArrowType.ArrowTypeID.Decimal) {
      ArrowType.Decimal typeInTable = (ArrowType.Decimal) fieldInSchema.getType();
      Type typeInParquet = null;
      // the field in arrow schema may not be present in hive schema
      try {
        typeInParquet = schema.getType(columnResolver.getParquetColumnName(fieldInSchema.getName()));
      } catch (InvalidRecordException e) {
      }
      if (typeInParquet == null) {
        continue;
      }
      boolean schemaMisMatch = true;
      OriginalType originalType = typeInParquet.getOriginalType();
      if (originalType.equals(OriginalType.DECIMAL)) {
        int precision = typeInParquet.asPrimitiveType().getDecimalMetadata().getPrecision();
        int scale = typeInParquet.asPrimitiveType().getDecimalMetadata().getScale();
        ArrowType decimalType = new ArrowType.Decimal(precision, scale);
        if (decimalType.equals(typeInTable)) {
          schemaMisMatch = false;
        }
      }
      if (schemaMisMatch) {
        throw UserException.schemaChangeError()
            .message("Mixed types " + fieldInSchema.getType() + " , " + typeInParquet + " is not supported.")
            .build(logger);
      }
    }
  }
}
Example #19
Source File: TestParquetMetadataConverter.java From parquet-mr with Apache License 2.0 | 5 votes |
@Test
public void testLogicalTypesBackwardCompatibleWithConvertedTypes() {
  ParquetMetadataConverter parquetMetadataConverter = new ParquetMetadataConverter();
  MessageType expected = Types.buildMessage()
      .required(PrimitiveTypeName.BINARY)
      .as(OriginalType.DECIMAL).precision(9).scale(2)
      .named("aBinaryDecimal")
      .named("Message");
  List<SchemaElement> parquetSchema = parquetMetadataConverter.toParquetSchema(expected);
  // Set logical type field to null to test backward compatibility with files written by older API,
  // where converted_types are written to the metadata, but logicalType is missing
  parquetSchema.get(1).setLogicalType(null);
  MessageType schema = parquetMetadataConverter.fromParquetSchema(parquetSchema, null);
  assertEquals(expected, schema);
}
Example #20
Source File: ParquetTypeHelper.java From dremio-oss with Apache License 2.0 | 5 votes |
public static Optional<Field> toField(final Type parquetField, final SchemaDerivationHelper schemaHelper) {
  if (parquetField.isPrimitive()) {
    SchemaPath columnSchemaPath = SchemaPath.getCompoundPath(parquetField.getName());
    return Optional.of(createField(columnSchemaPath, parquetField.asPrimitiveType(),
        parquetField.getOriginalType(), schemaHelper));
  }

  // Handle non-primitive cases
  final GroupType complexField = (GroupType) parquetField;
  if (OriginalType.LIST == complexField.getOriginalType()) {
    GroupType repeatedField = (GroupType) complexField.getFields().get(0); // should have only one child field type
    if (repeatedField.isPrimitive() || !repeatedField.isRepetition(REPEATED) ||
        repeatedField.asGroupType().getFields().size() != 1) {
      throw UserException.unsupportedError()
          .message("Parquet List Type is expected to contain only one sub type. Column '%s' contains %d",
              parquetField.getName(), complexField.getFieldCount())
          .build();
    }
    Optional<Field> subField = toField(repeatedField.getFields().get(0), schemaHelper);
    return subField.map(sf -> new Field(complexField.getName(), true, new ArrowType.List(),
        Arrays.asList(new Field[] {sf})));
  }

  final boolean isStructType = complexField.getOriginalType() == null;
  if (isStructType) {
    // it is struct
    return toComplexField(complexField, new ArrowType.Struct(), schemaHelper);
  }

  // Unsupported complex type
  return Optional.empty();
}
Example #21
Source File: UnifiedParquetReader.java From dremio-oss with Apache License 2.0 | 5 votes |
private boolean checkIfDecimalIsVectorizable(Type parquetField, ColumnChunkMetaData metadata) {
  if (parquetField.asPrimitiveType().getOriginalType() != OriginalType.DECIMAL) {
    return true;
  }
  return context.getOptions().getOption(PlannerSettings.ENABLE_VECTORIZED_PARQUET_DECIMAL);
}
Example #22
Source File: Metadata.java From dremio-oss with Apache License 2.0 | 5 votes |
private OriginalType getOriginalType(Type type, String[] path, int depth) {
  if (type.isPrimitive()) {
    return type.getOriginalType();
  }
  Type t = ((GroupType) type).getType(path[depth]);
  return getOriginalType(t, path, depth + 1);
}
Example #23
Source File: TestParquetMetadataConverter.java From parquet-mr with Apache License 2.0 | 5 votes |
private void testUseStatsWithSignedSortOrder(StatsHelper helper) {
  // override defaults and use stats that were accumulated using signed order
  Configuration conf = new Configuration();
  conf.setBoolean("parquet.strings.signed-min-max.enabled", true);

  ParquetMetadataConverter converter = new ParquetMetadataConverter(conf);
  BinaryStatistics stats = new BinaryStatistics();
  stats.incrementNumNulls();
  stats.updateStats(Binary.fromString("A"));
  stats.incrementNumNulls();
  stats.updateStats(Binary.fromString("z"));
  stats.incrementNumNulls();

  PrimitiveType binaryType = Types.required(PrimitiveTypeName.BINARY)
      .as(OriginalType.UTF8).named("b");
  Statistics convertedStats = converter.fromParquetStatistics(
      Version.FULL_VERSION,
      helper.toParquetStatistics(stats),
      binaryType);

  Assert.assertFalse("Stats should not be empty", convertedStats.isEmpty());
  Assert.assertTrue(convertedStats.isNumNullsSet());
  Assert.assertEquals("Should have 3 nulls", 3, convertedStats.getNumNulls());
  if (helper == StatsHelper.V1) {
    assertFalse("Min-max should be null for V1 stats", convertedStats.hasNonNullValue());
  } else {
    Assert.assertEquals("Should have correct min (unsigned sort)",
        Binary.fromString("A"), convertedStats.genericGetMin());
    Assert.assertEquals("Should have correct max (unsigned sort)",
        Binary.fromString("z"), convertedStats.genericGetMax());
  }
}
Example #24
Source File: ParquetResolverTest.java From pxf with Apache License 2.0 | 5 votes |
@Test
public void testGetFields_Primitive_RepeatedString() throws IOException {
  List<Type> columns = new ArrayList<>();
  columns.add(new PrimitiveType(Type.Repetition.REPEATED, PrimitiveTypeName.BINARY, "myString", OriginalType.UTF8));
  schema = new MessageType("TestProtobuf.StringArray", columns);
  context.setMetadata(schema);
  context.setTupleDescription(getColumnDescriptorsFromSchema(schema));
  resolver.initialize(context);

  List<Group> groups = readParquetFile("proto-repeated-string.parquet", 3, schema);
  List<OneField> fields;

  // row 0
  fields = assertRow(groups, 0, 1);
  assertEquals(DataType.TEXT.getOID(), fields.get(0).type);
  assertEquals("[\"hello\",\"world\"]", fields.get(0).val);

  // row 1
  fields = assertRow(groups, 1, 1);
  assertEquals(DataType.TEXT.getOID(), fields.get(0).type);
  assertEquals("[\"good\",\"bye\"]", fields.get(0).val);

  // row 2
  fields = assertRow(groups, 2, 1);
  assertEquals(DataType.TEXT.getOID(), fields.get(0).type);
  assertEquals("[\"one\",\"two\",\"three\"]", fields.get(0).val);
}
Example #25
Source File: ParquetRecordFilterBuilder.java From pxf with Apache License 2.0 | 5 votes |
private static Integer getIntegerForINT32(OriginalType originalType, OperandNode valueOperand) {
  if (valueOperand == null) return null;
  if (originalType == OriginalType.DATE) {
    // Number of days since epoch
    LocalDate localDateValue = LocalDate.parse(valueOperand.toString());
    LocalDate epoch = LocalDate.ofEpochDay(0);
    return (int) ChronoUnit.DAYS.between(epoch, localDateValue);
  }
  return Integer.parseInt(valueOperand.toString());
}
Example #26
Source File: ParquetTypeVisitor.java From iceberg with Apache License 2.0 | 5 votes |
public static <T> T visit(Type type, ParquetTypeVisitor<T> visitor) {
  if (type instanceof MessageType) {
    return visitor.message((MessageType) type, visitFields(type.asGroupType(), visitor));
  } else if (type.isPrimitive()) {
    return visitor.primitive(type.asPrimitiveType());
  } else {
    // if not a primitive, the typeId must be a group
    GroupType group = type.asGroupType();
    OriginalType annotation = group.getOriginalType();
    if (annotation != null) {
      switch (annotation) {
        case LIST:
          return visitList(group, visitor);
        case MAP:
          return visitMap(group, visitor);
        default:
      }
    }
    return visitor.struct(group, visitFields(group, visitor));
  }
}
Example #27
Source File: TestMetadataReader.java From presto with Apache License 2.0 | 5 votes |
@Test(dataProvider = "allCreatedBy")
public void testReadNullStats(Optional<String> fileCreatedBy) {
  // integer
  assertThat(MetadataReader.readStats(fileCreatedBy, Optional.empty(), new PrimitiveType(OPTIONAL, INT32, "Test column")))
      .isInstanceOfSatisfying(
          IntStatistics.class,
          columnStatistics -> assertTrue(columnStatistics.isEmpty()));

  // bigint
  assertThat(MetadataReader.readStats(fileCreatedBy, Optional.empty(), new PrimitiveType(OPTIONAL, INT64, "Test column")))
      .isInstanceOfSatisfying(
          LongStatistics.class,
          columnStatistics -> assertTrue(columnStatistics.isEmpty()));

  // varchar
  assertThat(MetadataReader.readStats(fileCreatedBy, Optional.empty(), new PrimitiveType(OPTIONAL, BINARY, "Test column", OriginalType.UTF8)))
      .isInstanceOfSatisfying(
          BinaryStatistics.class,
          columnStatistics -> assertTrue(columnStatistics.isEmpty()));

  // varbinary
  assertThat(MetadataReader.readStats(fileCreatedBy, Optional.empty(), new PrimitiveType(OPTIONAL, BINARY, "Test column")))
      .isInstanceOfSatisfying(
          BinaryStatistics.class,
          columnStatistics -> assertTrue(columnStatistics.isEmpty()));
}
Example #28
Source File: MetadataUtils.java From parquet-mr with Apache License 2.0 | 5 votes |
private static void showDetails(PrettyPrintWriter out, PrimitiveType type, int depth, MessageType container,
                                List<String> cpath, boolean showOriginalTypes) {
  String name = Strings.repeat(".", depth) + type.getName();
  Repetition rep = type.getRepetition();
  PrimitiveTypeName ptype = type.getPrimitiveTypeName();

  out.format("%s: %s %s", name, rep, ptype);
  if (showOriginalTypes) {
    OriginalType otype;
    try {
      otype = type.getOriginalType();
    } catch (Exception e) {
      otype = null;
    }
    if (otype != null) out.format(" O:%s", otype);
  } else {
    LogicalTypeAnnotation ltype = type.getLogicalTypeAnnotation();
    if (ltype != null) out.format(" L:%s", ltype);
  }

  if (container != null) {
    cpath.add(type.getName());
    String[] paths = cpath.toArray(new String[0]);
    cpath.remove(cpath.size() - 1);

    ColumnDescriptor desc = container.getColumnDescription(paths);

    int defl = desc.getMaxDefinitionLevel();
    int repl = desc.getMaxRepetitionLevel();
    out.format(" R:%d D:%d", repl, defl);
  }
  out.println();
}
Example #29
Source File: TestParquetMetadataConverter.java From parquet-mr with Apache License 2.0 | 5 votes |
@Test
public void testSkippedV2Stats() {
  testSkippedV2Stats(
      Types.optional(PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY).length(12).as(OriginalType.INTERVAL).named(""),
      new BigInteger("12345678"),
      new BigInteger("12345679"));
  testSkippedV2Stats(Types.optional(PrimitiveTypeName.INT96).named(""),
      new BigInteger("-75687987"),
      new BigInteger("45367657"));
}
Example #30
Source File: ParquetTableMetadataUtils.java From Bats with Apache License 2.0 | 5 votes |
/**
 * Converts specified {@link MetadataBase.RowGroupMetadata} into the map of {@link ColumnStatistics}
 * instances with column names as keys.
 *
 * @param tableMetadata    the source of column types
 * @param rowGroupMetadata metadata to convert
 * @return map with converted row group metadata
 */
@SuppressWarnings("unchecked")
private static Map<SchemaPath, ColumnStatistics> getRowGroupColumnStatistics(
    MetadataBase.ParquetTableMetadataBase tableMetadata, MetadataBase.RowGroupMetadata rowGroupMetadata) {

  Map<SchemaPath, ColumnStatistics> columnsStatistics = new HashMap<>();

  for (MetadataBase.ColumnMetadata column : rowGroupMetadata.getColumns()) {
    SchemaPath colPath = SchemaPath.getCompoundPath(column.getName());

    Long nulls = column.getNulls();
    if (!column.isNumNullsSet() || nulls == null) {
      nulls = GroupScan.NO_COLUMN_STATS;
    }
    PrimitiveType.PrimitiveTypeName primitiveType = getPrimitiveTypeName(tableMetadata, column);
    OriginalType originalType = getOriginalType(tableMetadata, column);
    Comparator comparator = getComparator(primitiveType, originalType);

    Map<StatisticsKind, Object> statistics = new HashMap<>();
    statistics.put(ColumnStatisticsKind.MIN_VALUE, getValue(column.getMinValue(), primitiveType, originalType));
    statistics.put(ColumnStatisticsKind.MAX_VALUE, getValue(column.getMaxValue(), primitiveType, originalType));
    statistics.put(ColumnStatisticsKind.NULLS_COUNT, nulls);
    columnsStatistics.put(colPath, new ColumnStatisticsImpl(statistics, comparator));
  }
  columnsStatistics.putAll(populateNonInterestingColumnsStats(columnsStatistics.keySet(), tableMetadata));
  return columnsStatistics;
}