org.apache.parquet.schema.OriginalType Java Examples

The following examples show how to use org.apache.parquet.schema.OriginalType, the enum parquet-mr uses to attach converted-type (logical) meaning to primitive and group types; in newer parquet-mr releases it has been superseded by LogicalTypeAnnotation (compare Example #28). Each example is drawn from an open-source project, with the source file and license noted above the snippet.
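As a quick orientation before the project-specific snippets, here is a minimal, self-contained sketch (not taken from any of the projects below; the class and field names are illustrative) that builds a Parquet schema annotated with OriginalType through the parquet-mr Types builder, the same builder used in several examples on this page:

import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.OriginalType;
import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName;
import org.apache.parquet.schema.Types;

public class OriginalTypeDemo {
  public static void main(String[] args) {
    // Annotate primitive fields with converted-type (OriginalType) metadata.
    MessageType schema = Types.buildMessage()
        .required(PrimitiveTypeName.BINARY).as(OriginalType.UTF8).named("name")
        .optional(PrimitiveTypeName.INT32).as(OriginalType.DATE).named("birthday")
        .optional(PrimitiveTypeName.INT64).as(OriginalType.TIMESTAMP_MILLIS).named("updated_at")
        .named("User");
    System.out.println(schema); // prints the schema in Parquet message syntax
  }
}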
Example #1
Source File: AvroSchemaConverter190Int96Avro17.java    From datacollector with Apache License 2.0
private OriginalType convertLogicalTypeStr(String logicalType) {
  if (logicalType == null) {
    return null;
  } else if (AvroTypeUtil.LOGICAL_TYPE_DECIMAL.equals(logicalType)) {
    return OriginalType.DECIMAL;
  } else if (AvroTypeUtil.LOGICAL_TYPE_DATE.equals(logicalType)) {
    return OriginalType.DATE;
  } else if (AvroTypeUtil.LOGICAL_TYPE_TIME_MILLIS.equals(logicalType)) {
    return OriginalType.TIME_MILLIS;
  // the micros variants are commented out in this Avro 1.7 converter (compare the Avro 1.8 variant in Example #10):
  //} else if (AvroTypeUtil.LOGICAL_TYPE_TIME_MICROS.equals(logicalType)) {
  //  return OriginalType.TIME_MICROS;
  } else if (AvroTypeUtil.LOGICAL_TYPE_TIMESTAMP_MILLIS.equals(logicalType)) {
    return OriginalType.TIMESTAMP_MILLIS;
  //} else if (AvroTypeUtil.LOGICAL_TYPE_TIMESTAMP_MICROS.equals(logicalType)) {
  //  return OriginalType.TIMESTAMP_MICROS;
  }
  return null;
}
 
Example #2
Source File: TestParquetMetadataConverter.java    From parquet-mr with Apache License 2.0
@Test
public void testV2StatsEqualMinMax() {
  testV2StatsEqualMinMax(Types.optional(PrimitiveTypeName.INT32).as(OriginalType.UINT_8).named(""),
      93,
      93);
  testV2StatsEqualMinMax(Types.optional(PrimitiveTypeName.INT32).as(OriginalType.UINT_16).named(""),
      -5892,
      -5892);
  testV2StatsEqualMinMax(Types.optional(PrimitiveTypeName.INT32).as(OriginalType.UINT_32).named(""),
      234998934,
      234998934);
  testV2StatsEqualMinMax(Types.optional(PrimitiveTypeName.INT64).as(OriginalType.UINT_64).named(""),
      -2389943895984985L,
      -2389943895984985L);
  testV2StatsEqualMinMax(Types.optional(PrimitiveTypeName.BINARY).as(OriginalType.DECIMAL).precision(6).named(""),
      new BigInteger("823749"),
      new BigInteger("823749"));
  testV2StatsEqualMinMax(
      Types.optional(PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY).length(14).as(OriginalType.DECIMAL).precision(7)
          .named(""),
      new BigInteger("-8752832"),
      new BigInteger("-8752832"));
  testV2StatsEqualMinMax(Types.optional(PrimitiveTypeName.INT96).named(""),
      new BigInteger("81032984"),
      new BigInteger("81032984"));
}
 
Example #3
Source File: ParquetRecordWriter.java    From Bats with Apache License 2.0
protected PrimitiveType getPrimitiveType(MaterializedField field) {
  MinorType minorType = field.getType().getMinorType();
  String name = field.getName();
  int length = ParquetTypeHelper.getLengthForMinorType(minorType);
  PrimitiveTypeName primitiveTypeName = ParquetTypeHelper.getPrimitiveTypeNameForMinorType(minorType);
  if (Types.isDecimalType(minorType)) {
    primitiveTypeName = logicalTypeForDecimals;
    if (usePrimitiveTypesForDecimals) {
      if (field.getPrecision() <= ParquetTypeHelper.getMaxPrecisionForPrimitiveType(PrimitiveTypeName.INT32)) {
        primitiveTypeName = PrimitiveTypeName.INT32;
      } else if (field.getPrecision() <= ParquetTypeHelper.getMaxPrecisionForPrimitiveType(PrimitiveTypeName.INT64)) {
        primitiveTypeName = PrimitiveTypeName.INT64;
      }
    }

    length = DecimalUtility.getMaxBytesSizeForPrecision(field.getPrecision());
  }

  Repetition repetition = ParquetTypeHelper.getRepetitionForDataMode(field.getDataMode());
  OriginalType originalType = ParquetTypeHelper.getOriginalTypeForMinorType(minorType);
  DecimalMetadata decimalMetadata = ParquetTypeHelper.getDecimalMetadataForField(field);
  return new PrimitiveType(repetition, primitiveTypeName, length, name, originalType, decimalMetadata, null);
}
 
Example #4
Source File: TestParquetMetadataConverter.java    From parquet-mr with Apache License 2.0
private void testStillUseStatsWithSignedSortOrderIfSingleValue(StatsHelper helper) {
  ParquetMetadataConverter converter = new ParquetMetadataConverter();
  BinaryStatistics stats = new BinaryStatistics();
  stats.incrementNumNulls();
  stats.updateStats(Binary.fromString("A"));
  stats.incrementNumNulls();
  stats.updateStats(Binary.fromString("A"));
  stats.incrementNumNulls();

  PrimitiveType binaryType = Types.required(PrimitiveTypeName.BINARY).as(OriginalType.UTF8).named("b");
  Statistics convertedStats = converter.fromParquetStatistics(
      Version.FULL_VERSION,
      ParquetMetadataConverter.toParquetStatistics(stats),
      binaryType);

  Assert.assertFalse("Stats should not be empty: " + convertedStats, convertedStats.isEmpty());
  Assert.assertArrayEquals("min == max: " + convertedStats, convertedStats.getMaxBytes(), convertedStats.getMinBytes());
}
 
Example #5
Source File: ParquetGroupConverter.java    From dremio-oss with Apache License 2.0
Converter groupConverterFromArrowSchema(String fieldName, String groupTypeName, GroupType groupType, Collection<SchemaPath> c) {
  final String nameForChild = getNameForChild(fieldName);
  final Field arrowField = Schema.findField(arrowSchema, groupTypeName);
  final ArrowTypeID arrowTypeType = arrowField.getType().getTypeID();
  final List<Field> arrowChildren = arrowField.getChildren();
  if (arrowTypeType == ArrowTypeID.Union) {
    // if it's a union we will add the children directly to the parent
    return new UnionGroupConverter(columnResolver, fieldName, mutator, getWriterProvider(), groupType, c, options, arrowChildren, nameForChild,
        schemaHelper);
  } else if (arrowTypeType == ArrowTypeID.List) {
    // make sure the parquet schema matches the arrow schema and delegate handling the logical list to defaultGroupConverter()
    Preconditions.checkState(groupType.getOriginalType() == OriginalType.LIST, "parquet schema doesn't match the arrow schema for LIST " + nameForChild);
  }

  return defaultGroupConverter(fieldName, mutator, groupType, c, arrowChildren);
}
 
Example #6
Source File: TestPigSchemaConverter.java    From parquet-mr with Apache License 2.0
@Test
public void testListsOfPrimitive() throws Exception {
  for (Type.Repetition repetition : Type.Repetition.values()) {
    for (Type.Repetition valueRepetition : Type.Repetition.values()) {
      for (PrimitiveType.PrimitiveTypeName primitiveTypeName : PrimitiveType.PrimitiveTypeName.values()) {
        if (primitiveTypeName != PrimitiveType.PrimitiveTypeName.INT96) { // INT96 is NYI
          Types.PrimitiveBuilder<PrimitiveType> value = Types.primitive(primitiveTypeName, valueRepetition);
          if (primitiveTypeName == PrimitiveType.PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY)
            value.length(1);
          GroupType type = Types.buildGroup(repetition).addField(value.named("b")).as(OriginalType.LIST).named("a");
          pigSchemaConverter.convertField(type); // no exceptions, please
        }
      }
    }
  }
}
 
Example #7
Source File: TestMetadataReader.java    From presto with Apache License 2.0
@Test(dataProvider = "allCreatedBy")
public void testReadStatsBinaryUtf8(Optional<String> fileCreatedBy)
{
    PrimitiveType varchar = new PrimitiveType(OPTIONAL, BINARY, "Test column", OriginalType.UTF8);
    Statistics statistics;

    // Stats written by Parquet after https://issues.apache.org/jira/browse/PARQUET-1025
    statistics = new Statistics();
    statistics.setNull_count(13);
    statistics.setMin_value("a".getBytes(UTF_8));
    statistics.setMax_value("é".getBytes(UTF_8));
    assertThat(MetadataReader.readStats(fileCreatedBy, Optional.of(statistics), varchar))
            .isInstanceOfSatisfying(BinaryStatistics.class, columnStatistics -> {
                assertEquals(columnStatistics.getNumNulls(), 13);
                assertEquals(columnStatistics.getMin().getBytes(), new byte[] {'a'});
                assertEquals(columnStatistics.getMax().getBytes(), new byte[] {(byte) 0xC3, (byte) 0xA9});
                assertEquals(columnStatistics.getMinBytes(), new byte[] {'a'});
                assertEquals(columnStatistics.getMaxBytes(), new byte[] {(byte) 0xC3, (byte) 0xA9});
                assertEquals(columnStatistics.genericGetMin().getBytes(), new byte[] {'a'});
                assertEquals(columnStatistics.genericGetMax().getBytes(), new byte[] {(byte) 0xC3, (byte) 0xA9});
            });
}
 
Example #8
Source File: TestParquetParser.java    From parquet-mr with Apache License 2.0
@Test
public void testIDs() {
  String message =
      "message Message {\n" +
      "  required binary string (UTF8) = 6;\n" +
      "  required int32 i=1;\n" +
      "  required binary s2= 3;\n" +
      "  required binary s3 =4;\n" +
      "}\n";

  MessageType parsed = parseMessageType(message);
  MessageType expected = buildMessage()
      .required(BINARY).as(OriginalType.UTF8).id(6).named("string")
      .required(INT32).id(1).named("i")
      .required(BINARY).id(3).named("s2")
      .required(BINARY).id(4).named("s3")
      .named("Message");

  assertEquals(expected, parsed);
  MessageType reparsed = parseMessageType(parsed.toString());
  assertEquals(expected, reparsed);
}
 
Example #9
Source File: ExaParquetWriterImpl.java    From hadoop-etl-udfs with MIT License
static private List<Type> typeInfoToParquetTypes(final List<ExaParquetTypeInfo> exaParquetTypeInfos) {
    List<Type> types = new ArrayList<>();
    for (ExaParquetTypeInfo exaType: exaParquetTypeInfos) {
        if (exaType.length != 0) {
            types.add(new PrimitiveType(
                    Type.Repetition.valueOf(exaType.typeRepitition),
                    PrimitiveType.PrimitiveTypeName.valueOf(exaType.primitiveTypeName),
                    exaType.length,
                    exaType.name));
        } else {
            types.add(new PrimitiveType(
                    Type.Repetition.valueOf(exaType.typeRepitition),
                    PrimitiveType.PrimitiveTypeName.valueOf(exaType.primitiveTypeName),
                    exaType.name,
                    exaType.originalType == null ? null : OriginalType.valueOf(exaType.originalType)));
        }
    }
    return types;
}
 
Example #10
Source File: AvroSchemaConverter190Int96Avro18.java    From datacollector with Apache License 2.0
private OriginalType convertLogicalType(LogicalType logicalType) {
  if (logicalType == null) {
    return null;
  } else if (logicalType instanceof LogicalTypes.Decimal) {
    return OriginalType.DECIMAL;
  } else if (logicalType instanceof LogicalTypes.Date) {
    return OriginalType.DATE;
  } else if (logicalType instanceof LogicalTypes.TimeMillis) {
    return OriginalType.TIME_MILLIS;
  } else if (logicalType instanceof LogicalTypes.TimeMicros) {
    return OriginalType.TIME_MICROS;
  } else if (logicalType instanceof LogicalTypes.TimestampMillis) {
    return OriginalType.TIMESTAMP_MILLIS;
  } else if (logicalType instanceof LogicalTypes.TimestampMicros) {
    return OriginalType.TIMESTAMP_MICROS;
  }
  return null;
}
 
Example #11
Source File: TestParquetMetadataConverter.java    From parquet-mr with Apache License 2.0
@Test
public void testIgnoreStatsWithSignedSortOrder() {
  ParquetMetadataConverter converter = new ParquetMetadataConverter();
  BinaryStatistics stats = new BinaryStatistics();
  stats.incrementNumNulls();
  stats.updateStats(Binary.fromString("A"));
  stats.incrementNumNulls();
  stats.updateStats(Binary.fromString("z"));
  stats.incrementNumNulls();

  PrimitiveType binaryType = Types.required(PrimitiveTypeName.BINARY)
      .as(OriginalType.UTF8).named("b");
  Statistics convertedStats = converter.fromParquetStatistics(
      Version.FULL_VERSION,
      StatsHelper.V1.toParquetStatistics(stats),
      binaryType);

  Assert.assertFalse("Stats should not include min/max: " + convertedStats, convertedStats.hasNonNullValue());
  Assert.assertTrue("Stats should have null count: " + convertedStats, convertedStats.isNumNullsSet());
  Assert.assertEquals("Stats should have 3 nulls: " + convertedStats, 3L, convertedStats.getNumNulls());
}
 
Example #12
Source File: TestParquetMetadataConverter.java    From parquet-mr with Apache License 2.0
@Test
public void testV2OnlyStats() {
  testV2OnlyStats(Types.optional(PrimitiveTypeName.INT32).as(OriginalType.UINT_8).named(""),
      0x7F,
      0x80);
  testV2OnlyStats(Types.optional(PrimitiveTypeName.INT32).as(OriginalType.UINT_16).named(""),
      0x7FFF,
      0x8000);
  testV2OnlyStats(Types.optional(PrimitiveTypeName.INT32).as(OriginalType.UINT_32).named(""),
      0x7FFFFFFF,
      0x80000000);
  testV2OnlyStats(Types.optional(PrimitiveTypeName.INT64).as(OriginalType.UINT_64).named(""),
      0x7FFFFFFFFFFFFFFFL,
      0x8000000000000000L);
  testV2OnlyStats(Types.optional(PrimitiveTypeName.BINARY).as(OriginalType.DECIMAL).precision(6).named(""),
      new BigInteger("-765875"),
      new BigInteger("876856"));
  testV2OnlyStats(
      Types.optional(PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY).length(14).as(OriginalType.DECIMAL).precision(7)
          .named(""),
      new BigInteger("-6769643"),
      new BigInteger("9864675"));
}
 
Example #13
Source File: AvroSchemaConverter190Int96Avro17.java    From datacollector with Apache License 2.0
private Schema addLogicalTypeStrToSchema(
    Schema schema,
    OriginalType annotation,
    PrimitiveType asPrimitive,
    PrimitiveType.PrimitiveTypeName parquetPrimitiveTypeName
) {
  Map<String, String> logicalType = convertOriginalTypeToMap(annotation, asPrimitive.getDecimalMetadata());
  if (logicalType != null && (annotation != DECIMAL ||
      parquetPrimitiveTypeName == BINARY ||
      parquetPrimitiveTypeName == FIXED_LEN_BYTE_ARRAY)) {
    for(Map.Entry<String, String> entry : logicalType.entrySet()) {
      schema.addProp(entry.getKey(), entry.getValue());
    }
  }

  return schema;
}
 
Example #14
Source File: ParquetTableMetadataUtils.java    From Bats with Apache License 2.0
/**
 * Populates the non-interesting columns' statistics.
 * @param schemaPaths column paths which should be ignored
 * @param parquetTableMetadata the source of column metadata for the non-interesting columns' statistics
 * @return map of statistics for the non-interesting columns
 */
@SuppressWarnings("unchecked")
public static Map<SchemaPath, ColumnStatistics> populateNonInterestingColumnsStats(
        Set<SchemaPath> schemaPaths, MetadataBase.ParquetTableMetadataBase parquetTableMetadata) {
  Map<SchemaPath, ColumnStatistics> columnsStatistics = new HashMap<>();
  if (parquetTableMetadata instanceof Metadata_V4.ParquetTableMetadata_v4) {
    for (Metadata_V4.ColumnTypeMetadata_v4 columnTypeMetadata :
        ((Metadata_V4.ParquetTableMetadata_v4) parquetTableMetadata).getColumnTypeInfoMap().values()) {
      SchemaPath schemaPath = SchemaPath.getCompoundPath(columnTypeMetadata.name);
      if (!schemaPaths.contains(schemaPath)) {
        Map<StatisticsKind, Object> statistics = new HashMap<>();
        statistics.put(ColumnStatisticsKind.NULLS_COUNT, GroupScan.NO_COLUMN_STATS);
        PrimitiveType.PrimitiveTypeName primitiveType = columnTypeMetadata.primitiveType;
        OriginalType originalType = columnTypeMetadata.originalType;
        Comparator comparator = getComparator(primitiveType, originalType);
        columnsStatistics.put(schemaPath, new ColumnStatisticsImpl<>(statistics, comparator));
      }
    }
  }
  return columnsStatistics;
}
 
Example #15
Source File: MapKeyValuesSchemaConverter.java    From presto with Apache License 2.0
private static GroupType listWrapper(Repetition repetition, String alias, OriginalType originalType, Type nested)
{
    if (!nested.isRepetition(Repetition.REPEATED)) {
        throw new IllegalArgumentException("Nested type should be repeated: " + nested);
    }
    return new GroupType(repetition, alias, originalType, nested);
}
 
Example #16
Source File: ParquetResolverTest.java    From pxf with Apache License 2.0
private void testSetFields_RightTrimCharHelper(String varchar, String inputChar, String expectedChar) throws IOException {
    List<Type> typeFields = new ArrayList<>();
    typeFields.add(new PrimitiveType(Type.Repetition.OPTIONAL, PrimitiveTypeName.BINARY, "vc1", OriginalType.UTF8));
    typeFields.add(new PrimitiveType(Type.Repetition.OPTIONAL, PrimitiveTypeName.BINARY, "c1", OriginalType.UTF8));
    schema = new MessageType("hive_schema", typeFields);
    context.setMetadata(schema);

    List<ColumnDescriptor> columnDescriptors = new ArrayList<>();
    columnDescriptors.add(new ColumnDescriptor("vc1", DataType.VARCHAR.getOID(), 0, "varchar", null));
    columnDescriptors.add(new ColumnDescriptor("c1", DataType.BPCHAR.getOID(), 1, "char", null));
    context.setTupleDescription(columnDescriptors);

    resolver.initialize(context);

    List<OneField> fields = new ArrayList<>();
    fields.add(new OneField(DataType.TEXT.getOID(), varchar));
    // the trailing whitespace of the char input (e.g. 'abc   ') needs to be trimmed
    fields.add(new OneField(DataType.TEXT.getOID(), inputChar));

    OneRow row = resolver.setFields(fields);
    assertNotNull(row);
    Object data = row.getData();
    assertNotNull(data);
    assertTrue(data instanceof Group);
    Group group = (Group) data;

    // assert column values
    assertEquals(varchar, group.getString(0, 0));
    assertEquals(expectedChar, group.getString(1, 0));

    // assert value repetition count
    for (int i = 0; i < 2; i++) {
        assertEquals(1, group.getFieldRepetitionCount(i));
    }
}
 
Example #17
Source File: ParquetGroupConverter.java    From dremio-oss with Apache License 2.0
Converter defaultGroupConverter(String fieldName, OutputMutator mutator, GroupType groupType,
                                Collection<SchemaPath> c, List<Field> arrowSchema) {

  if (groupType.getOriginalType() == OriginalType.LIST && LogicalListL1Converter.isSupportedSchema(groupType)) {
    return new LogicalListL1Converter(
      columnResolver,
      fieldName,
      mutator,
      getWriterProvider(),
      groupType,
      c,
      options,
      arrowSchema,
      schemaHelper
    );
  }

  final String nameForChild = getNameForChild(columnResolver.getBatchSchemaColumnName(fieldName));
  final StructWriter struct;
  if (groupType.isRepetition(REPEATED)) {
    if (arrowSchema != null) {
      // TODO: assert that this should never occur at this level.
      // The only parquet writer that writes an arrowSchema does not emit repeated fields except
      // as part of a logical LIST, so this scenario (repeated + arrow schema present) can
      // only happen in the LogicalList converter.
      arrowSchema = handleRepeatedField(arrowSchema, groupType);
    }
    struct = list(nameForChild).struct();
  } else {
    struct = getWriterProvider().struct(nameForChild);
  }

  return new StructGroupConverter(columnResolver, fieldName, mutator, struct, groupType, c, options, arrowSchema, schemaHelper);
}
 
Example #18
Source File: ParquetRowiseReader.java    From dremio-oss with Apache License 2.0
private void verifyDecimalTypesAreSame(OutputMutator output, ParquetColumnResolver columnResolver) {
  for (ValueVector vector : output.getVectors()) {
    Field fieldInSchema = vector.getField();
    if (fieldInSchema.getType().getTypeID() == ArrowType.ArrowTypeID.Decimal) {
      ArrowType.Decimal typeInTable = (ArrowType.Decimal) fieldInSchema.getType();
      Type typeInParquet = null;
      // the field in arrow schema may not be present in hive schema
      try {
        typeInParquet = schema.getType(columnResolver.getParquetColumnName(fieldInSchema.getName()));
      } catch (InvalidRecordException e) {
        // ignored: the column is not present in the parquet schema
      }
      if (typeInParquet == null) {
        continue;
      }
      boolean schemaMisMatch = true;
      OriginalType originalType = typeInParquet.getOriginalType();
      if (originalType == OriginalType.DECIMAL) {
        int precision = typeInParquet
          .asPrimitiveType().getDecimalMetadata().getPrecision();
        int scale = typeInParquet.asPrimitiveType().getDecimalMetadata().getScale();
        ArrowType decimalType = new ArrowType.Decimal(precision, scale);
        if (decimalType.equals(typeInTable)) {
          schemaMisMatch = false;
        }
      }
      if (schemaMisMatch) {
        throw UserException.schemaChangeError()
          .message("Mixed types " + fieldInSchema.getType() + " and " + typeInParquet + " are not supported.")
          .build(logger);
      }
    }
  }
}
 
Example #19
Source File: TestParquetMetadataConverter.java    From parquet-mr with Apache License 2.0
@Test
public void testLogicalTypesBackwardCompatibleWithConvertedTypes() {
  ParquetMetadataConverter parquetMetadataConverter = new ParquetMetadataConverter();
  MessageType expected = Types.buildMessage()
    .required(PrimitiveTypeName.BINARY)
    .as(OriginalType.DECIMAL).precision(9).scale(2)
    .named("aBinaryDecimal")
    .named("Message");
  List<SchemaElement> parquetSchema = parquetMetadataConverter.toParquetSchema(expected);
  // Set logical type field to null to test backward compatibility with files written by older API,
  // where converted_types are written to the metadata, but logicalType is missing
  parquetSchema.get(1).setLogicalType(null);
  MessageType schema = parquetMetadataConverter.fromParquetSchema(parquetSchema, null);
  assertEquals(expected, schema);
}
 
Example #20
Source File: ParquetTypeHelper.java    From dremio-oss with Apache License 2.0
public static Optional<Field> toField(final Type parquetField, final SchemaDerivationHelper schemaHelper) {
  if (parquetField.isPrimitive()) {
    SchemaPath columnSchemaPath = SchemaPath.getCompoundPath(parquetField.getName());
    return Optional.of(createField(columnSchemaPath, parquetField.asPrimitiveType(), parquetField.getOriginalType(), schemaHelper));
  }

  // Handle non-primitive cases
  final GroupType complexField = (GroupType) parquetField;
  if (OriginalType.LIST == complexField.getOriginalType()) {
    final Type repeatedField = complexField.getFields().get(0);

    // should have only one child field type
    if (repeatedField.isPrimitive() || !repeatedField.isRepetition(REPEATED) || repeatedField.asGroupType().getFields().size() != 1) {
      throw UserException.unsupportedError()
        .message("Parquet List Type is expected to contain only one sub type. Column '%s' contains %d", parquetField.getName(), complexField.getFieldCount())
        .build();
    }

    Optional<Field> subField = toField(repeatedField.asGroupType().getFields().get(0), schemaHelper);
    return subField.map(sf -> new Field(complexField.getName(), true, new ArrowType.List(), Arrays.asList(new Field[] {sf})));
  }

  final boolean isStructType = complexField.getOriginalType() == null;
  if (isStructType) { // it is struct
    return toComplexField(complexField, new ArrowType.Struct(), schemaHelper);
  }

  // Unsupported complex type
  return Optional.empty();
}
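For context, here is a hedged sketch (the names are illustrative, not taken from the dremio source) of the standard three-level LIST layout that the check above expects, built with the same Types builder used in the other examples on this page:

// optional group my_list (LIST) { repeated group list { optional int32 element; } }
GroupType listType = Types.optionalGroup()
    .as(OriginalType.LIST)
    .addField(Types.repeatedGroup()
        .addField(Types.optional(PrimitiveTypeName.INT32).named("element"))
        .named("list"))
    .named("my_list");

Any other shape (a primitive child, a non-repeated child, or a repeated group with more than one field) is rejected by the method above with an unsupportedError.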
 
Example #21
Source File: UnifiedParquetReader.java    From dremio-oss with Apache License 2.0
private boolean checkIfDecimalIsVectorizable(Type parquetField, ColumnChunkMetaData metadata) {
  if (parquetField.asPrimitiveType().getOriginalType() != OriginalType.DECIMAL) {
    return true;
  }

  return context.getOptions().getOption(PlannerSettings.ENABLE_VECTORIZED_PARQUET_DECIMAL);
}
 
Example #22
Source File: Metadata.java    From dremio-oss with Apache License 2.0
private OriginalType getOriginalType(Type type, String[] path, int depth) {
  if (type.isPrimitive()) {
    return type.getOriginalType();
  }
  Type t = ((GroupType) type).getType(path[depth]);
  return getOriginalType(t, path, depth + 1);
}
 
Example #23
Source File: TestParquetMetadataConverter.java    From parquet-mr with Apache License 2.0
private void testUseStatsWithSignedSortOrder(StatsHelper helper) {
  // override defaults and use stats that were accumulated using signed order
  Configuration conf = new Configuration();
  conf.setBoolean("parquet.strings.signed-min-max.enabled", true);

  ParquetMetadataConverter converter = new ParquetMetadataConverter(conf);
  BinaryStatistics stats = new BinaryStatistics();
  stats.incrementNumNulls();
  stats.updateStats(Binary.fromString("A"));
  stats.incrementNumNulls();
  stats.updateStats(Binary.fromString("z"));
  stats.incrementNumNulls();

  PrimitiveType binaryType = Types.required(PrimitiveTypeName.BINARY)
      .as(OriginalType.UTF8).named("b");
  Statistics convertedStats = converter.fromParquetStatistics(
      Version.FULL_VERSION,
      helper.toParquetStatistics(stats),
      binaryType);

  Assert.assertFalse("Stats should not be empty", convertedStats.isEmpty());
  Assert.assertTrue(convertedStats.isNumNullsSet());
  Assert.assertEquals("Should have 3 nulls", 3, convertedStats.getNumNulls());
  if (helper == StatsHelper.V1) {
    assertFalse("Min-max should be null for V1 stats", convertedStats.hasNonNullValue());
  } else {
    Assert.assertEquals("Should have correct min (unsigned sort)",
        Binary.fromString("A"), convertedStats.genericGetMin());
    Assert.assertEquals("Should have correct max (unsigned sort)",
        Binary.fromString("z"), convertedStats.genericGetMax());
  }
}
 
Example #24
Source File: ParquetResolverTest.java    From pxf with Apache License 2.0
@Test
public void testGetFields_Primitive_RepeatedString() throws IOException {
    List<Type> columns = new ArrayList<>();
    columns.add(new PrimitiveType(Type.Repetition.REPEATED, PrimitiveTypeName.BINARY, "myString", OriginalType.UTF8));
    schema = new MessageType("TestProtobuf.StringArray", columns);
    context.setMetadata(schema);
    context.setTupleDescription(getColumnDescriptorsFromSchema(schema));
    resolver.initialize(context);

    List<Group> groups = readParquetFile("proto-repeated-string.parquet", 3, schema);
    List<OneField> fields;

    // row 0
    fields = assertRow(groups, 0, 1);
    assertEquals(DataType.TEXT.getOID(), fields.get(0).type);
    assertEquals("[\"hello\",\"world\"]", fields.get(0).val);

    // row 1
    fields = assertRow(groups, 1, 1);
    assertEquals(DataType.TEXT.getOID(), fields.get(0).type);
    assertEquals("[\"good\",\"bye\"]", fields.get(0).val);

    // row 2
    fields = assertRow(groups, 2, 1);
    assertEquals(DataType.TEXT.getOID(), fields.get(0).type);
    assertEquals("[\"one\",\"two\",\"three\"]", fields.get(0).val);

}
 
Example #25
Source File: ParquetRecordFilterBuilder.java    From pxf with Apache License 2.0
private static Integer getIntegerForINT32(OriginalType originalType, OperandNode valueOperand) {
    if (valueOperand == null) return null;
    if (originalType == OriginalType.DATE) {
        // Number of days since epoch
        LocalDate localDateValue = LocalDate.parse(valueOperand.toString());
        LocalDate epoch = LocalDate.ofEpochDay(0);
        return (int) ChronoUnit.DAYS.between(epoch, localDateValue);
    }
    return Integer.parseInt(valueOperand.toString());
}
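As a hedged aside (not part of the pxf source), the DATE branch above relies on Parquet storing a DATE-annotated INT32 as the number of days since the Unix epoch, which is easy to check in isolation:

// 1970-02-01 is 31 days after the epoch, so a DATE column would store the INT32 value 31.
LocalDate epoch = LocalDate.ofEpochDay(0);
LocalDate value = LocalDate.parse("1970-02-01");
int days = (int) ChronoUnit.DAYS.between(epoch, value);
System.out.println(days); // prints 31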
 
Example #26
Source File: ParquetTypeVisitor.java    From iceberg with Apache License 2.0
public static <T> T visit(Type type, ParquetTypeVisitor<T> visitor) {
  if (type instanceof MessageType) {
    return visitor.message((MessageType) type,
        visitFields(type.asGroupType(), visitor));

  } else if (type.isPrimitive()) {
    return visitor.primitive(type.asPrimitiveType());

  } else {
    // if not a primitive, the typeId must be a group
    GroupType group = type.asGroupType();
    OriginalType annotation = group.getOriginalType();
    if (annotation != null) {
      switch (annotation) {
        case LIST:
          return visitList(group, visitor);

        case MAP:
          return visitMap(group, visitor);

        default:
      }
    }

    return visitor.struct(group, visitFields(group, visitor));
  }
}
 
Example #27
Source File: TestMetadataReader.java    From presto with Apache License 2.0
@Test(dataProvider = "allCreatedBy")
public void testReadNullStats(Optional<String> fileCreatedBy)
{
    // integer
    assertThat(MetadataReader.readStats(fileCreatedBy, Optional.empty(), new PrimitiveType(OPTIONAL, INT32, "Test column")))
            .isInstanceOfSatisfying(
                    IntStatistics.class,
                    columnStatistics -> assertTrue(columnStatistics.isEmpty()));

    // bigint
    assertThat(MetadataReader.readStats(fileCreatedBy, Optional.empty(), new PrimitiveType(OPTIONAL, INT64, "Test column")))
            .isInstanceOfSatisfying(
                    LongStatistics.class,
                    columnStatistics -> assertTrue(columnStatistics.isEmpty()));

    // varchar
    assertThat(MetadataReader.readStats(fileCreatedBy, Optional.empty(), new PrimitiveType(OPTIONAL, BINARY, "Test column", OriginalType.UTF8)))
            .isInstanceOfSatisfying(
                    BinaryStatistics.class,
                    columnStatistics -> assertTrue(columnStatistics.isEmpty()));

    // varbinary
    assertThat(MetadataReader.readStats(fileCreatedBy, Optional.empty(), new PrimitiveType(OPTIONAL, BINARY, "Test column")))
            .isInstanceOfSatisfying(
                    BinaryStatistics.class,
                    columnStatistics -> assertTrue(columnStatistics.isEmpty()));
}
 
Example #28
Source File: MetadataUtils.java    From parquet-mr with Apache License 2.0
private static void showDetails(PrettyPrintWriter out, PrimitiveType type, int depth, MessageType container, List<String> cpath, boolean showOriginalTypes) {
  String name = Strings.repeat(".", depth) + type.getName();
  Repetition rep = type.getRepetition();
  PrimitiveTypeName ptype = type.getPrimitiveTypeName();

  out.format("%s: %s %s", name, rep, ptype);
  if (showOriginalTypes) {
    OriginalType otype;
    try {
      otype = type.getOriginalType();
    } catch (Exception e) {
      otype = null;
    }
    if (otype != null) out.format(" O:%s", otype);
  } else {
    LogicalTypeAnnotation ltype = type.getLogicalTypeAnnotation();
    if (ltype != null) out.format(" L:%s", ltype);
  }

  if (container != null) {
    cpath.add(type.getName());
    String[] paths = cpath.toArray(new String[0]);
    cpath.remove(cpath.size() - 1);

    ColumnDescriptor desc = container.getColumnDescription(paths);

    int defl = desc.getMaxDefinitionLevel();
    int repl = desc.getMaxRepetitionLevel();
    out.format(" R:%d D:%d", repl, defl);
  }
  out.println();
}
 
Example #29
Source File: TestParquetMetadataConverter.java    From parquet-mr with Apache License 2.0
@Test
public void testSkippedV2Stats() {
  testSkippedV2Stats(
      Types.optional(PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY).length(12).as(OriginalType.INTERVAL).named(""),
      new BigInteger("12345678"),
      new BigInteger("12345679"));
  testSkippedV2Stats(Types.optional(PrimitiveTypeName.INT96).named(""),
      new BigInteger("-75687987"),
      new BigInteger("45367657"));
}
 
Example #30
Source File: ParquetTableMetadataUtils.java    From Bats with Apache License 2.0
/**
 * Converts specified {@link MetadataBase.RowGroupMetadata} into the map of {@link ColumnStatistics}
 * instances with column names as keys.
 *
 * @param tableMetadata    the source of column types
 * @param rowGroupMetadata metadata to convert
 * @return map with converted row group metadata
 */
@SuppressWarnings("unchecked")
private static Map<SchemaPath, ColumnStatistics> getRowGroupColumnStatistics(
    MetadataBase.ParquetTableMetadataBase tableMetadata, MetadataBase.RowGroupMetadata rowGroupMetadata) {

  Map<SchemaPath, ColumnStatistics> columnsStatistics = new HashMap<>();

  for (MetadataBase.ColumnMetadata column : rowGroupMetadata.getColumns()) {
    SchemaPath colPath = SchemaPath.getCompoundPath(column.getName());

    Long nulls = column.getNulls();
    if (!column.isNumNullsSet() || nulls == null) {
      nulls = GroupScan.NO_COLUMN_STATS;
    }
    PrimitiveType.PrimitiveTypeName primitiveType = getPrimitiveTypeName(tableMetadata, column);
    OriginalType originalType = getOriginalType(tableMetadata, column);
    Comparator comparator = getComparator(primitiveType, originalType);

    Map<StatisticsKind, Object> statistics = new HashMap<>();
    statistics.put(ColumnStatisticsKind.MIN_VALUE, getValue(column.getMinValue(), primitiveType, originalType));
    statistics.put(ColumnStatisticsKind.MAX_VALUE, getValue(column.getMaxValue(), primitiveType, originalType));
    statistics.put(ColumnStatisticsKind.NULLS_COUNT, nulls);
    columnsStatistics.put(colPath, new ColumnStatisticsImpl(statistics, comparator));
  }
  columnsStatistics.putAll(populateNonInterestingColumnsStats(columnsStatistics.keySet(), tableMetadata));
  return columnsStatistics;
}