Java Code Examples for org.apache.parquet.schema.GroupType#getType()

The following examples show how to use org.apache.parquet.schema.GroupType#getType(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: TestDataWritableWriter.java    From presto with Apache License 2.0 6 votes vote down vote up
/**
 * Writes a list as a single group containing one repeated field, named after the
 * first field of {@code type}, with one value per list element.
 *
 * @param value the list object to write
 * @param inspector inspector used to read the list and to obtain the element inspector
 * @param type group type whose first field describes the array element
 * @throws IllegalArgumentException if any element of the list is {@code null}
 */
private void writeSingleLevelArray(Object value, ListObjectInspector inspector, GroupType type)
{
    // The first field of the group describes the array element
    Type itemType = type.getType(0);

    recordConsumer.startGroup();

    List<?> items = inspector.getList(value);
    if (items.isEmpty()) {
        // Nothing to emit between the group markers
        recordConsumer.endGroup();
        return;
    }

    String itemFieldName = itemType.getName();
    recordConsumer.startField(itemFieldName, 0);
    ObjectInspector itemInspector = inspector.getListElementObjectInspector();

    for (Object item : items) {
        if (item == null) {
            throw new IllegalArgumentException("Array elements are requires in given schema definition");
        }
        writeValue(item, itemInspector, itemType);
    }

    recordConsumer.endField(itemFieldName, 0);
    recordConsumer.endGroup();
}
 
Example 2
Source File: AvroWriteSupport.java    From parquet-mr with Apache License 2.0 6 votes vote down vote up
/**
 * Writes every element of {@code collection} inside the repeated list field.
 * An empty collection produces no output at all.
 *
 * @param type list group; its first field is the repeated group wrapping the element
 * @param schema schema supplying the element type and the name used in error messages
 * @param collection the elements to write
 * @throws RuntimeException if an element is {@code null} and the element type is not OPTIONAL
 */
@Override
protected void writeCollection(GroupType type, Schema schema, Collection<?> collection) {
  if (collection.isEmpty()) {
    return;
  }

  recordConsumer.startField(LIST_REPEATED_NAME, 0);
  Type elementType = type.getType(0).asGroupType().getType(0);
  for (Object item : collection) {
    recordConsumer.startGroup(); // repeated group array, middle layer
    if (item == null) {
      // A null element is only legal when the element type is optional
      if (!elementType.isRepetition(Type.Repetition.OPTIONAL)) {
        throw new RuntimeException(
            "Null list element for " + schema.getName());
      }
    } else {
      recordConsumer.startField(LIST_ELEMENT_NAME, 0);
      writeValue(elementType, schema.getElementType(), item);
      recordConsumer.endField(LIST_ELEMENT_NAME, 0);
    }
    recordConsumer.endGroup();
  }
  recordConsumer.endField(LIST_REPEATED_NAME, 0);
}
 
Example 3
Source File: ColumnIOFactory.java    From parquet-mr with Apache License 2.0 6 votes vote down vote up
/**
 * Visits each field of {@code groupType} that is also present in
 * {@code requestedGroupType}, with {@code newIO} installed as the current
 * column IO for the duration of the traversal.
 *
 * @param newIO column IO that becomes {@code current} while the children are visited
 * @param groupType group whose fields are iterated
 * @param requestedGroupType group used to select fields and to check repetition compatibility
 */
private void visitChildren(GroupColumnIO newIO, GroupType groupType, GroupType requestedGroupType) {
  GroupColumnIO previousIO = current;
  current = newIO;
  for (Type field : groupType.getFields()) {
    String fieldName = field.getName();
    if (!requestedGroupType.containsField(fieldName)) {
      // field is not part of the requested group: skip it
      continue;
    }
    currentRequestedIndex = requestedGroupType.getFieldIndex(fieldName);
    currentRequestedType = requestedGroupType.getType(currentRequestedIndex);
    boolean tooRestrictive =
        currentRequestedType.getRepetition().isMoreRestrictiveThan(field.getRepetition());
    if (tooRestrictive) {
      incompatibleSchema(field, currentRequestedType);
    }
    field.accept(this);
  }
  current = previousIO;
}
 
Example 4
Source File: TestDataWritableWriter.java    From presto with Apache License 2.0 6 votes vote down vote up
/**
 * Emits every non-null field of a struct value to the RecordConsumer.
 * A {@code null} struct writes nothing, and individual null fields are skipped.
 *
 * @param value The struct value whose fields are written.
 * @param inspector The object inspector used to extract the field values and field inspectors.
 * @param type Group schema giving the name and type of each field by position.
 */
private void writeGroupFields(Object value, StructObjectInspector inspector, GroupType type)
{
    if (value == null) {
        return;
    }

    List<? extends StructField> structFields = inspector.getAllStructFieldRefs();
    List<Object> structValues = inspector.getStructFieldsDataAsList(value);

    for (int index = 0; index < type.getFieldCount(); index++) {
        Type fieldType = type.getType(index);
        String fieldName = fieldType.getName();
        Object fieldValue = structValues.get(index);
        if (fieldValue == null) {
            // null fields are simply not emitted
            continue;
        }

        ObjectInspector fieldInspector = structFields.get(index).getFieldObjectInspector();
        recordConsumer.startField(fieldName, index);
        writeValue(fieldValue, fieldInspector, fieldType);
        recordConsumer.endField(fieldName, index);
    }
}
 
Example 5
Source File: PigParquetReader.java    From iceberg with Apache License 2.0 5 votes vote down vote up
/**
 * Builds an {@code ArrayReader} for a list group.
 *
 * <p>The repeated group's levels are looked up at the current path, the element's
 * definition level at the element's path, and each is reduced by one before use.
 *
 * @param expectedList expected list type (not read by this method)
 * @param array list group; its first field is the repeated group
 * @param elementReader reader for the element values
 * @return a reader over the list's elements
 */
@Override
public ParquetValueReader<?> list(
    Types.ListType expectedList, GroupType array, ParquetValueReader<?> elementReader) {
  GroupType repeatedGroup = array.getFields().get(0).asGroupType();
  String[] pathToRepeated = currentPath();

  int repeatedDefLevel = type.getMaxDefinitionLevel(pathToRepeated) - 1;
  int repeatedRepLevel = type.getMaxRepetitionLevel(pathToRepeated) - 1;

  Type element = repeatedGroup.getType(0);
  String[] pathToElement = path(element.getName());
  int elementDefLevel = type.getMaxDefinitionLevel(pathToElement) - 1;

  return new ArrayReader<>(
      repeatedDefLevel,
      repeatedRepLevel,
      ParquetValueReaders.option(element, elementDefLevel, elementReader));
}
 
Example 6
Source File: TupleConverter.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
public TupleConverter(GroupType parquetSchema) {
  int schemaSize = parquetSchema.getFieldCount();

  this.converters = new Converter[schemaSize];
  for (int i = 0; i < schemaSize; i++) {
    Type type = parquetSchema.getType(i);
    converters[i] = newConverter(type, i);
  }
}
 
Example 7
Source File: ThriftRecordConverter.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Sets up conversion for a map field: records the Thrift key and value types and
 * creates the child converter for the single nested field.
 *
 * @param parentEvents event list kept by this converter
 * @param parquetSchema map group; must contain exactly one field
 * @param field Thrift field whose type is cast to {@code MapType}
 * @throws IllegalArgumentException if {@code parquetSchema} does not have exactly one field
 */
MapConverter(List<TProtocol> parentEvents, GroupType parquetSchema, ThriftField field) {
  this.parentEvents = parentEvents;

  final int fieldCount = parquetSchema.getFieldCount();
  if (fieldCount != 1) {
    throw new IllegalArgumentException("maps have only one field. " + parquetSchema + " size = " + fieldCount);
  }
  final Type keyValueType = parquetSchema.getType(0);

  // Cast once and read both sides of the map from the same MapType
  final MapType mapType = (MapType) field.getType();
  final ThriftField keyField = mapType.getKey();
  final ThriftField valueField = mapType.getValue();

  keyType = keyField.getType().getType().getThriftType();
  valueType = valueField.getType().getType().getThriftType();
  child = new GroupCounter(new MapKeyValueConverter(mapEvents, keyValueType, keyField, valueField));
}
 
Example 8
Source File: PigSchemaConverter.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Filters the single nested field of a bag group against the bag's field schema.
 *
 * @param bagType bag group; must contain exactly one field
 * @param bagFieldSchema field schema to filter with
 * @return {@code bagType} with its field replaced by the filtered nested type
 * @throws RuntimeException if {@code bagType} does not have exactly one field
 * @throws FrontendException if a field lookup on the schema fails
 */
private Type filterBag(GroupType bagType, FieldSchema bagFieldSchema) throws FrontendException {
  if (LOG.isDebugEnabled()) {
    LOG.debug("filtering BAG schema:\n" + bagType + "\nwith:\n " + bagFieldSchema);
  }
  if (bagType.getFieldCount() != 1) {
    throw new RuntimeException("not unwrapping the right type, this should be a Bag: " + bagType);
  }

  Type nested = bagType.getType(0);
  FieldSchema tupleField = bagFieldSchema.schema.getField(0);

  // Primitive, map and list elements were wrapped in an extra tuple
  boolean wrappedInTuple = nested.isPrimitive();
  if (!wrappedInTuple) {
    LogicalTypeAnnotation annotation = nested.getLogicalTypeAnnotation();
    wrappedInTuple = annotation instanceof LogicalTypeAnnotation.MapLogicalTypeAnnotation
        || annotation instanceof LogicalTypeAnnotation.ListLogicalTypeAnnotation;
  }

  // Bags always contain tuples => skip the extra tuple that was inserted in that case.
  FieldSchema innerField = wrappedInTuple ? tupleField.schema.getField(0) : tupleField;
  return bagType.withNewFields(filter(nested, innerField));
}
 
Example 9
Source File: GenericParquetReaders.java    From iceberg with Apache License 2.0 5 votes vote down vote up
/**
 * Builds a {@code ListReader} for a list group.
 *
 * <p>Each level is the schema's maximum level at the relevant path minus one:
 * the current path for the repeated group, the element's path for the element.
 *
 * @param expectedList expected list type (not read by this method)
 * @param array list group; its first field is the repeated group
 * @param elementReader reader for the element values
 * @return a reader over the list's elements
 */
@Override
public ParquetValueReader<?> list(Types.ListType expectedList, GroupType array,
                                  ParquetValueReader<?> elementReader) {
  GroupType repeatedGroup = array.getFields().get(0).asGroupType();
  String[] pathToRepeated = currentPath();

  int repeatedDefLevel = type.getMaxDefinitionLevel(pathToRepeated) - 1;
  int repeatedRepLevel = type.getMaxRepetitionLevel(pathToRepeated) - 1;

  Type element = repeatedGroup.getType(0);
  String[] pathToElement = path(element.getName());
  int elementDefLevel = type.getMaxDefinitionLevel(pathToElement) - 1;

  return new ListReader<>(
      repeatedDefLevel, repeatedRepLevel, option(element, elementDefLevel, elementReader));
}
 
Example 10
Source File: DataWritableWriter.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Writes the non-null entries of {@code arr} as the fields of {@code type}, by position.
 * Primitive fields go through {@code writePrimitive}; group fields are wrapped in a
 * group and written via {@code writeArray} (REPEATED) or a recursive call.
 *
 * @param arr values to write; {@code null} writes nothing
 * @param type group schema describing each position of {@code arr}
 * @throws ParquetEncodingException if a group field's value is not an {@code ArrayWritable}
 */
private void writeData(final ArrayWritable arr, final GroupType type) {
  if (arr == null) {
    return;
  }
  final int fieldCount = type.getFieldCount();
  final Writable[] values = arr.get();
  for (int index = 0; index < fieldCount; ++index) {
    final Type fieldType = type.getType(index);
    final String fieldName = fieldType.getName();
    final Writable fieldValue = values[index];
    if (fieldValue == null) {
      // null values are not emitted
      continue;
    }

    recordConsumer.startField(fieldName, index);
    if (fieldType.isPrimitive()) {
      writePrimitive(fieldValue);
    } else {
      recordConsumer.startGroup();
      if (!(fieldValue instanceof ArrayWritable)) {
        // fieldValue is known to be non-null at this point
        throw new ParquetEncodingException("This should be an ArrayWritable or MapWritable: " + fieldValue);
      }
      final ArrayWritable nestedValue = (ArrayWritable) fieldValue;
      final GroupType nestedType = fieldType.asGroupType();
      if (nestedType.getRepetition().equals(Type.Repetition.REPEATED)) {
        writeArray(nestedValue, nestedType);
      } else {
        writeData(nestedValue, nestedType);
      }
      recordConsumer.endGroup();
    }
    recordConsumer.endField(fieldName, index);
  }
}
 
Example 11
Source File: ParquetTypeVisitor.java    From iceberg with Apache License 2.0 5 votes vote down vote up
/**
 * Validates a list group and visits it, invoking the visitor's before/after hooks
 * around the repeated group and around its element field.
 *
 * @param list list group: must not be repeated and must have exactly one field
 * @param visitor visitor receiving the callbacks
 * @return the result of {@code visitor.list}; the element result passed to it is
 *         {@code null} when the repeated group has no fields
 */
private static <T> T visitList(GroupType list, ParquetTypeVisitor<T> visitor) {
  Preconditions.checkArgument(!list.isRepetition(Type.Repetition.REPEATED),
      "Invalid list: top-level group is repeated: %s", list);
  Preconditions.checkArgument(list.getFieldCount() == 1,
      "Invalid list: does not contain single repeated field: %s", list);

  final GroupType repeated = list.getFields().get(0).asGroupType();
  Preconditions.checkArgument(repeated.isRepetition(Type.Repetition.REPEATED),
      "Invalid list: inner group is not repeated");
  final int repeatedFieldCount = repeated.getFieldCount();
  Preconditions.checkArgument(repeatedFieldCount <= 1,
      "Invalid list: repeated group is not a single field: %s", list);

  visitor.beforeRepeatedElement(repeated);
  try {
    T result = null;
    if (repeatedFieldCount > 0) {
      final Type element = repeated.getType(0);
      visitor.beforeElementField(element);
      try {
        result = visit(element, visitor);
      } finally {
        // always paired with beforeElementField, even when the visit throws
        visitor.afterElementField(element);
      }
    }

    return visitor.list(list, result);

  } finally {
    // always paired with beforeRepeatedElement, even when a callback throws
    visitor.afterRepeatedElement(repeated);
  }
}
 
Example 12
Source File: ParquetAvroWriter.java    From iceberg with Apache License 2.0 5 votes vote down vote up
@Override
public ParquetValueWriter<?> list(GroupType array, ParquetValueWriter<?> elementWriter) {
  GroupType repeated = array.getFields().get(0).asGroupType();
  String[] repeatedPath = currentPath();

  int repeatedD = type.getMaxDefinitionLevel(repeatedPath);
  int repeatedR = type.getMaxRepetitionLevel(repeatedPath);

  org.apache.parquet.schema.Type elementType = repeated.getType(0);
  int elementD = type.getMaxDefinitionLevel(path(elementType.getName()));

  return collections(repeatedD, repeatedR, option(elementType, elementD, elementWriter));
}
 
Example 13
Source File: AvroRecordConverter.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Creates the converter for the first field of {@code repeatedType}; each converted
 * value is stored in this converter's {@code element} field.
 *
 * @param repeatedType repeated group whose first field is the element
 * @param elementSchema element schema; its non-null form is used for conversion
 * @param model data model passed through to the created converter
 */
public ElementConverter(GroupType repeatedType, Schema elementSchema, GenericData model) {
  final Type elementType = repeatedType.getType(0);
  final Schema nonNullSchema = AvroSchemaConverter.getNonNull(elementSchema);

  // Receives each converted value and stores it on the enclosing converter
  final ParentValueContainer elementSink = new ParentValueContainer() {
    @Override
    @SuppressWarnings("unchecked")
    public void add(Object value) {
      ElementConverter.this.element = value;
    }
  };

  this.elementConverter = newConverter(nonNullSchema, elementType, model, elementSink);
}
 
Example 14
Source File: GenericParquetWriter.java    From iceberg with Apache License 2.0 5 votes vote down vote up
@Override
public ParquetValueWriter<?> list(GroupType array, ParquetValueWriter<?> elementWriter) {
  GroupType repeated = array.getFields().get(0).asGroupType();
  String[] repeatedPath = currentPath();

  int repeatedD = type.getMaxDefinitionLevel(repeatedPath);
  int repeatedR = type.getMaxRepetitionLevel(repeatedPath);

  org.apache.parquet.schema.Type elementType = repeated.getType(0);
  int elementD = type.getMaxDefinitionLevel(path(elementType.getName()));

  return ParquetValueWriters.collections(repeatedD, repeatedR,
      ParquetValueWriters.option(elementType, elementD, elementWriter));
}
 
Example 15
Source File: ParquetAvroValueReaders.java    From iceberg with Apache License 2.0 5 votes vote down vote up
@Override
public ParquetValueReader<?> list(Types.ListType expectedList, GroupType array,
                                  ParquetValueReader<?> elementReader) {
  GroupType repeated = array.getFields().get(0).asGroupType();
  String[] repeatedPath = currentPath();

  int repeatedD = type.getMaxDefinitionLevel(repeatedPath)-1;
  int repeatedR = type.getMaxRepetitionLevel(repeatedPath)-1;

  Type elementType = repeated.getType(0);
  int elementD = type.getMaxDefinitionLevel(path(elementType.getName()))-1;

  return new ListReader<>(repeatedD, repeatedR, option(elementType, elementD, elementReader));
}
 
Example 16
Source File: TupleConverter.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Sets up conversion for a bag: picks the Pig field schema for the nested type and
 * creates the child converter, whose values are appended to {@code buffer}.
 *
 * @param parquetSchema bag group; must contain exactly one field
 * @param pigSchema Pig field schema of the bag
 * @param parent container kept as this converter's parent
 * @param numbersDefaultToZero passed through to the child converter
 * @param columnIndexAccess passed through to the child converter
 * @throws IllegalArgumentException if {@code parquetSchema} does not have exactly one field
 * @throws FrontendException if a field lookup on the Pig schema fails
 */
BagConverter(GroupType parquetSchema, FieldSchema pigSchema, ParentValueContainer parent, boolean numbersDefaultToZero, boolean columnIndexAccess) throws FrontendException {
  this.parent = parent;

  final int fieldCount = parquetSchema.getFieldCount();
  if (fieldCount != 1) {
    throw new IllegalArgumentException("bags have only one field. " + parquetSchema + " size = " + fieldCount);
  }
  final Type nestedType = parquetSchema.getType(0);

  boolean needsTupleWrapper = nestedType.isPrimitive();
  if (!needsTupleWrapper) {
    LogicalTypeAnnotation annotation = nestedType.getLogicalTypeAnnotation();
    needsTupleWrapper = annotation instanceof LogicalTypeAnnotation.MapLogicalTypeAnnotation
        || annotation instanceof LogicalTypeAnnotation.ListLogicalTypeAnnotation;
  }

  final ParentValueContainer childsParent;
  final FieldSchema pigField;
  if (needsTupleWrapper) {
    // Pig bags always contain tuples, so the value is wrapped in an extra tuple
    childsParent = new ParentValueContainer() {
      @Override
      void add(Object value) {
        buffer.add(TF.newTuple(value));
      }
    };
    pigField = pigSchema.schema.getField(0).schema.getField(0);
  } else {
    // The converted value is cast to a Tuple and stored directly
    childsParent = new ParentValueContainer() {
      @Override
      void add(Object value) {
        buffer.add((Tuple) value);
      }
    };
    pigField = pigSchema.schema.getField(0);
  }
  child = newConverter(pigField, nestedType, childsParent, numbersDefaultToZero, columnIndexAccess);
}
 
Example 17
Source File: AvroWriteSupportInt96Avro17.java    From datacollector with Apache License 2.0 5 votes vote down vote up
/**
 * Writes a map as a wrapper group containing one repeated group per entry.
 * An empty map produces only the wrapper group.
 *
 * @param schema map group; its first field is the repeated group holding key (field 0)
 *               and value (field 1)
 * @param avroSchema schema supplying the value type and the name used in error messages
 * @param map entries to write
 * @throws RuntimeException if a value is {@code null} and the value type is not OPTIONAL
 */
private <V> void writeMap(GroupType schema, Schema avroSchema,
    Map<CharSequence, V> map) {
  GroupType keyValueGroup = schema.getType(0).asGroupType();
  Type keyType = keyValueGroup.getType(0);
  Type valueType = keyValueGroup.getType(1);

  recordConsumer.startGroup(); // group wrapper (original type MAP)
  if (!map.isEmpty()) {
    recordConsumer.startField(MAP_REPEATED_NAME, 0);

    for (Map.Entry<CharSequence, V> entry : map.entrySet()) {
      recordConsumer.startGroup(); // repeated group key_value, middle layer

      recordConsumer.startField(MAP_KEY_NAME, 0);
      writeValue(keyType, MAP_KEY_SCHEMA, entry.getKey());
      recordConsumer.endField(MAP_KEY_NAME, 0);

      V entryValue = entry.getValue();
      if (entryValue == null) {
        // A null value is only legal when the value type is optional
        if (!valueType.isRepetition(Type.Repetition.OPTIONAL)) {
          throw new RuntimeException("Null map value for " + avroSchema.getName());
        }
      } else {
        recordConsumer.startField(MAP_VALUE_NAME, 1);
        writeValue(valueType, avroSchema.getValueType(), entryValue);
        recordConsumer.endField(MAP_VALUE_NAME, 1);
      }

      recordConsumer.endGroup();
    }

    recordConsumer.endField(MAP_REPEATED_NAME, 0);
  }
  recordConsumer.endGroup();
}
 
Example 18
Source File: AvroWriteSupportInt96Avro18.java    From datacollector with Apache License 2.0 5 votes vote down vote up
/**
 * Writes a map as a wrapper group containing one repeated group per entry.
 * An empty map produces only the wrapper group.
 *
 * @param schema map group; its first field is the repeated group holding key (field 0)
 *               and value (field 1)
 * @param avroSchema schema supplying the value type and the name used in error messages
 * @param map entries to write
 * @throws RuntimeException if a value is {@code null} and the value type is not OPTIONAL
 */
private <V> void writeMap(GroupType schema, Schema avroSchema,
    Map<CharSequence, V> map) {
  GroupType entryGroup = schema.getType(0).asGroupType();
  Type keyType = entryGroup.getType(0);
  Type valueType = entryGroup.getType(1);

  recordConsumer.startGroup(); // group wrapper (original type MAP)
  if (map.isEmpty()) {
    // no entries: close the wrapper group right away
    recordConsumer.endGroup();
    return;
  }

  recordConsumer.startField(MAP_REPEATED_NAME, 0);
  for (Map.Entry<CharSequence, V> mapEntry : map.entrySet()) {
    recordConsumer.startGroup(); // repeated group key_value, middle layer

    recordConsumer.startField(MAP_KEY_NAME, 0);
    writeValue(keyType, MAP_KEY_SCHEMA, mapEntry.getKey());
    recordConsumer.endField(MAP_KEY_NAME, 0);

    V mapValue = mapEntry.getValue();
    boolean hasValue = mapValue != null;
    if (hasValue) {
      recordConsumer.startField(MAP_VALUE_NAME, 1);
      writeValue(valueType, avroSchema.getValueType(), mapValue);
      recordConsumer.endField(MAP_VALUE_NAME, 1);
    } else if (!valueType.isRepetition(Type.Repetition.OPTIONAL)) {
      throw new RuntimeException("Null map value for " + avroSchema.getName());
    }

    recordConsumer.endGroup();
  }
  recordConsumer.endField(MAP_REPEATED_NAME, 0);

  recordConsumer.endGroup();
}
 
Example 19
Source File: ParquetAsJsonInputFormat.java    From iow-hadoop-streaming with Apache License 2.0 4 votes vote down vote up
/**
 * Writes a group as a JSON object, one JSON field per schema field, recursing into
 * nested groups. REPEATED fields are written as JSON arrays.
 *
 * <p>If reading a field raises an exception whose message starts with
 * {@code "not found"} and the field is OPTIONAL, a JSON null is written instead.
 * Any other exception is rethrown wrapped in a {@link RuntimeException}.
 *
 * @param currentGenerator generator receiving the JSON output
 * @param grp group to serialize
 * @throws IOException declared for the generator's write calls
 * @throws RuntimeException for an unsupported primitive type or any other failure
 *         while reading a field
 */
private void groupToJson(JsonGenerator currentGenerator, SimpleGroup grp)
      throws IOException {

    GroupType gt = grp.getType();

    currentGenerator.writeStartObject();
    for(int i = 0; i < gt.getFieldCount(); i ++) {

        String field = gt.getFieldName(i);
        try {
            Type t = gt.getType(i);
            int repetition = 1;
            boolean repeated = false;
            if (t.getRepetition() == Type.Repetition.REPEATED) {
                repeated = true;
                repetition = grp.getFieldRepetitionCount(i);
                currentGenerator.writeArrayFieldStart(field);
            }
            else {
                currentGenerator.writeFieldName(field);
            }

            for(int j = 0; j < repetition; j ++) {

                if (t.isPrimitive()) {
                    switch (t.asPrimitiveType().getPrimitiveTypeName()) {
                        case BINARY:
                            currentGenerator.writeString(grp.getString(i, j));
                            break;
                        case INT32:
                            currentGenerator.writeNumber(grp.getInteger(i, j));
                            break;
                        case INT96:
                        case INT64:
                            // clumsy way - TODO - Subclass SimpleGroup or something like that
                            // NOTE(review): INT96 shares this path; confirm its string form parses as a long
                            currentGenerator.writeNumber(Long.parseLong(grp.getValueToString(i, j)));
                            break;
                        case DOUBLE:
                        case FLOAT:
                            currentGenerator.writeNumber(Double.parseDouble(grp.getValueToString(i, j)));
                            break;
                        case BOOLEAN:
                            currentGenerator.writeBoolean(grp.getBoolean(i, j));
                            break;
                        default:
                            throw new RuntimeException("Can't handle type " + gt.getType(i));
                    }
                } else {
                    groupToJson(currentGenerator, (SimpleGroup) grp.getGroup(i, j));
                }
            }

            if (repeated) {
                currentGenerator.writeEndArray();
            }
        }
        catch (Exception e) {
            // getMessage() may be null; guard it so the handler cannot raise an NPE
            // that would hide the original failure.
            String message = e.getMessage();
            boolean valueNotFound = message != null && message.startsWith("not found");
            if (valueNotFound && gt.getType(i).getRepetition() == Type.Repetition.OPTIONAL) {
                currentGenerator.writeNull();
            }
            else {
                throw new RuntimeException(e);
            }
        }
    }
    currentGenerator.writeEndObject();
}
 
Example 20
Source File: ParquetTypeVisitor.java    From iceberg with Apache License 2.0 4 votes vote down vote up
/**
 * Dispatches a Parquet type to the matching visitor callback: {@code message},
 * {@code primitive}, {@code list}, {@code map} or {@code struct}.
 *
 * <p>Groups annotated LIST or MAP are validated first. While their children are
 * visited, the repeated group's name is pushed on {@code visitor.fieldNames} and
 * popped again afterwards, even if a callback throws.
 *
 * @param type the type to visit
 * @param visitor visitor receiving the callbacks
 * @return the value returned by the visitor callback that handled {@code type}
 */
public static <T> T visit(Type type, ParquetTypeVisitor<T> visitor) {
  if (type instanceof MessageType) {
    return visitor.message((MessageType) type,
        visitFields(type.asGroupType(), visitor));

  } else if (type.isPrimitive()) {
    return visitor.primitive(type.asPrimitiveType());

  } else {
    // if not a primitive, the typeId must be a group
    GroupType group = type.asGroupType();
    OriginalType annotation = group.getOriginalType();
    if (annotation != null) {
      switch (annotation) {
        case LIST:
          Preconditions.checkArgument(!group.isRepetition(REPEATED),
              "Invalid list: top-level group is repeated: " + group);
          Preconditions.checkArgument(group.getFieldCount() == 1,
              "Invalid list: does not contain single repeated field: " + group);

          GroupType repeatedElement = group.getFields().get(0).asGroupType();
          Preconditions.checkArgument(repeatedElement.isRepetition(REPEATED),
              "Invalid list: inner group is not repeated");
          Preconditions.checkArgument(repeatedElement.getFieldCount() <= 1,
              "Invalid list: repeated group is not a single field: " + group);

          visitor.fieldNames.push(repeatedElement.getName());
          try {
            T elementResult = null;
            if (repeatedElement.getFieldCount() > 0) {
              elementResult = visitField(repeatedElement.getType(0), visitor);
            }

            return visitor.list(group, elementResult);

          } finally {
            visitor.fieldNames.pop();
          }

        case MAP:
          Preconditions.checkArgument(!group.isRepetition(REPEATED),
              "Invalid map: top-level group is repeated: " + group);
          Preconditions.checkArgument(group.getFieldCount() == 1,
              "Invalid map: does not contain single repeated field: " + group);

          GroupType repeatedKeyValue = group.getType(0).asGroupType();
          Preconditions.checkArgument(repeatedKeyValue.isRepetition(REPEATED),
              "Invalid map: inner group is not repeated");
          Preconditions.checkArgument(repeatedKeyValue.getFieldCount() <= 2,
              "Invalid map: repeated group does not have 2 fields");

          visitor.fieldNames.push(repeatedKeyValue.getName());
          try {
            T keyResult = null;
            T valueResult = null;
            switch (repeatedKeyValue.getFieldCount()) {
              case 2:
                // if there are 2 fields, both key and value are projected
                keyResult = visitField(repeatedKeyValue.getType(0), visitor);
                valueResult = visitField(repeatedKeyValue.getType(1), visitor);
                // must not fall through: case 1 would visit field 0 again and could
                // overwrite valueResult with the key's result
                break;
              case 1:
                // if there is just one, use the name to determine what it is
                Type keyOrValue = repeatedKeyValue.getType(0);
                if (keyOrValue.getName().equalsIgnoreCase("key")) {
                  keyResult = visitField(keyOrValue, visitor);
                  // value result remains null
                } else {
                  valueResult = visitField(keyOrValue, visitor);
                  // key result remains null
                }
                break;
              default:
                // both results will remain null
                break;
            }

            return visitor.map(group, keyResult, valueResult);

          } finally {
            visitor.fieldNames.pop();
          }

        default:
          // any other annotation is handled as a plain struct below
          break;
      }
    }

    return visitor.struct(group, visitFields(group, visitor));
  }
}