Java Code Examples for org.apache.parquet.schema.GroupType#getFieldCount()

The following examples show how to use org.apache.parquet.schema.GroupType#getFieldCount(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: DataWritableGroupConverter.java    From parquet-mr with Apache License 2.0 6 votes vote down vote up
/**
 * Creates a converter for each field of {@code selectedGroupType}, resolving the field's
 * position inside {@code containingGroupType}.
 *
 * @param selectedGroupType   group whose fields each receive a converter
 * @param parent              parent converter, stored as given
 * @param index               index value, stored as given
 * @param containingGroupType group that must contain every selected field; its field count
 *                            sizes the value buffer
 * @throws IllegalStateException if a selected field is not a field of {@code containingGroupType}
 */
public DataWritableGroupConverter(final GroupType selectedGroupType,
    final HiveGroupConverter parent, final int index, final GroupType containingGroupType) {
  this.parent = parent;
  this.index = index;

  // The value buffer is sized by the containing group, the converters by the selected group.
  currentArr = new Object[containingGroupType.getFieldCount()];
  converters = new Converter[selectedGroupType.getFieldCount()];

  int slot = 0;
  for (final Type requested : selectedGroupType.getFields()) {
    if (!containingGroupType.getFields().contains(requested)) {
      throw new IllegalStateException("Group type [" + containingGroupType +
          "] does not contain requested field: " + requested);
    }
    final int positionInContainer = containingGroupType.getFieldIndex(requested.getName());
    converters[slot++] = getConverterFromDescription(requested, positionInContainer, this);
  }
}
 
Example 2
Source File: GroupWriter.java    From parquet-mr with Apache License 2.0 6 votes vote down vote up
/**
 * Writes every field of {@code group} that holds at least one value to the record consumer,
 * recursing into nested groups.
 *
 * @param group record (or nested record) supplying the values
 * @param type  schema describing the fields of {@code group}
 */
private void writeGroup(Group group, GroupType type) {
  final int numFields = type.getFieldCount();
  for (int fieldIndex = 0; fieldIndex < numFields; ++fieldIndex) {
    final int occurrences = group.getFieldRepetitionCount(fieldIndex);
    if (occurrences <= 0) {
      // No values stored: the field is not started or ended at all.
      continue;
    }

    final Type fieldType = type.getType(fieldIndex);
    final String fieldName = fieldType.getName();
    recordConsumer.startField(fieldName, fieldIndex);
    for (int occurrence = 0; occurrence < occurrences; ++occurrence) {
      if (!fieldType.isPrimitive()) {
        // Nested group: wrap the recursive write in group markers.
        recordConsumer.startGroup();
        writeGroup(group.getGroup(fieldIndex, occurrence), fieldType.asGroupType());
        recordConsumer.endGroup();
      } else {
        group.writeValue(fieldIndex, occurrence, recordConsumer);
      }
    }
    recordConsumer.endField(fieldName, fieldIndex);
  }
}
 
Example 3
Source File: AvroRecordConverter.java    From parquet-mr with Apache License 2.0 6 votes vote down vote up
/**
 * Builds one member converter per non-NULL type of the Avro union, pairing each with the
 * parquet field at the same running position.
 *
 * @param parent        container passed to the superclass constructor
 * @param parquetSchema parquet type of the union; converted with {@code asGroupType()}
 * @param avroSchema    Avro union schema whose member types are iterated
 * @param model         data model handed to {@code newConverter}
 */
public AvroUnionConverter(ParentValueContainer parent, Type parquetSchema,
                          Schema avroSchema, GenericData model) {
  super(parent);
  GroupType unionGroup = parquetSchema.asGroupType();
  this.memberConverters = new Converter[unionGroup.getFieldCount()];

  int parquetIndex = 0;
  for (Schema memberSchema : avroSchema.getTypes()) {
    if (memberSchema.getType().equals(Schema.Type.NULL)) {
      // Note: for nulls the parquetIndex is not increased.
      continue;
    }
    Type memberType = unionGroup.getType(parquetIndex);
    // Each member gets its own sink; a second value for the same union is rejected.
    ParentValueContainer memberSink = new ParentValueContainer() {
      @Override
      public void add(Object value) {
        Preconditions.checkArgument(
            AvroUnionConverter.this.memberValue == null,
            "Union is resolving to more than one type");
        memberValue = value;
      }
    };
    memberConverters[parquetIndex] = newConverter(memberSchema, memberType, model, memberSink);
    parquetIndex++;
  }
}
 
Example 4
Source File: AvroIndexedRecordConverter.java    From parquet-mr with Apache License 2.0 6 votes vote down vote up
/**
 * Builds one member converter per non-NULL type of the Avro union, pairing each with the
 * parquet field at the same running position.
 *
 * @param parent        container stored as this converter's parent
 * @param parquetSchema parquet type of the union; converted with {@code asGroupType()}
 * @param avroSchema    Avro union schema whose member types are iterated
 * @param model         data model handed to {@code newConverter}
 */
public AvroUnionConverter(ParentValueContainer parent, Type parquetSchema,
                          Schema avroSchema, GenericData model) {
  this.parent = parent;
  GroupType unionGroup = parquetSchema.asGroupType();
  this.memberConverters = new Converter[unionGroup.getFieldCount()];

  int parquetIndex = 0;
  for (Schema memberSchema : avroSchema.getTypes()) {
    if (memberSchema.getType().equals(Schema.Type.NULL)) {
      // Note: for nulls the parquetIndex is not increased.
      continue;
    }
    Type memberType = unionGroup.getType(parquetIndex);
    // Each member gets its own sink; a second value for the same union is rejected.
    ParentValueContainer memberSink = new ParentValueContainer() {
      @Override
      public void add(Object value) {
        Preconditions.checkArgument(memberValue==null, "Union is resolving to more than one type");
        memberValue = value;
      }
    };
    memberConverters[parquetIndex] = newConverter(memberSchema, memberType, model, memberSink);
    parquetIndex++;
  }
}
 
Example 5
Source File: ThriftRecordConverter.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Sets up conversion of a parquet map group into thrift events.
 *
 * @param parentEvents  event list stored as this converter's parent events
 * @param parquetSchema parquet group for the map; must have exactly one field
 * @param field         thrift field whose type is cast to {@code MapType}
 * @throws IllegalArgumentException if {@code parquetSchema} does not have exactly one field
 */
MapConverter(List<TProtocol> parentEvents, GroupType parquetSchema, ThriftField field) {
  this.parentEvents = parentEvents;

  final int fieldCount = parquetSchema.getFieldCount();
  if (fieldCount != 1) {
    throw new IllegalArgumentException("maps have only one field. " + parquetSchema + " size = " + fieldCount);
  }

  final Type nestedType = parquetSchema.getType(0);
  final MapType mapType = (MapType) field.getType();

  final ThriftField key = mapType.getKey();
  keyType = key.getType().getType().getThriftType();

  final ThriftField value = mapType.getValue();
  valueType = value.getType().getType().getThriftType();

  child = new GroupCounter(new MapKeyValueConverter(mapEvents, nestedType, key, value));
}
 
Example 6
Source File: PigSchemaConverter.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Filters the single nested field of a bag group against the matching Pig field schema.
 *
 * @param bagType        parquet group for the bag; must have exactly one field
 * @param bagFieldSchema Pig field schema of the bag
 * @return {@code bagType} with its fields replaced by the filtered nested field
 * @throws FrontendException declared for the Pig schema lookups
 * @throws RuntimeException  if {@code bagType} does not have exactly one field
 */
private Type filterBag(GroupType bagType, FieldSchema bagFieldSchema) throws FrontendException {
  if (LOG.isDebugEnabled()) {
    LOG.debug("filtering BAG schema:\n" + bagType + "\nwith:\n " + bagFieldSchema);
  }
  if (bagType.getFieldCount() != 1) {
    throw new RuntimeException("not unwrapping the right type, this should be a Bag: " + bagType);
  }

  final Type nested = bagType.getType(0);
  final FieldSchema tupleField = bagFieldSchema.schema.getField(0);

  // Bags always contain tuples => we skip the extra tuple that was inserted in that case.
  final boolean skipInsertedTuple = nested.isPrimitive()
      || nested.getLogicalTypeAnnotation() instanceof LogicalTypeAnnotation.MapLogicalTypeAnnotation
      || nested.getLogicalTypeAnnotation() instanceof LogicalTypeAnnotation.ListLogicalTypeAnnotation;
  final FieldSchema innerField = skipInsertedTuple ? tupleField.schema.getField(0) : tupleField;

  return bagType.withNewFields(filter(nested, innerField));
}
 
Example 7
Source File: SimpleRecordConverter.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Creates a record converter holding one child converter per field of {@code schema}.
 *
 * @param schema group whose fields are converted, in schema order
 * @param name   name stored on this converter
 * @param parent parent converter stored on this converter
 */
public SimpleRecordConverter(GroupType schema, String name, SimpleRecordConverter parent) {
  final int fieldCount = schema.getFieldCount();
  this.converters = new Converter[fieldCount];
  this.parent = parent;
  this.name = name;

  for (int pos = 0; pos < fieldCount; pos++) {
    converters[pos] = createConverter(schema.getType(pos));
  }
}
 
Example 8
Source File: DataWritableWriter.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Writes the non-null elements of {@code array} once per field of {@code type}.
 * Does nothing when {@code array} is {@code null}.
 *
 * @param array values to write; may be {@code null}
 * @param type  group type whose fields drive the start/end field calls
 * @throws RuntimeException if a field is a group but an element is not an {@code ArrayWritable}
 */
private void writeArray(final ArrayWritable array, final GroupType type) {
  if (array == null) {
    return;
  }

  final Writable[] elements = array.get();
  final int numFields = type.getFieldCount();
  for (int fieldIdx = 0; fieldIdx < numFields; ++fieldIdx) {
    final Type fieldType = type.getType(fieldIdx);
    final String fieldName = fieldType.getName();

    recordConsumer.startField(fieldName, fieldIdx);
    for (final Writable element : elements) {
      if (element == null) {
        continue;
      }
      if (fieldType.isPrimitive()) {
        // A wrapped element is indexed by the field position (the original author
        // questioned whether this should be 0).
        final Writable primitive = (element instanceof ArrayWritable)
            ? ((ArrayWritable) element).get()[fieldIdx]
            : element;
        writePrimitive(primitive);
      } else if (element instanceof ArrayWritable) {
        recordConsumer.startGroup();
        writeData((ArrayWritable) element, fieldType.asGroupType());
        recordConsumer.endGroup();
      } else {
        throw new RuntimeException("This should be a ArrayWritable: " + element);
      }
    }
    recordConsumer.endField(fieldName, fieldIdx);
  }
}
 
Example 9
Source File: ParquetGroup.java    From incubator-gobblin with Apache License 2.0 5 votes vote down vote up
public ParquetGroup(GroupType schema) {
  this.schema = schema;
  this.data = new List[schema.getFields().size()];

  for (int i = 0; i < schema.getFieldCount(); ++i) {
    this.data[i] = new ArrayList();
  }
}
 
Example 10
Source File: RowConverter.java    From flink with Apache License 2.0 5 votes vote down vote up
/**
 * Creates a row converter with one converter slot per field of {@code schema}.
 * The slots are filled only when {@code typeInfo} has arity of at least one and is a
 * {@code CompositeType}; otherwise they are left {@code null}.
 *
 * @param schema   parquet group whose fields are converted
 * @param typeInfo type information of the row
 * @param parent   holder stored as the parent data holder
 * @param pos      position stored as this row's position in the parent row
 */
public RowConverter(GroupType schema, TypeInformation<?> typeInfo, ParentDataHolder parent, int pos) {
	this.typeInfo = typeInfo;
	this.parentDataHolder = parent;
	this.posInParentRow = pos;
	this.converters = new Converter[schema.getFieldCount()];

	if (typeInfo.getArity() < 1 || !(typeInfo instanceof CompositeType)) {
		// No per-field type information available: converter slots stay unset.
		return;
	}

	final CompositeType<?> rowType = (CompositeType<?>) typeInfo;
	int fieldPos = 0;
	for (Type field : schema.getFields()) {
		converters[fieldPos] = createConverter(field, fieldPos, rowType.getTypeAt(fieldPos), this);
		fieldPos++;
	}
}
 
Example 11
Source File: PigSchemaConverter.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Selects and filters the fields of {@code schemaToFilter} that correspond to the requested
 * Pig fields.
 *
 * @param schemaToFilter     parquet group whose fields are looked up by position
 * @param pigSchema          Pig schema used to resolve field schemas by alias
 * @param requiredFieldsList requested fields; when {@code null}, every field of
 *                           {@code pigSchema} is used with its running position
 * @return the filtered types; positions at or beyond the parquet field count are skipped
 * @throws RuntimeException wrapping any {@code FrontendException} raised during lookup or filtering
 */
@Override
public List<Type> filterTupleSchema(GroupType schemaToFilter, Schema pigSchema, RequiredFieldList requiredFieldsList) {
  List<Type> filtered = new ArrayList<Type>();
  List<Pair<FieldSchema,Integer>> fieldsWithPosition = new ArrayList<Pair<FieldSchema,Integer>>();

  try {
    if (requiredFieldsList != null) {
      for (RequiredField required : requiredFieldsList.getFields()) {
        FieldSchema requested = pigSchema.getField(required.getAlias());
        fieldsWithPosition.add(new Pair<FieldSchema, Integer>(requested, required.getIndex()));
      }
    } else {
      // No explicit request: take every Pig field with its running position.
      int running = 0;
      for (FieldSchema fs : pigSchema.getFields()) {
        fieldsWithPosition.add(new Pair<FieldSchema, Integer>(fs, running));
        running++;
      }
    }

    final int availableFields = schemaToFilter.getFieldCount();
    for (Pair<FieldSchema, Integer> entry : fieldsWithPosition) {
      FieldSchema fieldSchema = pigSchema.getField(entry.first.alias);
      int position = entry.second;
      if (position < availableFields) {
        Type candidate = schemaToFilter.getFields().get(position);
        filtered.add(filter(candidate, fieldSchema));
      }
    }
  } catch (FrontendException e) {
      throw new RuntimeException("Failed to filter requested fields", e);
  }
  return filtered;
}
 
Example 12
Source File: ThriftRecordConverter.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Creates a struct converter with one handler per parquet field that has a thrift child of
 * the same name (compared case-insensitively).
 *
 * @param events        event list stored on this converter and passed to the handlers
 * @param parquetSchema parquet group whose fields are matched against the thrift children
 * @param field         thrift field supplying the struct name and its {@code StructType}
 */
private StructConverter(List<TProtocol> events, GroupType parquetSchema, ThriftField field) {
  this.events = events;
  this.name = field.getName();
  this.tStruct = new TStruct(name);
  this.thriftType = (StructType)field.getType();
  this.schemaSize = parquetSchema.getFieldCount();
  this.converters = new Converter[this.schemaSize];

  final List<ThriftField> candidates = thriftType.getChildren();
  for (int pos = 0; pos < schemaSize; pos++) {
    final Type parquetField = parquetSchema.getType(pos);
    final String parquetName = parquetField.getName();

    // Find the first thrift child whose name matches, ignoring case.
    ThriftField match = null;
    for (ThriftField candidate : candidates) {
      final String candidateName = candidate.getName();
      if (candidateName != null && candidateName.equalsIgnoreCase(parquetName)) {
        match = candidate;
        break;
      }
    }

    // With no match the slot stays null: this means the file did not contain that field,
    // it will never be populated in this instance, but other files might populate it.
    if (match != null) {
      if (parquetField.isPrimitive()) {
        converters[pos] = new PrimitiveFieldHandler(newConverter(events, parquetField, match).asPrimitiveConverter(), match, events);
      } else {
        converters[pos] = new GroupFieldhandler(newConverter(events, parquetField, match).asGroupConverter(), match, events);
      }
    }
  }
}
 
Example 13
Source File: ParquetTypeVisitor.java    From iceberg with Apache License 2.0 5 votes vote down vote up
/**
 * Visits a list group: validates its shape, visits the element field if one exists, and
 * returns the visitor's result for the list.
 *
 * @param list    list group; must not be repeated and must have exactly one field
 * @param visitor visitor receiving the before/after callbacks and the list itself
 * @return the result of {@code visitor.list}; the element result passed to it is
 *         {@code null} when the repeated group has no fields
 */
private static <T> T visitList(GroupType list, ParquetTypeVisitor<T> visitor) {
  Preconditions.checkArgument(!list.isRepetition(Type.Repetition.REPEATED),
      "Invalid list: top-level group is repeated: %s", list);
  Preconditions.checkArgument(list.getFieldCount() == 1,
      "Invalid list: does not contain single repeated field: %s", list);

  final GroupType repeated = list.getFields().get(0).asGroupType();
  Preconditions.checkArgument(repeated.isRepetition(Type.Repetition.REPEATED),
      "Invalid list: inner group is not repeated");
  final int elementFieldCount = repeated.getFieldCount();
  Preconditions.checkArgument(elementFieldCount <= 1,
      "Invalid list: repeated group is not a single field: %s", list);

  visitor.beforeRepeatedElement(repeated);
  try {
    T elementResult = null;
    if (elementFieldCount > 0) {
      final Type element = repeated.getType(0);
      visitor.beforeElementField(element);
      try {
        elementResult = visit(element, visitor);
      } finally {
        // Always paired with beforeElementField, even when the visit throws.
        visitor.afterElementField(element);
      }
    }
    return visitor.list(list, elementResult);
  } finally {
    // Always paired with beforeRepeatedElement.
    visitor.afterRepeatedElement(repeated);
  }
}
 
Example 14
Source File: RowConverter.java    From flink with Apache License 2.0 5 votes vote down vote up
/**
 * Creates a row converter with one converter slot per field of {@code schema}.
 * The slots are filled only when {@code typeInfo} has arity of at least one and is a
 * {@code CompositeType}; otherwise they are left {@code null}.
 *
 * @param schema   parquet group whose fields are converted
 * @param typeInfo type information of the row
 * @param parent   holder stored as the parent data holder
 * @param pos      position stored as this row's position in the parent row
 */
public RowConverter(GroupType schema, TypeInformation<?> typeInfo, ParentDataHolder parent, int pos) {
	this.typeInfo = typeInfo;
	this.parentDataHolder = parent;
	this.posInParentRow = pos;
	this.converters = new Converter[schema.getFieldCount()];

	if (typeInfo.getArity() < 1 || !(typeInfo instanceof CompositeType)) {
		// No per-field type information available: converter slots stay unset.
		return;
	}

	final CompositeType<?> rowType = (CompositeType<?>) typeInfo;
	int fieldPos = 0;
	for (Type field : schema.getFields()) {
		converters[fieldPos] = createConverter(field, fieldPos, rowType.getTypeAt(fieldPos), this);
		fieldPos++;
	}
}
 
Example 15
Source File: LogicalListL2Converter.java    From dremio-oss with Apache License 2.0 4 votes vote down vote up
/**
 * Reports whether {@code schema} is supported, i.e. whether it has exactly one field.
 *
 * @param schema parquet group type to check
 * @return {@code true} when the group has exactly one field
 */
private boolean isSupportedSchema(GroupType schema) {
  final int fieldCount = schema.getFieldCount();
  return fieldCount == 1;
}
 
Example 16
Source File: ParquetAsJsonInputFormat.java    From iow-hadoop-streaming with Apache License 2.0 4 votes vote down vote up
/**
 * Writes {@code grp} as a JSON object with one entry per field of its group type.
 * Repeated fields become JSON arrays, primitive values are written by type, and nested
 * groups are written recursively.
 *
 * <p>If reading a value fails with a message starting with {@code "not found"} and the
 * field is OPTIONAL, a JSON null is written. Any other failure is rethrown wrapped in a
 * {@link RuntimeException} with the original exception as its cause.
 *
 * @param currentGenerator generator receiving the JSON output
 * @param grp              group to serialize
 * @throws IOException      declared for the generator calls made outside the per-field try block
 * @throws RuntimeException for an unsupported primitive type or any non-recoverable failure
 */
private void groupToJson(JsonGenerator currentGenerator, SimpleGroup grp)
      throws IOException {

    GroupType gt = grp.getType();

    currentGenerator.writeStartObject();
    for(int i = 0; i < gt.getFieldCount(); i ++) {

        String field = gt.getFieldName(i);
        try {
            Type t = gt.getType(i);
            int repetition = 1;
            boolean repeated = false;
            if (t.getRepetition() == Type.Repetition.REPEATED) {
                repeated = true;
                repetition = grp.getFieldRepetitionCount(i);
                currentGenerator.writeArrayFieldStart(field);
            } else {
                currentGenerator.writeFieldName(field);
            }

            for(int j = 0; j < repetition; j ++) {

                if (t.isPrimitive()) {
                    switch (t.asPrimitiveType().getPrimitiveTypeName()) {
                        case BINARY:
                            currentGenerator.writeString(grp.getString(i, j));
                            break;
                        case INT32:
                            currentGenerator.writeNumber(grp.getInteger(i, j));
                            break;
                        case INT96:
                        case INT64:
                            // clumsy way - TODO - Subclass SimpleGroup or something like that
                            currentGenerator.writeNumber(Long.parseLong(grp.getValueToString(i, j)));
                            break;
                        case DOUBLE:
                        case FLOAT:
                            currentGenerator.writeNumber(Double.parseDouble(grp.getValueToString(i, j)));
                            break;
                        case BOOLEAN:
                            currentGenerator.writeBoolean(grp.getBoolean(i, j));
                            break;
                        default:
                            throw new RuntimeException("Can't handle type " + gt.getType(i));
                    }
                } else {
                    groupToJson(currentGenerator, (SimpleGroup) grp.getGroup(i, j));
                }
            }

            if (repeated) {
                currentGenerator.writeEndArray();
            }
        }
        catch (Exception e) {
            // getMessage() may legitimately be null; without this guard a
            // NullPointerException raised here would replace the real failure.
            String message = e.getMessage();
            boolean missingOptionalValue = message != null
                    && message.startsWith("not found")
                    && gt.getType(i).getRepetition() == Type.Repetition.OPTIONAL;
            if (missingOptionalValue) {
                currentGenerator.writeNull();
            } else {
                throw new RuntimeException(e);
            }
        }
    }
    currentGenerator.writeEndObject();
}
 
Example 17
Source File: ParquetTypeVisitor.java    From iceberg with Apache License 2.0 4 votes vote down vote up
/**
 * Visits a map group: validates its shape, visits whichever of key and value are present,
 * and returns the visitor's result for the map.
 *
 * @param map     map group; must not be repeated and must have exactly one field
 * @param visitor visitor receiving the before/after callbacks and the map itself
 * @return the result of {@code visitor.map}; a key or value result that was not visited
 *         is passed as {@code null}
 */
private static <T> T visitMap(GroupType map, ParquetTypeVisitor<T> visitor) {
  Preconditions.checkArgument(!map.isRepetition(Type.Repetition.REPEATED),
      "Invalid map: top-level group is repeated: %s", map);
  Preconditions.checkArgument(map.getFieldCount() == 1,
      "Invalid map: does not contain single repeated field: %s", map);

  final GroupType keyValueGroup = map.getType(0).asGroupType();
  Preconditions.checkArgument(keyValueGroup.isRepetition(Type.Repetition.REPEATED),
      "Invalid map: inner group is not repeated");
  Preconditions.checkArgument(keyValueGroup.getFieldCount() <= 2,
      "Invalid map: repeated group does not have 2 fields");

  visitor.beforeRepeatedKeyValue(keyValueGroup);
  try {
    T keyResult = null;
    T valueResult = null;

    final int projectedFields = keyValueGroup.getFieldCount();
    if (projectedFields == 2) {
      // Two fields: both key and value are projected, in that order.
      final Type keyField = keyValueGroup.getType(0);
      visitor.beforeKeyField(keyField);
      try {
        keyResult = visit(keyField, visitor);
      } finally {
        visitor.afterKeyField(keyField);
      }

      final Type valueField = keyValueGroup.getType(1);
      visitor.beforeValueField(valueField);
      try {
        valueResult = visit(valueField, visitor);
      } finally {
        visitor.afterValueField(valueField);
      }
    } else if (projectedFields == 1) {
      // One field: its name decides whether it is the key or the value.
      final Type onlyField = keyValueGroup.getType(0);
      if (onlyField.getName().equalsIgnoreCase("key")) {
        visitor.beforeKeyField(onlyField);
        try {
          keyResult = visit(onlyField, visitor);
        } finally {
          visitor.afterKeyField(onlyField);
        }
      } else {
        visitor.beforeValueField(onlyField);
        try {
          valueResult = visit(onlyField, visitor);
        } finally {
          visitor.afterValueField(onlyField);
        }
      }
    }
    // Any other count: both results remain null.

    return visitor.map(map, keyResult, valueResult);
  } finally {
    // Always paired with beforeRepeatedKeyValue.
    visitor.afterRepeatedKeyValue(keyValueGroup);
  }
}
 
Example 18
Source File: ParquetTypeVisitor.java    From presto with Apache License 2.0 4 votes vote down vote up
/**
 * Dispatches {@code type} to the matching visitor callback: message, primitive, list, map,
 * or struct. While the contents of a list or map are visited, the name of its repeated
 * inner group is pushed on {@code visitor.fieldNames}, and popped afterwards.
 *
 * @param type    parquet type to visit
 * @param visitor visitor receiving the callbacks
 * @return the value returned by the visitor callback for {@code type}
 */
public static <T> T visit(Type type, ParquetTypeVisitor<T> visitor)
{
    if (type instanceof MessageType) {
        return visitor.message((MessageType) type, visitFields(type.asGroupType(), visitor));
    }
    if (type.isPrimitive()) {
        return visitor.primitive(type.asPrimitiveType());
    }

    // if not a primitive, the typeId must be a group
    GroupType group = type.asGroupType();
    OriginalType annotation = group.getOriginalType();

    if (annotation == LIST) {
        checkArgument(!group.isRepetition(REPEATED),
                "Invalid list: top-level group is repeated: " + group);
        checkArgument(group.getFieldCount() == 1,
                "Invalid list: does not contain single repeated field: " + group);

        GroupType elementGroup = group.getFields().get(0).asGroupType();
        checkArgument(elementGroup.isRepetition(REPEATED),
                "Invalid list: inner group is not repeated");
        checkArgument(elementGroup.getFieldCount() <= 1,
                "Invalid list: repeated group is not a single field: " + group);

        visitor.fieldNames.push(elementGroup.getName());
        try {
            T elementResult = null;
            if (elementGroup.getFieldCount() > 0) {
                elementResult = visitField(elementGroup.getType(0), visitor);
            }
            return visitor.list(group, elementResult);
        }
        finally {
            visitor.fieldNames.pop();
        }
    }

    if (annotation == MAP) {
        checkArgument(!group.isRepetition(REPEATED),
                "Invalid map: top-level group is repeated: " + group);
        checkArgument(group.getFieldCount() == 1,
                "Invalid map: does not contain single repeated field: " + group);

        GroupType keyValueGroup = group.getType(0).asGroupType();
        checkArgument(keyValueGroup.isRepetition(REPEATED),
                "Invalid map: inner group is not repeated");
        checkArgument(keyValueGroup.getFieldCount() <= 2,
                "Invalid map: repeated group does not have 2 fields");

        visitor.fieldNames.push(keyValueGroup.getName());
        try {
            T keyResult = null;
            T valueResult = null;
            if (keyValueGroup.getFieldCount() == 2) {
                keyResult = visitField(keyValueGroup.getType(0), visitor);
                valueResult = visitField(keyValueGroup.getType(1), visitor);
            }
            else if (keyValueGroup.getFieldCount() == 1) {
                // a lone field is the key if it is named "key", otherwise the value
                Type onlyField = keyValueGroup.getType(0);
                if (onlyField.getName().equalsIgnoreCase("key")) {
                    keyResult = visitField(onlyField, visitor);
                }
                else {
                    valueResult = visitField(onlyField, visitor);
                }
            }
            return visitor.map(group, keyResult, valueResult);
        }
        finally {
            visitor.fieldNames.pop();
        }
    }

    return visitor.struct(group, visitFields(group, visitor));
}
 
Example 19
Source File: DataWritableGroupConverter.java    From parquet-mr with Apache License 2.0 4 votes vote down vote up
/**
 * Creates a converter with no parent and index 0, and allocates the root map with one slot
 * per field of {@code tableSchema}.
 *
 * @param requestedSchema group passed on as the selected group type
 * @param tableSchema     group passed on as the containing group type
 */
public DataWritableGroupConverter(final GroupType requestedSchema, final GroupType tableSchema) {
  this(requestedSchema, null, 0, tableSchema);
  this.rootMap = new Writable[tableSchema.getFieldCount()];
}
 
Example 20
Source File: LogicalListL1Converter.java    From dremio-oss with Apache License 2.0 3 votes vote down vote up
/**
 * Checks if the schema is similar to the following:
 * <pre>
 * optional group &lt;name&gt; (LIST) {
 *   repeated group &lt;list-name&gt; {
 *     &lt;element-repetition&gt; &lt;element-type&gt; &lt;element-name&gt;;
 *   }
 * }
 * </pre>
 *
 * @param schema parquet group type
 * @return {@code true} if the schema is supported
 */
public static boolean isSupportedSchema(GroupType schema) {
  if (schema.getFieldCount() != 1) {
    return false;
  }

  final Type child = schema.getType(0);
  // The single child must be a repeated group without an original-type annotation.
  final boolean plainRepeatedGroup =
      !child.isPrimitive() && child.isRepetition(REPEATED) && child.getOriginalType() == null;
  return plainRepeatedGroup && child.asGroupType().getFieldCount() == 1;
}