Java Code Examples for org.apache.parquet.schema.Type#getRepetition()

The following examples show how to use org.apache.parquet.schema.Type#getRepetition() . You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: ParquetRecordWriter.java    From dremio-oss with Apache License 2.0 6 votes vote down vote up
/**
 * Changes the list inner '$data$' vector name to 'element' in the schema
 */
private Type renameChildTypeToElement(Type childType) {
  if (childType.isPrimitive()) {
    PrimitiveType childPrimitiveType = childType.asPrimitiveType();
    return new PrimitiveType(childType.getRepetition(),
      childPrimitiveType.getPrimitiveTypeName(),
      childPrimitiveType.getTypeLength(),
      "element",
      childPrimitiveType.getOriginalType(),
      childPrimitiveType.getDecimalMetadata(),
      childPrimitiveType.getId());
  } else {
    GroupType childGroupType = childType.asGroupType();
    Type.ID id = childGroupType.getId();
    GroupType groupType = new GroupType(childType.getRepetition(),
      "element",
      childType.getOriginalType(),
      childGroupType.getFields());
    if (id != null) {
      groupType = groupType.withId(id.hashCode());
    }
    return groupType;
  }
}
 
Example 2
Source File: ValidatingRecordConsumer.java    From parquet-mr with Apache License 2.0 6 votes vote down vote up
private void validate(PrimitiveTypeName p) {
  Type currentType = types.peek().asGroupType().getType(fields.peek());
  int c = fieldValueCount.pop() + 1;
  fieldValueCount.push(c);
  LOG.debug("validate {} for {}",p ,currentType.getName());
  switch (currentType.getRepetition()) {
    case OPTIONAL:
    case REQUIRED:
      if (c > 1) {
        throw new InvalidRecordException("repeated value when the type is not repeated in " + currentType);
      }
      break;
    case REPEATED:
      break;
    default:
      throw new InvalidRecordException("unknown repetition " + currentType.getRepetition() + " in " + currentType);
  }
  if (!currentType.isPrimitive() || currentType.asPrimitiveType().getPrimitiveTypeName() != p) {
    throw new InvalidRecordException("expected type " + p + " but got "+ currentType);
  }
}
 
Example 3
Source File: ThriftRecordConverter.java    From parquet-mr with Apache License 2.0 6 votes vote down vote up
private boolean hasMissingRequiredFieldInGroupType(GroupType requested, GroupType fullSchema) {
  for (Type field : fullSchema.getFields()) {

    if (requested.containsField(field.getName())) {
      Type requestedType = requested.getType(field.getName());
      // if a field is in requested schema and the type of it is a group type, then do recursive check
      if (!field.isPrimitive()) {
        if (hasMissingRequiredFieldInGroupType(requestedType.asGroupType(), field.asGroupType())) {
          return true;
        } else {
          continue;// check next field
        }
      }
    } else {
      if (field.getRepetition() == Type.Repetition.REQUIRED) {
        return true; // if a field is missing in requested schema and it's required
      } else {
        continue; // the missing field is not required, then continue checking next field
      }
    }
  }

  return false;
}
 
Example 4
Source File: DrillParquetReader.java    From Bats with Apache License 2.0 5 votes vote down vote up
private static Type getType(String[] pathSegments, int depth, MessageType schema) {
  Type type = schema.getType(Arrays.copyOfRange(pathSegments, 0, depth + 1));
  if (depth + 1 == pathSegments.length) {
    return type;
  } else {
    Preconditions.checkState(!type.isPrimitive());
    return new GroupType(type.getRepetition(), type.getName(), getType(pathSegments, depth + 1, schema));
  }
}
 
Example 5
Source File: ParquetResolver.java    From pxf with Apache License 2.0 5 votes vote down vote up
private OneField resolvePrimitive(Group group, int columnIndex, Type type, int level) {

        OneField field = new OneField();
        // get type converter based on the primitive type
        ParquetTypeConverter converter = ParquetTypeConverter.from(type.asPrimitiveType());

        // determine how many values for the primitive are present in the column
        int repetitionCount = group.getFieldRepetitionCount(columnIndex);

        // at the top level (top field), non-repeated primitives will convert to typed OneField
        if (level == 0 && type.getRepetition() != REPEATED) {
            field.type = converter.getDataType(type).getOID();
            field.val = repetitionCount == 0 ? null : converter.getValue(group, columnIndex, 0, type);
        } else if (type.getRepetition() == REPEATED) {
            // repeated primitive at any level will convert into JSON
            ArrayNode jsonArray = mapper.createArrayNode();
            for (int repeatIndex = 0; repeatIndex < repetitionCount; repeatIndex++) {
                converter.addValueToJsonArray(group, columnIndex, repeatIndex, type, jsonArray);
            }
            // but will become a string only at top level
            if (level == 0) {
                field.type = DataType.TEXT.getOID();
                try {
                    field.val = mapper.writeValueAsString(jsonArray);
                } catch (Exception e) {
                    throw new RuntimeException("Failed to serialize repeated parquet type " + type.asPrimitiveType().getName(), e);
                }
            } else {
                // just return the array node within OneField container
                field.val = jsonArray;
            }
        } else {
            // level > 0 and type != REPEATED -- primitive type as a member of complex group -- NOT YET SUPPORTED
            throw new UnsupportedOperationException("Parquet complex type support is not yet available.");
        }
        return field;
    }
 
Example 6
Source File: ParquetRowiseReader.java    From dremio-oss with Apache License 2.0 5 votes vote down vote up
private static Type getType(String[] pathSegments, int depth, MessageType schema) {
  Type type = schema.getType(Arrays.copyOfRange(pathSegments, 0, depth + 1));
  if (depth + 1 == pathSegments.length) {
    return type;
  } else {
    Preconditions.checkState(!type.isPrimitive());
    return new GroupType(type.getRepetition(), type.getName(), type.getOriginalType(), getType(pathSegments, depth + 1, schema));
  }
}
 
Example 7
Source File: ValidatingRecordConsumer.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
private void validate(PrimitiveTypeName... ptypes) {
  Type currentType = types.peek().asGroupType().getType(fields.peek());
  int c = fieldValueCount.pop() + 1;
  fieldValueCount.push(c);
  if (LOG.isDebugEnabled()) LOG.debug("validate " + Arrays.toString(ptypes) + " for " + currentType.getName());
  switch (currentType.getRepetition()) {
    case OPTIONAL:
    case REQUIRED:
      if (c > 1) {
        throw new InvalidRecordException("repeated value when the type is not repeated in " + currentType);
      }
      break;
    case REPEATED:
      break;
    default:
      throw new InvalidRecordException("unknown repetition " + currentType.getRepetition() + " in " + currentType);
  }
  if (!currentType.isPrimitive()) {
    throw new InvalidRecordException(
        "expected type in " + Arrays.toString(ptypes) + " but got " + currentType);
  }
  for (PrimitiveTypeName p : ptypes) {
    if (currentType.asPrimitiveType().getPrimitiveTypeName() == p) {
      return; // type is valid
    }
  }
  throw new InvalidRecordException(
      "expected type in " + Arrays.toString(ptypes) + " but got " + currentType);
}
 
Example 8
Source File: ParquetAsTextInputFormat.java    From iow-hadoop-streaming with Apache License 2.0 5 votes vote down vote up
protected List<String> groupToStrings(SimpleGroup grp) {

            ArrayList<String> s = new ArrayList<>();

            for (int n = 0; n < grp.getType().getFieldCount(); n ++) {

                Type field = grp.getType().getType(n);
                    try {
                        if (!field.isPrimitive())
                           s.addAll(groupToStrings((SimpleGroup) grp.getGroup(n, 0))); // array of groups not (yet) supported
                        else if (field.getRepetition() == Type.Repetition.REPEATED) {

                            boolean is_binary =
                                field.asPrimitiveType().getPrimitiveTypeName() == PrimitiveType.PrimitiveTypeName.BINARY;
                            StringBuilder sb = new StringBuilder("[");
                            ArrayList<String> arr = new ArrayList<>();
                            for (int i = 0; i < grp.getFieldRepetitionCount(n); i ++)
                                arr.add(is_binary ? "\"" + grp.getValueToString(n, i) + "\"" :
                                    grp.getValueToString(n, i));

                            sb.append(Joiner.on(", ").join(arr));
                            sb.append("]");
                            s.add(sb.toString());
                        }
                        else
                            s.add(grp.getValueToString(n, 0));
                    }
                    catch (RuntimeException e) {
                        if(e.getMessage().startsWith("not found") && field.getRepetition() == Type.Repetition.OPTIONAL)
                            s.add("");
                        else
                            throw e;
                    }
            }

            return s;
        }
 
Example 9
Source File: ParquetAsJsonInputFormat.java    From iow-hadoop-streaming with Apache License 2.0 4 votes vote down vote up
private void groupToJson(JsonGenerator currentGenerator, SimpleGroup grp)
      throws IOException {

    GroupType gt = grp.getType();

    currentGenerator.writeStartObject();
    for(int i = 0; i < gt.getFieldCount(); i ++) {

        String field = gt.getFieldName(i);
        try {
            Type t = gt.getType(i);
            int repetition = 1;
            boolean repeated = false;
            if (t.getRepetition() == Type.Repetition.REPEATED) {
                repeated = true;
                repetition = grp.getFieldRepetitionCount(i);
                currentGenerator.writeArrayFieldStart(field);
            }
            else
                currentGenerator.writeFieldName(field);

            for(int j = 0; j < repetition; j ++) {

                if (t.isPrimitive()) {
                    switch (t.asPrimitiveType().getPrimitiveTypeName()) {
                        case BINARY:
                            currentGenerator.writeString(grp.getString(i, j));
                            break;
                        case INT32:
                            currentGenerator.writeNumber(grp.getInteger(i, j));
                            break;
                        case INT96:
                        case INT64:
                            // clumsy way - TODO - Subclass SimpleGroup or something like that
                            currentGenerator.writeNumber(Long.parseLong(grp.getValueToString(i, j)));
                            break;
                        case DOUBLE:
                        case FLOAT:
                            currentGenerator.writeNumber(Double.parseDouble(grp.getValueToString(i, j)));
                            break;
                        case BOOLEAN:
                            currentGenerator.writeBoolean(grp.getBoolean(i, j));
                            break;
                        default:
                            throw new RuntimeException("Can't handle type " + gt.getType(i));
                    }
                } else {
                    groupToJson(currentGenerator, (SimpleGroup) grp.getGroup(i, j));
                }
            }

            if (repeated)
                currentGenerator.writeEndArray();
        }
        catch (Exception e) {
            if (e.getMessage().startsWith("not found") && gt.getType(i).getRepetition() == Type.Repetition.OPTIONAL)
                currentGenerator.writeNull();
            else
                 throw new RuntimeException(e);
        }
    }
    currentGenerator.writeEndObject();
}